From bd93e69eb4bbd5d75c41a7cf33630cfa35f1f462 Mon Sep 17 00:00:00 2001
From: Andrew Brown
Date: Wed, 11 Nov 2020 15:45:32 -0800
Subject: [PATCH] [machinst x64]: implement packed shifts

---
 cranelift/codegen/src/isa/x64/lower.rs        | 446 +++++++++++++++---
 .../isa/x64/simd-bitwise-compile.clif         |  75 +++
 .../filetests/isa/x64/simd-bitwise-run.clif   | 108 +++++
 3 files changed, 559 insertions(+), 70 deletions(-)

diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index f2681c8bb7..8f158a8ea3 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -258,6 +258,58 @@ fn emit_insert_lane<C: LowerCtx<I = Inst>>(
     }
 }
 
+/// Emit an instruction to extract a lane of `src` into `dst`.
+fn emit_extract_lane<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    src: Reg,
+    dst: Writable<Reg>,
+    lane: u8,
+    ty: Type,
+) {
+    if !ty.is_float() {
+        let (sse_op, is64) = match ty.lane_bits() {
+            8 => (SseOpcode::Pextrb, false),
+            16 => (SseOpcode::Pextrw, false),
+            32 => (SseOpcode::Pextrd, false),
+            64 => (SseOpcode::Pextrd, true),
+            _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
+        };
+        let src = RegMem::reg(src);
+        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64));
+    } else if ty == types::F32 || ty == types::F64 {
+        if lane == 0 {
+            // Remove the extractlane instruction, leaving the float where it is. The upper
+            // bits will remain unchanged; for correctness, this relies on Cranelift type
+            // checking to avoid using those bits.
+            ctx.emit(Inst::gen_move(dst, src, ty));
+        } else {
+            // Otherwise, shuffle the bits in `lane` to the lowest lane.
+            let sse_op = SseOpcode::Pshufd;
+            let mask = match ty {
+                // Move the value at `lane` to lane 0, copying existing value at lane 0 to
+                // other lanes. Again, this relies on Cranelift type checking to avoid
+                // using those bits.
+                types::F32 => {
+                    assert!(lane > 0 && lane < 4);
+                    0b00_00_00_00 | lane
+                }
+                // Move the value at `lane` 1 (we know it must be 1 because of the `if`
+                // statement above) to lane 0 and leave lane 1 unchanged. The Cranelift type
+                // checking assumption also applies here.
+                types::F64 => {
+                    assert!(lane == 1);
+                    0b11_10_11_10
+                }
+                _ => unreachable!(),
+            };
+            let src = RegMem::reg(src);
+            ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false));
+        }
+    } else {
+        panic!("unable to emit extractlane for type: {}", ty)
+    }
+}
+
 /// Emits an int comparison instruction.
 ///
 /// Note: make sure that there are no instructions modifying the flags between a call to this
@@ -930,51 +982,338 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let dst_ty = ctx.output_ty(insn, 0);
             debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);
 
-            let (size, lhs) = match dst_ty {
-                types::I8 | types::I16 => match op {
-                    Opcode::Ishl => (4, put_input_in_reg(ctx, inputs[0])),
-                    Opcode::Ushr => (
-                        4,
-                        extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
-                    ),
-                    Opcode::Sshr => (
-                        4,
-                        extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32),
-                    ),
-                    Opcode::Rotl | Opcode::Rotr => {
+            if !dst_ty.is_vector() {
+                // Scalar shifts on x86 have various encodings:
+                // - shift by one bit, e.g. `SAL r/m8, 1` (not used here)
+                // - shift by an immediate amount, e.g. `SAL r/m8, imm8`
+                // - shift by a dynamic amount but only from the CL register, e.g. `SAL r/m8, CL`.
+                // This implementation uses the last two encoding methods.
+ let (size, lhs) = match dst_ty { + types::I8 | types::I16 => match op { + Opcode::Ishl => (4, put_input_in_reg(ctx, inputs[0])), + Opcode::Ushr => ( + 4, + extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32), + ), + Opcode::Sshr => ( + 4, + extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32), + ), + Opcode::Rotl | Opcode::Rotr => { + (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0])) + } + _ => unreachable!(), + }, + types::I32 | types::I64 => { (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0])) } + _ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty), + }; + + let (count, rhs) = if let Some(cst) = ctx.get_input(insn, 1).constant { + // Mask count, according to Cranelift's semantics. + let cst = (cst as u8) & (dst_ty.bits() as u8 - 1); + (Some(cst), None) + } else { + (None, Some(put_input_in_reg(ctx, inputs[1]))) + }; + + let dst = get_output_reg(ctx, outputs[0]); + + let shift_kind = match op { + Opcode::Ishl => ShiftKind::ShiftLeft, + Opcode::Ushr => ShiftKind::ShiftRightLogical, + Opcode::Sshr => ShiftKind::ShiftRightArithmetic, + Opcode::Rotl => ShiftKind::RotateLeft, + Opcode::Rotr => ShiftKind::RotateRight, _ => unreachable!(), - }, - types::I32 | types::I64 => (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0])), - _ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty), - }; + }; - let (count, rhs) = if let Some(cst) = ctx.get_input(insn, 1).constant { - // Mask count, according to Cranelift's semantics. - let cst = (cst as u8) & (dst_ty.bits() as u8 - 1); - (Some(cst), None) + let w_rcx = Writable::from_reg(regs::rcx()); + ctx.emit(Inst::mov_r_r(true, lhs, dst)); + if count.is_none() { + ctx.emit(Inst::mov_r_r(true, rhs.unwrap(), w_rcx)); + } + ctx.emit(Inst::shift_r(size, shift_kind, count, dst)); + } else if dst_ty == types::I8X16 && (op == Opcode::Ishl || op == Opcode::Ushr) { + // Since the x86 instruction set does not have any 8x16 shift instructions (even in higher feature sets + // like AVX), we lower the `ishl.i8x16` and `ushr.i8x16` to a sequence of instructions. The basic idea, + // whether the `shift_by` amount is an immediate or not, is to use a 16x8 shift and then mask off the + // incorrect bits to 0s (see below for handling signs in `sshr.i8x16`). + let src = put_input_in_reg(ctx, inputs[0]); + let shift_by = input_to_reg_mem_imm(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + + // If necessary, move the shift index into the lowest bits of a vector register. + let shift_by_moved = match &shift_by { + RegMemImm::Imm { .. } => shift_by.clone(), + RegMemImm::Reg { reg } => { + let tmp_shift_by = ctx.alloc_tmp(RegClass::V128, dst_ty); + ctx.emit(Inst::gpr_to_xmm( + SseOpcode::Movd, + RegMem::reg(*reg), + OperandSize::Size32, + tmp_shift_by, + )); + RegMemImm::reg(tmp_shift_by.to_reg()) + } + RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"), + }; + + // Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be correct for half of the lanes; + // the others must be fixed up with the mask below. + let shift_opcode = match op { + Opcode::Ishl => SseOpcode::Psllw, + Opcode::Ushr => SseOpcode::Psrlw, + _ => unimplemented!("{} is not implemented for type {}", op, dst_ty), + }; + ctx.emit(Inst::gen_move(dst, src, dst_ty)); + ctx.emit(Inst::xmm_rmi_reg(shift_opcode, shift_by_moved, dst)); + + // Choose which mask to use to fixup the shifted lanes. 
Since we must use a 16x8 shift, we need to fix + // up the bits that migrate from one half of the lane to the other. Each 16-byte mask (which rustfmt + // forces to multiple lines) is indexed by the shift amount: e.g. if we shift right by 0 (no movement), + // we want to retain all the bits so we mask with `0xff`; if we shift right by 1, we want to retain all + // bits except the MSB so we mask with `0x7f`; etc. + const USHR_MASKS: [u8; 128] = [ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x1f, 0x1f, 0x1f, 0x1f, + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x0f, + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, + 0x0f, 0x0f, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + ]; + const SHL_MASKS: [u8; 128] = [ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, + 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xf8, 0xf8, 0xf8, 0xf8, + 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0, + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, + 0xf0, 0xf0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, + 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, + 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + ]; + let mask = match op { + Opcode::Ishl => &SHL_MASKS, + Opcode::Ushr => &USHR_MASKS, + _ => unimplemented!("{} is not implemented for type {}", op, dst_ty), + }; + + // Figure out the address of the shift mask. + let mask_address = match shift_by { + RegMemImm::Imm { simm32 } => { + // When the shift amount is known, we can statically (i.e. at compile time) determine the mask to + // use and only emit that. + debug_assert!(simm32 < 8); + let mask_offset = simm32 as usize * 16; + let mask_constant = ctx.use_constant(VCodeConstantData::WellKnown( + &mask[mask_offset..mask_offset + 16], + )); + SyntheticAmode::ConstantOffset(mask_constant) + } + RegMemImm::Reg { reg } => { + // Otherwise, we must emit the entire mask table and dynamically (i.e. at run time) find the correct + // mask offset in the table. 
We do this using LEA to find the base address of the mask table and then
+                        // complex addressing to offset to the right mask: `base_address + shift_by * 16` (each mask is 16 bytes).
+                        let base_mask_address = ctx.alloc_tmp(RegClass::I64, types::I64);
+                        let mask_offset = ctx.alloc_tmp(RegClass::I64, types::I64);
+                        let mask_constant = ctx.use_constant(VCodeConstantData::WellKnown(mask));
+                        ctx.emit(Inst::lea(
+                            SyntheticAmode::ConstantOffset(mask_constant),
+                            base_mask_address,
+                        ));
+                        ctx.emit(Inst::gen_move(mask_offset, reg, types::I64));
+                        ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(4), mask_offset));
+                        Amode::imm_reg_reg_shift(
+                            0,
+                            base_mask_address.to_reg(),
+                            mask_offset.to_reg(),
+                            0,
+                        )
+                        .into()
+                    }
+                    RegMemImm::Mem { addr: _ } => unimplemented!("load mask address"),
+                };
+
+                // Load the mask into a temporary register, `mask_value`.
+                let mask_value = ctx.alloc_tmp(RegClass::V128, dst_ty);
+                ctx.emit(Inst::load(dst_ty, mask_address, mask_value, ExtKind::None));
+
+                // Remove the bits that would have disappeared in a true 8x16 shift. TODO in the future,
+                // this AND instruction could be coalesced with the load above.
+                let sse_op = match dst_ty {
+                    types::F32X4 => SseOpcode::Andps,
+                    types::F64X2 => SseOpcode::Andpd,
+                    _ => SseOpcode::Pand,
+                };
+                ctx.emit(Inst::xmm_rm_r(sse_op, RegMem::from(mask_value), dst));
+            } else if dst_ty == types::I8X16 && op == Opcode::Sshr {
+                // Since the x86 instruction set does not have an 8x16 shift instruction and the approach used for
+                // `ishl` and `ushr` cannot be easily used (the masks do not preserve the sign), we use a different
+                // approach here: separate the low and high lanes, shift them separately, and merge them into the final
+                // result. Visually, this looks like the following, where `src.i8x16 = [s0, s1, ..., s15]`:
+                //   low.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
+                //   shifted_low.i16x8 = shift each lane of `low`
+                //   high.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
+                //   shifted_high.i16x8 = shift each lane of `high`
+                //   dst.i8x16 = [s0'', s1'', ..., s15'']
+                let src = put_input_in_reg(ctx, inputs[0]);
+                let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
+                let shift_by_ty = ctx.input_ty(insn, 1);
+                let dst = get_output_reg(ctx, outputs[0]);
+
+                // In order for PACKSSWB later to only use the high byte of each 16x8 lane, we shift right an extra 8
+                // bits, relying on PSRAW to fill in the upper bits appropriately.
+                let bigger_shift_by = match shift_by {
+                    // When we know the shift amount at compile time, we add the extra shift amount statically.
+                    RegMemImm::Imm { simm32 } => RegMemImm::imm(simm32 + 8),
+                    // Otherwise we add instructions to add the extra shift amount and move the value into an XMM
+                    // register.
+                    RegMemImm::Reg { reg } => {
+                        let bigger_shift_by_gpr = ctx.alloc_tmp(RegClass::I64, shift_by_ty);
+                        ctx.emit(Inst::mov_r_r(true, reg, bigger_shift_by_gpr));
+
+                        let is_64 = shift_by_ty == types::I64;
+                        let imm = RegMemImm::imm(8);
+                        ctx.emit(Inst::alu_rmi_r(
+                            is_64,
+                            AluRmiROpcode::Add,
+                            imm,
+                            bigger_shift_by_gpr,
+                        ));
+
+                        let bigger_shift_by_xmm = ctx.alloc_tmp(RegClass::V128, dst_ty);
+                        ctx.emit(Inst::gpr_to_xmm(
+                            SseOpcode::Movd,
+                            RegMem::from(bigger_shift_by_gpr),
+                            OperandSize::Size32,
+                            bigger_shift_by_xmm,
+                        ));
+                        RegMemImm::reg(bigger_shift_by_xmm.to_reg())
+                    }
+                    RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
+                };
+
+                // Unpack and shift the lower lanes of `src` into the `dst` register.
+                ctx.emit(Inst::gen_move(dst, src, dst_ty));
+                ctx.emit(Inst::xmm_rm_r(SseOpcode::Punpcklbw, RegMem::from(dst), dst));
+                ctx.emit(Inst::xmm_rmi_reg(
+                    SseOpcode::Psraw,
+                    bigger_shift_by.clone(),
+                    dst,
+                ));
+
+                // Unpack and shift the upper lanes of `src` into a temporary register, `upper_lanes`.
+                let upper_lanes = ctx.alloc_tmp(RegClass::V128, dst_ty);
+                ctx.emit(Inst::gen_move(upper_lanes, src, dst_ty));
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Punpckhbw,
+                    RegMem::from(upper_lanes),
+                    upper_lanes,
+                ));
+                ctx.emit(Inst::xmm_rmi_reg(
+                    SseOpcode::Psraw,
+                    bigger_shift_by,
+                    upper_lanes,
+                ));
+
+                // Merge the upper and lower shifted lanes into `dst`.
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Packsswb,
+                    RegMem::from(upper_lanes),
+                    dst,
+                ));
+            } else if dst_ty == types::I64X2 && op == Opcode::Sshr {
+                // The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older feature sets; newer ones
+                // like AVX512VL and AVX512F include VPSRAQ, a 128-bit instruction that would fit here, but this backend
+                // does not currently have support for EVEX encodings (TODO when EVEX support is available, add an
+                // alternate lowering here). To remedy this, we extract each 64-bit lane to a GPR, shift each using a
+                // scalar instruction, and insert the shifted values back in the `dst` XMM register.
+                let src = put_input_in_reg(ctx, inputs[0]);
+                let dst = get_output_reg(ctx, outputs[0]);
+                ctx.emit(Inst::gen_move(dst, src, dst_ty));
+
+                // Extract the upper and lower lanes into temporary GPRs.
+                let lower_lane = ctx.alloc_tmp(RegClass::I64, types::I64);
+                emit_extract_lane(ctx, src, lower_lane, 0, types::I64);
+                let upper_lane = ctx.alloc_tmp(RegClass::I64, types::I64);
+                emit_extract_lane(ctx, src, upper_lane, 1, types::I64);
+
+                // Shift each value.
+                let mut shift = |reg: Writable<Reg>| {
+                    let kind = ShiftKind::ShiftRightArithmetic;
+                    if let Some(shift_by) = ctx.get_input(insn, 1).constant {
+                        // Mask the shift amount according to Cranelift's semantics.
+                        let shift_by = (shift_by as u8) & (types::I64.bits() as u8 - 1);
+                        ctx.emit(Inst::shift_r(8, kind, Some(shift_by), reg));
+                    } else {
+                        let dynamic_shift_by = put_input_in_reg(ctx, inputs[1]);
+                        let w_rcx = Writable::from_reg(regs::rcx());
+                        ctx.emit(Inst::mov_r_r(true, dynamic_shift_by, w_rcx));
+                        ctx.emit(Inst::shift_r(8, kind, None, reg));
+                    };
+                };
+                shift(lower_lane);
+                shift(upper_lane);
+
+                // Insert the scalar values back into the `dst` vector.
+                emit_insert_lane(ctx, RegMem::from(lower_lane), dst, 0, types::I64);
+                emit_insert_lane(ctx, RegMem::from(upper_lane), dst, 1, types::I64);
             } else {
-                (None, Some(put_input_in_reg(ctx, inputs[1])))
-            };
+                // For the remaining packed shifts not covered above, x86 has implementations that can either:
+                // - shift using an immediate
+                // - shift using a dynamic value given in the lower bits of another XMM register.
+                let src = put_input_in_reg(ctx, inputs[0]);
+                let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
+                let dst = get_output_reg(ctx, outputs[0]);
+                let sse_op = match dst_ty {
+                    types::I16X8 => match op {
+                        Opcode::Ishl => SseOpcode::Psllw,
+                        Opcode::Ushr => SseOpcode::Psrlw,
+                        Opcode::Sshr => SseOpcode::Psraw,
+                        _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
+                    },
+                    types::I32X4 => match op {
+                        Opcode::Ishl => SseOpcode::Pslld,
+                        Opcode::Ushr => SseOpcode::Psrld,
+                        Opcode::Sshr => SseOpcode::Psrad,
+                        _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
+                    },
+                    types::I64X2 => match op {
+                        Opcode::Ishl => SseOpcode::Psllq,
+                        Opcode::Ushr => SseOpcode::Psrlq,
+                        _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
+                    },
+                    _ => unreachable!(),
+                };
 
-            let dst = get_output_reg(ctx, outputs[0]);
+                // If necessary, move the shift index into the lowest bits of a vector register.
+                let shift_by = match shift_by {
+                    RegMemImm::Imm { .. } => shift_by,
+                    RegMemImm::Reg { reg } => {
+                        let tmp_shift_by = ctx.alloc_tmp(RegClass::V128, dst_ty);
+                        ctx.emit(Inst::gpr_to_xmm(
+                            SseOpcode::Movd,
+                            RegMem::reg(reg),
+                            OperandSize::Size32,
+                            tmp_shift_by,
+                        ));
+                        RegMemImm::reg(tmp_shift_by.to_reg())
+                    }
+                    RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
+                };
 
-            let shift_kind = match op {
-                Opcode::Ishl => ShiftKind::ShiftLeft,
-                Opcode::Ushr => ShiftKind::ShiftRightLogical,
-                Opcode::Sshr => ShiftKind::ShiftRightArithmetic,
-                Opcode::Rotl => ShiftKind::RotateLeft,
-                Opcode::Rotr => ShiftKind::RotateRight,
-                _ => unreachable!(),
-            };
+                // Move the `src` to the same register as `dst`.
+                ctx.emit(Inst::gen_move(dst, src, dst_ty));
 
-            let w_rcx = Writable::from_reg(regs::rcx());
-            ctx.emit(Inst::mov_r_r(true, lhs, dst));
-            if count.is_none() {
-                ctx.emit(Inst::mov_r_r(true, rhs.unwrap(), w_rcx));
+                ctx.emit(Inst::xmm_rmi_reg(sse_op, shift_by, dst));
             }
-            ctx.emit(Inst::shift_r(size, shift_kind, count, dst));
         }
 
         Opcode::Ineg => {
@@ -3329,40 +3668,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         };
         debug_assert!(lane < src_ty.lane_count() as u8);
 
-            if !ty.is_float() {
-                let (sse_op, w_bit) = match ty.lane_bits() {
-                    8 => (SseOpcode::Pextrb, false),
-                    16 => (SseOpcode::Pextrw, false),
-                    32 => (SseOpcode::Pextrd, false),
-                    64 => (SseOpcode::Pextrd, true),
-                    _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
-                };
-                let src = RegMem::reg(src);
-                ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit));
-            } else {
-                if lane == 0 {
-                    // Remove the extractlane instruction, leaving the float where it is. The upper
-                    // bits will remain unchanged; for correctness, this relies on Cranelift type
-                    // checking to avoid using those bits.
-                    ctx.emit(Inst::gen_move(dst, src, ty));
-                } else {
-                    // Otherwise, shuffle the bits in `lane` to the lowest lane.
-                    let sse_op = SseOpcode::Pshufd;
-                    let mask = match src_ty {
-                        // Move the value at `lane` to lane 0, copying existing value at lane 0 to
-                        // other lanes. Again, this relies on Cranelift type checking to avoid
-                        // using those bits.
-                        types::F32X4 => 0b00_00_00_00 | lane,
-                        // Move the value at `lane` 1 (we know it must be 1 because of the `if`
-                        // statement above) to lane 0 and leave lane 1 unchanged. The Cranelift type
-                        // checking assumption also applies here.
-                        types::F64X2 => 0b11_10_11_10,
-                        _ => unreachable!(),
-                    };
-                    let src = RegMem::reg(src);
-                    ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false));
-                }
-            }
+            emit_extract_lane(ctx, src, dst, lane, ty);
         }
 
         Opcode::Splat | Opcode::LoadSplat => {
diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
index fe94b84548..fe52e3c503 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
@@ -15,3 +15,78 @@ block0:
 ; nextln: pandn %xmm2, %xmm0
 ; nextln: por %xmm1, %xmm0
 ; not: movdqa
+
+
+
+; 8x16 shifts: these lower to complex sequences of instructions
+
+function %ishl_i8x16(i32) -> i8x16 {
+block0(v0: i32):
+    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    v2 = ishl v1, v0
+    return v2
+}
+; check: movd %edi, %xmm1
+; nextln: psllw %xmm1, %xmm0
+; nextln: lea const(VCodeConstant(0)), %r12
+; nextln: shlq $$4, %rdi
+; nextln: movdqu 0(%r12,%rdi,1), %xmm1
+; nextln: pand %xmm1, %xmm0
+
+function %ushr_i8x16_imm() -> i8x16 {
+block0:
+    v0 = iconst.i32 1
+    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    v2 = ushr v1, v0
+    return v2
+}
+; check: load_const VCodeConstant(1), %xmm0
+; nextln: psrlw $$1, %xmm0
+; nextln: movdqu const(VCodeConstant(0)), %xmm1
+; nextln: pand %xmm1, %xmm0
+
+function %sshr_i8x16(i32) -> i8x16 {
+block0(v0: i32):
+    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    v2 = sshr v1, v0
+    return v2
+}
+; check: addl $$8, %edi
+; nextln: movd %edi, %xmm2
+; nextln: movdqa %xmm0, %xmm1
+; nextln: punpcklbw %xmm1, %xmm1
+; nextln: psraw %xmm2, %xmm1
+; nextln: punpckhbw %xmm0, %xmm0
+; nextln: psraw %xmm2, %xmm0
+
+function %sshr_i8x16_imm(i8x16, i32) -> i8x16 {
+block0(v0: i8x16, v1: i32):
+    v2 = sshr_imm v0, 3
+    return v2
+}
+; check: movdqa %xmm0, %xmm1
+; nextln: movdqa %xmm1, %xmm0
+; nextln: punpcklbw %xmm0, %xmm0
+; nextln: psraw $$11, %xmm0
+; nextln: punpckhbw %xmm1, %xmm1
+; nextln: psraw $$11, %xmm1
+; nextln: packsswb %xmm1, %xmm0
+
+
+
+; i64x2 arithmetic shifts: x86 does not have an instruction for this
+
+function %sshr_i64x2(i64x2, i32) -> i64x2 {
+block0(v0: i64x2, v1: i32):
+    v2 = sshr v0, v1
+    return v2
+}
+; check: pextrd.w $$0, %xmm0, %r12
+; nextln: pextrd.w $$1, %xmm0, %r13
+; nextln: movq %rdi, %rcx
+; nextln: sarq %cl, %r12
+; nextln: movq %rdi, %rcx
+; nextln: sarq %cl, %r13
+; nextln: pinsrd.w $$0, %r12, %xmm1
+; nextln: pinsrd.w $$1, %r13, %xmm1
+; nextln: movdqa %xmm1, %xmm0
diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-run.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-run.clif
index 3f1c814a2c..8ab624d6c2 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-run.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-run.clif
@@ -10,3 +10,111 @@ block0(v0: i8x16, v1: i8x16, v2: i8x16):
 }
 ; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
 ; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
+
+
+
+; shift left
+
+function %ishl_i8x16(i8x16, i32) -> i8x16 {
+block0(v0: i8x16, v1: i32):
+    v2 = ishl v0, v1
+    return v2
+}
+; run: %ishl_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 4) == [0x00 0x10 0x20 0x30 0x40 0x50 0x60 0x70 0x80 0x90 0xa0 0xb0 0xc0 0xd0 0xe0 0xf0]
+
+function %ishl_i16x8(i16x8, i32) -> i16x8 {
+block0(v0: i16x8, v1: i32):
+    v2 = ishl v0, v1
+    return v2
+}
+; run: %ishl_i16x8([1 2 4 8 16 32 64 128], 17) == [0 0 0 0 0 0 0 0]
+
+function %ishl_i32x4(i32x4, i32) -> i32x4 {
+block0(v0: i32x4, v1: i32):
+    v2 = ishl v0, v1
+    return v2
+}
+; run: %ishl_i32x4([1 2 4 8], 1) == [2 4 8 16]
+
+function %ishl_imm_i64x2(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v2 = ishl_imm v0, 1
+    return v2
+}
+; run: %ishl_imm_i64x2([1 0]) == [2 0]
+
+
+
+; shift right (logical)
+
+function %ushr_i8x16(i8x16, i32) -> i8x16 {
+block0(v0: i8x16, v1: i32):
+    v2 = ushr v0, v1
+    return v2
+}
+; run: %ushr_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 1) == [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]
+
+function %ushr_i32x4(i32x4, i32) -> i32x4 {
+block0(v0: i32x4, v1: i32):
+    v2 = ushr v0, v1
+    return v2
+}
+; run: %ushr_i32x4([1 2 4 8], 33) == [0 0 0 0]
+
+function %ushr_i64x2(i64x2, i32) -> i64x2 {
+block0(v0: i64x2, v1: i32):
+    v2 = ushr v0, v1
+    return v2
+}
+; run: %ushr_i64x2([1 2], 1) == [0 1]
+
+
+
+; shift right (arithmetic)
+
+function %sshr_i8x16(i8x16, i32) -> i8x16 {
+block0(v0: i8x16, v1: i32):
+    v2 = sshr v0, v1
+    return v2
+}
+; run: %sshr_i8x16([0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1], 1) == [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8]
+
+function %sshr_i16x8(i16x8, i32) -> i16x8 {
+block0(v0: i16x8, v1: i32):
+    v2 = sshr v0, v1
+    return v2
+}
+; note: because of the shifted-in sign-bit, lane 0 remains -1 == 0xffff, whereas lane 4 has been shifted to -8 == 0xfff8
+; run: %sshr_i16x8([-1 2 4 8 -16 32 64 128], 1) == [-1 1 2 4 -8 16 32 64]
+
+function %sshr_i32x4(i32x4, i32) -> i32x4 {
+block0(v0: i32x4, v1: i32):
+    v2 = sshr v0, v1
+    return v2
+}
+; note: shifting in the sign-bit repeatedly in lane 3 fills the result with 1s (-1 == 0xffff_ffff)
+; run: %sshr_i32x4([1 2 4 -8], 33) == [0 0 0 0xffff_ffff]
+
+function %sshr_i64x2(i64x2, i32) -> i64x2 {
+block0(v0: i64x2, v1: i32):
+    v2 = sshr v0, v1
+    return v2
+}
+; run: %sshr_i64x2([1 -1], 0) == [1 -1]
+; run: %sshr_i64x2([1 -1], 1) == [0 -1] ; note the -1 shift result
+; run: %sshr_i64x2([2 -2], 1) == [1 -1]
+; run: %sshr_i64x2([0x80000000_00000000 0x7FFFFFFF_FFFFFFFF], 63) == [0xFFFFFFFF_FFFFFFFF 0]
+
+function %sshr_imm_i32x4(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = sshr_imm v0, 1
+    return v1
+}
+; run: %sshr_imm_i32x4([1 2 4 -8]) == [0 1 2 -4]
+
+function %sshr_imm_i16x8(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = sshr_imm v0, 1
+    return v1
+}
+; run: %sshr_imm_i16x8([1 2 4 -8 0 0 0 0]) == [0 1 2 -4 0 0 0 0]
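
The mask-table trick used above for `ushr.i8x16` can be hard to follow from the emitted assembly alone. The following standalone Rust sketch is illustrative only: it is not part of the patch, the function name is invented for this note, and it assumes the shift amount is masked to the lane width exactly as Cranelift's semantics require. It models the lowering at the value level (shift in 16-bit lanes, then AND with the per-amount byte mask that the USHR_MASKS table encodes) and checks that it agrees with a plain per-byte shift.

// Scalar model of the `ushr.i8x16` lowering: a 16-bit-lane logical shift
// followed by an AND with the per-shift-amount byte mask, which clears the
// bits that leak across from the neighbouring byte.
fn ushr_i8x16_via_i16x8(src: [u8; 16], amt: u32) -> [u8; 16] {
    let amt = amt & 7; // Cranelift masks the amount to the lane width (8 bits).
    let mask = 0xffu8 >> amt; // One byte of the 16-byte USHR_MASKS entry for `amt`.
    let mut out = [0u8; 16];
    for i in (0..16).step_by(2) {
        // Emulate one 16-bit lane of PSRLW over the little-endian byte pair.
        let lane = u16::from_le_bytes([src[i], src[i + 1]]) >> amt;
        let [lo, hi] = lane.to_le_bytes();
        // PAND with the mask removes the bits shifted in from the high byte.
        out[i] = lo & mask;
        out[i + 1] = hi & mask;
    }
    out
}

fn main() {
    let v: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    // Matches the `ushr_i8x16` run test above: shift right by 1.
    assert_eq!(
        ushr_i8x16_via_i16x8(v, 1),
        [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7]
    );
    // The model agrees with a plain per-byte shift for every amount.
    for amt in 0..8 {
        let expected: Vec<u8> = v.iter().map(|&b| b >> amt).collect();
        assert_eq!(ushr_i8x16_via_i16x8(v, amt).to_vec(), expected);
    }
    println!("mask-based ushr.i8x16 model agrees with the per-byte reference");
}

The `sshr.i8x16` case cannot reuse these masks because they would discard the sign bits, which is why the patch instead widens each byte with PUNPCKLBW/PUNPCKHBW, shifts by `amt + 8` with PSRAW, and re-packs the high bytes with PACKSSWB.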