[machinst x64]: implement packed shifts
@@ -258,6 +258,58 @@ fn emit_insert_lane<C: LowerCtx<I = Inst>>(
    }
}

/// Emit an instruction to extract a lane of `src` into `dst`.
fn emit_extract_lane<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    src: Reg,
    dst: Writable<Reg>,
    lane: u8,
    ty: Type,
) {
    if !ty.is_float() {
        let (sse_op, is64) = match ty.lane_bits() {
            8 => (SseOpcode::Pextrb, false),
            16 => (SseOpcode::Pextrw, false),
            32 => (SseOpcode::Pextrd, false),
            64 => (SseOpcode::Pextrd, true),
            _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
        };
        let src = RegMem::reg(src);
        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64));
    } else if ty == types::F32 || ty == types::F64 {
        if lane == 0 {
            // Remove the extractlane instruction, leaving the float where it is. The upper
            // bits will remain unchanged; for correctness, this relies on Cranelift type
            // checking to avoid using those bits.
            ctx.emit(Inst::gen_move(dst, src, ty));
        } else {
            // Otherwise, shuffle the bits in `lane` to the lowest lane.
            let sse_op = SseOpcode::Pshufd;
            let mask = match ty {
                // Move the value at `lane` to lane 0, copying existing value at lane 0 to
                // other lanes. Again, this relies on Cranelift type checking to avoid
                // using those bits.
                types::F32 => {
                    assert!(lane > 0 && lane < 4);
                    0b00_00_00_00 | lane
                }
                // Move the value at lane 1 (we know `lane` must be 1 because of the `if`
                // statement above) to lane 0 and leave lane 1 unchanged. The Cranelift type
                // checking assumption also applies here.
                types::F64 => {
                    assert!(lane == 1);
                    0b11_10_11_10
                }
                _ => unreachable!(),
            };
            let src = RegMem::reg(src);
            ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false));
        }
    } else {
        panic!("unable to emit extractlane for type: {}", ty)
    }
}
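
// For illustration only (hypothetical helper, not part of this patch): a scalar model
// of the PSHUFD immediate used above. Each 2-bit field `i` of `imm` selects which
// 32-bit source lane is copied into destination lane `i`, which is why the masks
// `0b00_00_00_00 | lane` (F32) and `0b11_10_11_10` (F64) move the requested lane into
// lane 0.
fn pshufd_model(src: [u32; 4], imm: u8) -> [u32; 4] {
    let mut dst = [0u32; 4];
    for i in 0..4 {
        let sel = ((imm >> (2 * i)) & 0b11) as usize; // 2-bit selector for destination lane i
        dst[i] = src[sel];
    }
    dst
}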

/// Emits an int comparison instruction.
///
/// Note: make sure that there are no instructions modifying the flags between a call to this
@@ -930,6 +982,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            let dst_ty = ctx.output_ty(insn, 0);
            debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);

            if !dst_ty.is_vector() {
                // Scalar shifts on x86 have various encodings:
                // - shift by one bit, e.g. `SAL r/m8, 1` (not used here)
                // - shift by an immediate amount, e.g. `SAL r/m8, imm8`
                // - shift by a dynamic amount but only from the CL register, e.g. `SAL r/m8, CL`.
                // This implementation uses the last two encoding methods.
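                // For illustration only (hypothetical helper, not in this patch): choosing
                // between those two encodings amounts to asking whether the shift amount is a
                // known constant. `Some(imm)` corresponds to the imm8 form and `None` to the
                // CL form; the masking here assumes the amount is taken modulo the operand
                // width, mirroring the constant case handled further below.
                fn scalar_shift_count(known_amount: Option<u64>, operand_bits: u8) -> Option<u8> {
                    known_amount.map(|amt| (amt as u8) & (operand_bits - 1))
                }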
                let (size, lhs) = match dst_ty {
                    types::I8 | types::I16 => match op {
                        Opcode::Ishl => (4, put_input_in_reg(ctx, inputs[0])),
@@ -946,7 +1004,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                        }
                        _ => unreachable!(),
                    },
-                    types::I32 | types::I64 => (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0])),
+                    types::I32 | types::I64 => {
+                        (dst_ty.bytes() as u8, put_input_in_reg(ctx, inputs[0]))
+                    }
                    _ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty),
                };

@@ -975,6 +1035,285 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    ctx.emit(Inst::mov_r_r(true, rhs.unwrap(), w_rcx));
                }
                ctx.emit(Inst::shift_r(size, shift_kind, count, dst));
            } else if dst_ty == types::I8X16 && (op == Opcode::Ishl || op == Opcode::Ushr) {
                // Since the x86 instruction set does not have any 8x16 shift instructions (even in higher feature sets
                // like AVX), we lower the `ishl.i8x16` and `ushr.i8x16` to a sequence of instructions. The basic idea,
                // whether the `shift_by` amount is an immediate or not, is to use a 16x8 shift and then mask off the
                // incorrect bits to 0s (see below for handling signs in `sshr.i8x16`).
                let src = put_input_in_reg(ctx, inputs[0]);
                let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
                let dst = get_output_reg(ctx, outputs[0]);

                // If necessary, move the shift index into the lowest bits of a vector register.
                let shift_by_moved = match &shift_by {
                    RegMemImm::Imm { .. } => shift_by.clone(),
                    RegMemImm::Reg { reg } => {
                        let tmp_shift_by = ctx.alloc_tmp(RegClass::V128, dst_ty);
                        ctx.emit(Inst::gpr_to_xmm(
                            SseOpcode::Movd,
                            RegMem::reg(*reg),
                            OperandSize::Size32,
                            tmp_shift_by,
                        ));
                        RegMemImm::reg(tmp_shift_by.to_reg())
                    }
                    RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
                };

                // Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be correct for half of the lanes;
                // the others must be fixed up with the mask below.
                let shift_opcode = match op {
                    Opcode::Ishl => SseOpcode::Psllw,
                    Opcode::Ushr => SseOpcode::Psrlw,
                    _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
                };
                ctx.emit(Inst::gen_move(dst, src, dst_ty));
                ctx.emit(Inst::xmm_rmi_reg(shift_opcode, shift_by_moved, dst));

                // Choose which mask to use to fixup the shifted lanes. Since we must use a 16x8 shift, we need to fix
                // up the bits that migrate from one half of the lane to the other. Each 16-byte mask (which rustfmt
                // forces to multiple lines) is indexed by the shift amount: e.g. if we shift right by 0 (no movement),
                // we want to retain all the bits so we mask with `0xff`; if we shift right by 1, we want to retain all
                // bits except the MSB so we mask with `0x7f`; etc.
                const USHR_MASKS: [u8; 128] = [
                    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                    0xff, 0xff, 0xff, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
                    0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
                    0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x1f, 0x1f, 0x1f, 0x1f,
                    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x0f,
                    0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
                    0x0f, 0x0f, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
                    0x07, 0x07, 0x07, 0x07, 0x07, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
                    0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01,
                    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                ];
                const SHL_MASKS: [u8; 128] = [
                    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                    0xff, 0xff, 0xff, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
                    0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
                    0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xf8, 0xf8, 0xf8, 0xf8,
                    0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0,
                    0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
                    0xf0, 0xf0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
                    0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
                    0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x80, 0x80, 0x80, 0x80, 0x80,
                    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                ];
                let mask = match op {
                    Opcode::Ishl => &SHL_MASKS,
                    Opcode::Ushr => &USHR_MASKS,
                    _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
                };
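                // For illustration only (hypothetical helpers, not in this patch): each 16-byte row of the
                // tables above could equivalently be computed from the shift amount -- a logical right shift
                // by `amt` leaves `8 - amt` valid bits per byte, while a left shift keeps only the top bits.
                fn ushr_mask_row(amt: u32) -> [u8; 16] {
                    [0xffu8 >> amt; 16] // e.g. amt = 1 -> 0x7f, amt = 3 -> 0x1f
                }
                fn shl_mask_row(amt: u32) -> [u8; 16] {
                    [0xffu8 << amt; 16] // e.g. amt = 1 -> 0xfe, amt = 3 -> 0xf8
                }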

                // Figure out the address of the shift mask.
                let mask_address = match shift_by {
                    RegMemImm::Imm { simm32 } => {
                        // When the shift amount is known, we can statically (i.e. at compile time) determine the mask to
                        // use and only emit that.
                        debug_assert!(simm32 < 8);
                        let mask_offset = simm32 as usize * 16;
                        let mask_constant = ctx.use_constant(VCodeConstantData::WellKnown(
                            &mask[mask_offset..mask_offset + 16],
                        ));
                        SyntheticAmode::ConstantOffset(mask_constant)
                    }
                    RegMemImm::Reg { reg } => {
                        // Otherwise, we must emit the entire mask table and dynamically (i.e. at run time) find the correct
                        // mask offset in the table. We do this using LEA to find the base address of the mask table and then
                        // complex addressing to offset to the right mask: `base_address + shift_by * 16`
                        let base_mask_address = ctx.alloc_tmp(RegClass::I64, types::I64);
                        let mask_offset = ctx.alloc_tmp(RegClass::I64, types::I64);
                        let mask_constant = ctx.use_constant(VCodeConstantData::WellKnown(mask));
                        ctx.emit(Inst::lea(
                            SyntheticAmode::ConstantOffset(mask_constant),
                            base_mask_address,
                        ));
                        ctx.emit(Inst::gen_move(mask_offset, reg, types::I64));
                        ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(4), mask_offset));
                        Amode::imm_reg_reg_shift(
                            0,
                            base_mask_address.to_reg(),
                            mask_offset.to_reg(),
                            0,
                        )
                        .into()
                    }
                    RegMemImm::Mem { addr: _ } => unimplemented!("load mask address"),
                };
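                // For illustration only (hypothetical helper, not in this patch): the dynamic case computes
                // the same address a plain table lookup would -- the mask rows are 16 bytes apart, hence the
                // shift-left by 4 before the scaled addressing mode.
                fn mask_row_address(table_base: u64, shift_amount: u64) -> u64 {
                    table_base + (shift_amount << 4) // 16 bytes per mask row
                }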

                // Load the mask into a temporary register, `mask_value`.
                let mask_value = ctx.alloc_tmp(RegClass::V128, dst_ty);
                ctx.emit(Inst::load(dst_ty, mask_address, mask_value, ExtKind::None));

                // Remove the bits that would have disappeared in a true 8x16 shift. TODO in the future,
                // this AND instruction could be coalesced with the load above.
                let sse_op = match dst_ty {
                    types::F32X4 => SseOpcode::Andps,
                    types::F64X2 => SseOpcode::Andpd,
                    _ => SseOpcode::Pand,
                };
                ctx.emit(Inst::xmm_rm_r(sse_op, RegMem::from(mask_value), dst));
            } else if dst_ty == types::I8X16 && op == Opcode::Sshr {
                // Since the x86 instruction set does not have an 8x16 shift instruction and the approach used for
                // `ishl` and `ushr` cannot be easily used (the masks do not preserve the sign), we use a different
                // approach here: separate the low and high lanes, shift them separately, and merge them into the final
                // result. Visually, this looks like the following, where `src.i8x16 = [s0, s1, ..., s15]`:
                //   low.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
                //   shifted_low.i16x8 = shift each lane of `low`
                //   high.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
                //   shifted_high.i16x8 = shift each lane of `high`
                //   dst.i8x16 = [s0'', s1'', ..., s15'']
                let src = put_input_in_reg(ctx, inputs[0]);
                let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
                let shift_by_ty = ctx.input_ty(insn, 1);
                let dst = get_output_reg(ctx, outputs[0]);

                // In order for PACKSSWB later to only use the high byte of each 16x8 lane, we shift right an extra 8
                // bits, relying on PSRAW to fill in the upper bits appropriately.
                let bigger_shift_by = match shift_by {
                    // When we know the shift amount at compile time, we add the extra shift amount statically.
                    RegMemImm::Imm { simm32 } => RegMemImm::imm(simm32 + 8),
                    // Otherwise we add instructions to add the extra shift amount and move the value into an XMM
                    // register.
                    RegMemImm::Reg { reg } => {
                        let bigger_shift_by_gpr = ctx.alloc_tmp(RegClass::I64, shift_by_ty);
                        ctx.emit(Inst::mov_r_r(true, reg, bigger_shift_by_gpr));

                        let is_64 = shift_by_ty == types::I64;
                        let imm = RegMemImm::imm(8);
                        ctx.emit(Inst::alu_rmi_r(
                            is_64,
                            AluRmiROpcode::Add,
                            imm,
                            bigger_shift_by_gpr,
                        ));

                        let bigger_shift_by_xmm = ctx.alloc_tmp(RegClass::V128, dst_ty);
                        ctx.emit(Inst::gpr_to_xmm(
                            SseOpcode::Movd,
                            RegMem::from(bigger_shift_by_gpr),
                            OperandSize::Size32,
                            bigger_shift_by_xmm,
                        ));
                        RegMemImm::reg(bigger_shift_by_xmm.to_reg())
                    }
                    RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
                };

                // Unpack and shift the lower lanes of `src` into the `dst` register.
                ctx.emit(Inst::gen_move(dst, src, dst_ty));
                ctx.emit(Inst::xmm_rm_r(SseOpcode::Punpcklbw, RegMem::from(dst), dst));
                ctx.emit(Inst::xmm_rmi_reg(
                    SseOpcode::Psraw,
                    bigger_shift_by.clone(),
                    dst,
                ));

                // Unpack and shift the upper lanes of `src` into a temporary register, `upper_lanes`.
                let upper_lanes = ctx.alloc_tmp(RegClass::V128, dst_ty);
                ctx.emit(Inst::gen_move(upper_lanes, src, dst_ty));
                ctx.emit(Inst::xmm_rm_r(
                    SseOpcode::Punpckhbw,
                    RegMem::from(upper_lanes),
                    upper_lanes,
                ));
                ctx.emit(Inst::xmm_rmi_reg(
                    SseOpcode::Psraw,
                    bigger_shift_by,
                    upper_lanes,
                ));

                // Merge the upper and lower shifted lanes into `dst`.
                ctx.emit(Inst::xmm_rm_r(
                    SseOpcode::Packsswb,
                    RegMem::from(upper_lanes),
                    dst,
                ));
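                // For illustration only (hypothetical helper, not in this patch): a scalar model of the
                // per-byte effect of the sequence above, assuming a shift amount below 8. PUNPCKLBW/PUNPCKHBW
                // with the same source duplicates each byte `b` into a 16-bit lane `(b << 8) | b`, PSRAW by
                // `amt + 8` shifts the low copy out while the sign bit fills in from the top, and PACKSSWB
                // narrows the (already in-range) result back to 8 bits.
                fn sshr_i8_lane_model(b: i8, amt: u32) -> i8 {
                    let widened = ((b as u8 as u16) << 8) | (b as u8 as u16); // punpck{l,h}bw b, b
                    ((widened as i16) >> (amt + 8)) as i8 // psraw by amt + 8, then packsswb
                }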
            } else if dst_ty == types::I64X2 && op == Opcode::Sshr {
                // The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older feature sets; newer ones
                // like AVX512VL and AVX512F include VPSRAQ, a 128-bit instruction that would fit here, but this backend
                // does not currently have support for EVEX encodings (TODO when EVEX support is available, add an
                // alternate lowering here). To remedy this, we extract each 64-bit lane to a GPR, shift each using a
                // scalar instruction, and insert the shifted values back in the `dst` XMM register.
                let src = put_input_in_reg(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]);
                ctx.emit(Inst::gen_move(dst, src, dst_ty));

                // Extract the upper and lower lanes into temporary GPRs.
                let lower_lane = ctx.alloc_tmp(RegClass::I64, types::I64);
                emit_extract_lane(ctx, src, lower_lane, 0, types::I64);
                let upper_lane = ctx.alloc_tmp(RegClass::I64, types::I64);
                emit_extract_lane(ctx, src, upper_lane, 1, types::I64);

                // Shift each value.
                let mut shift = |reg: Writable<Reg>| {
                    let kind = ShiftKind::ShiftRightArithmetic;
                    if let Some(shift_by) = ctx.get_input(insn, 1).constant {
                        // Mask the shift amount according to Cranelift's semantics.
                        let shift_by = (shift_by as u8) & (types::I64.bits() as u8 - 1);
                        ctx.emit(Inst::shift_r(8, kind, Some(shift_by), reg));
                    } else {
                        let dynamic_shift_by = put_input_in_reg(ctx, inputs[1]);
                        let w_rcx = Writable::from_reg(regs::rcx());
                        ctx.emit(Inst::mov_r_r(true, dynamic_shift_by, w_rcx));
                        ctx.emit(Inst::shift_r(8, kind, None, reg));
                    };
                };
                shift(lower_lane);
                shift(upper_lane);

                // Insert the scalar values back into the `dst` vector.
                emit_insert_lane(ctx, RegMem::from(lower_lane), dst, 0, types::I64);
                emit_insert_lane(ctx, RegMem::from(upper_lane), dst, 1, types::I64);
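                // For illustration only (hypothetical helper, not in this patch): the reference behaviour this
                // lane-by-lane lowering aims for, with the shift amount masked to the lane width as in the
                // constant case above (the SAR-by-CL form masks the count the same way in hardware).
                fn sshr_i64x2_model(lanes: [i64; 2], amt: u32) -> [i64; 2] {
                    let amt = amt & 63; // mask the shift amount to the 64-bit lane width
                    [lanes[0] >> amt, lanes[1] >> amt]
                }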
            } else {
                // For the remaining packed shifts not covered above, x86 has implementations that can either:
                // - shift using an immediate
                // - shift using a dynamic value given in the lower bits of another XMM register.
                let src = put_input_in_reg(ctx, inputs[0]);
                let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
                let dst = get_output_reg(ctx, outputs[0]);
                let sse_op = match dst_ty {
                    types::I16X8 => match op {
                        Opcode::Ishl => SseOpcode::Psllw,
                        Opcode::Ushr => SseOpcode::Psrlw,
                        Opcode::Sshr => SseOpcode::Psraw,
                        _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
                    },
                    types::I32X4 => match op {
                        Opcode::Ishl => SseOpcode::Pslld,
                        Opcode::Ushr => SseOpcode::Psrld,
                        Opcode::Sshr => SseOpcode::Psrad,
                        _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
                    },
                    types::I64X2 => match op {
                        Opcode::Ishl => SseOpcode::Psllq,
                        Opcode::Ushr => SseOpcode::Psrlq,
                        _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
                    },
                    _ => unreachable!(),
                };

                // If necessary, move the shift index into the lowest bits of a vector register.
                let shift_by = match shift_by {
                    RegMemImm::Imm { .. } => shift_by,
                    RegMemImm::Reg { reg } => {
                        let tmp_shift_by = ctx.alloc_tmp(RegClass::V128, dst_ty);
                        ctx.emit(Inst::gpr_to_xmm(
                            SseOpcode::Movd,
                            RegMem::reg(reg),
                            OperandSize::Size32,
                            tmp_shift_by,
                        ));
                        RegMemImm::reg(tmp_shift_by.to_reg())
                    }
                    RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
                };

                // Move the `src` to the same register as `dst`.
                ctx.emit(Inst::gen_move(dst, src, dst_ty));

                ctx.emit(Inst::xmm_rmi_reg(sse_op, shift_by, dst));
            }
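            // For illustration only (hypothetical helper, not in this patch): the per-lane behaviour of the
            // SSE shift-by-XMM instructions used in the branch above. The count comes from the low 64 bits of
            // the other operand and is not masked, so counts of lane-width or more zero the lane (the runtests
            // for `ishl.i16x8` by 17 and shifts of i32x4 by 33 below rely on exactly this).
            fn psrld_lane_model(lane: u32, count: u64) -> u32 {
                if count >= 32 {
                    0
                } else {
                    lane >> count
                }
            }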
        }

        Opcode::Ineg => {
@@ -3329,40 +3668,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            };
            debug_assert!(lane < src_ty.lane_count() as u8);

-            if !ty.is_float() {
-                let (sse_op, w_bit) = match ty.lane_bits() {
-                    8 => (SseOpcode::Pextrb, false),
-                    16 => (SseOpcode::Pextrw, false),
-                    32 => (SseOpcode::Pextrd, false),
-                    64 => (SseOpcode::Pextrd, true),
-                    _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
-                };
-                let src = RegMem::reg(src);
-                ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit));
-            } else {
-                if lane == 0 {
-                    // Remove the extractlane instruction, leaving the float where it is. The upper
-                    // bits will remain unchanged; for correctness, this relies on Cranelift type
-                    // checking to avoid using those bits.
-                    ctx.emit(Inst::gen_move(dst, src, ty));
-                } else {
-                    // Otherwise, shuffle the bits in `lane` to the lowest lane.
-                    let sse_op = SseOpcode::Pshufd;
-                    let mask = match src_ty {
-                        // Move the value at `lane` to lane 0, copying existing value at lane 0 to
-                        // other lanes. Again, this relies on Cranelift type checking to avoid
-                        // using those bits.
-                        types::F32X4 => 0b00_00_00_00 | lane,
-                        // Move the value at `lane` 1 (we know it must be 1 because of the `if`
-                        // statement above) to lane 0 and leave lane 1 unchanged. The Cranelift type
-                        // checking assumption also applies here.
-                        types::F64X2 => 0b11_10_11_10,
-                        _ => unreachable!(),
-                    };
-                    let src = RegMem::reg(src);
-                    ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false));
-                }
-            }
+            emit_extract_lane(ctx, src, dst, lane, ty);
        }

        Opcode::Splat | Opcode::LoadSplat => {
@@ -15,3 +15,78 @@ block0:
; nextln: pandn %xmm2, %xmm0
; nextln: por %xmm1, %xmm0
; not: movdqa

; 8x16 shifts: these lower to complex sequences of instructions

function %ishl_i8x16(i32) -> i8x16 {
block0(v0: i32):
    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
    v2 = ishl v1, v0
    return v2
}
; check: movd %edi, %xmm1
; nextln: psllw %xmm1, %xmm0
; nextln: lea const(VCodeConstant(0)), %r12
; nextln: shlq $$4, %rdi
; nextln: movdqu 0(%r12,%rdi,1), %xmm1
; nextln: pand %xmm1, %xmm0

function %ushr_i8x16_imm() -> i8x16 {
block0:
    v0 = iconst.i32 1
    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
    v2 = ushr v1, v0
    return v2
}
; check: load_const VCodeConstant(1), %xmm0
; nextln: psrlw $$1, %xmm0
; nextln: movdqu const(VCodeConstant(0)), %xmm1
; nextln: pand %xmm1, %xmm0

function %sshr_i8x16(i32) -> i8x16 {
block0(v0: i32):
    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
    v2 = sshr v1, v0
    return v2
}
; check: addl $$8, %edi
; nextln: movd %edi, %xmm2
; nextln: movdqa %xmm0, %xmm1
; nextln: punpcklbw %xmm1, %xmm1
; nextln: psraw %xmm2, %xmm1
; nextln: punpckhbw %xmm0, %xmm0
; nextln: psraw %xmm2, %xmm0

function %sshr_i8x16_imm(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
    v2 = sshr_imm v0, 3
    return v2
}
; check: movdqa %xmm0, %xmm1
; nextln: movdqa %xmm1, %xmm0
; nextln: punpcklbw %xmm0, %xmm0
; nextln: psraw $$11, %xmm0
; nextln: punpckhbw %xmm1, %xmm1
; nextln: psraw $$11, %xmm1
; nextln: packsswb %xmm1, %xmm0

; i64x2 arithmetic shifts: x86 does not have an instruction for this

function %sshr_i64x2(i64x2, i32) -> i64x2 {
block0(v0: i64x2, v1: i32):
    v2 = sshr v0, v1
    return v2
}
; check: pextrd.w $$0, %xmm0, %r12
; nextln: pextrd.w $$1, %xmm0, %r13
; nextln: movq %rdi, %rcx
; nextln: sarq %cl, %r12
; nextln: movq %rdi, %rcx
; nextln: sarq %cl, %r13
; nextln: pinsrd.w $$0, %r12, %xmm1
; nextln: pinsrd.w $$1, %r13, %xmm1
; nextln: movdqa %xmm1, %xmm0

@@ -10,3 +10,111 @@ block0(v0: i8x16, v1: i8x16, v2: i8x16):
}
; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]


; shift left

function %ishl_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
    v2 = ishl v0, v1
    return v2
}
; run: %ishl_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 4) == [0x00 0x10 0x20 0x30 0x40 0x50 0x60 0x70 0x80 0x90 0xa0 0xb0 0xc0 0xd0 0xe0 0xf0]

function %ishl_i16x8(i16x8, i32) -> i16x8 {
block0(v0: i16x8, v1: i32):
    v2 = ishl v0, v1
    return v2
}
; run: %ishl_i16x8([1 2 4 8 16 32 64 128], 17) == [0 0 0 0 0 0 0 0]

function %ishl_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
    v2 = ishl v0, v1
    return v2
}
; run: %ishl_i32x4([1 2 4 8], 1) == [2 4 8 16]

function %ishl_imm_i64x2(i64x2) -> i64x2 {
block0(v0: i64x2):
    v2 = ishl_imm v0, 1
    return v2
}
; run: %ishl_imm_i64x2([1 0]) == [2 0]


; shift right (logical)

function %ushr_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
    v2 = ushr v0, v1
    return v2
}
; run: %ushr_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 1) == [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]

function %ushr_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
    v2 = ushr v0, v1
    return v2
}
; run: %ushr_i32x4([1 2 4 8], 33) == [0 0 0 0]

function %ushr_i64x2(i64x2, i32) -> i64x2 {
block0(v0: i64x2, v1: i32):
    v2 = ushr v0, v1
    return v2
}
; run: %ushr_i64x2([1 2], 1) == [0 1]


; shift right (arithmetic)

function %sshr_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
    v2 = sshr v0, v1
    return v2
}
; run: %sshr_i8x16([0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1], 1) == [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8]

function %sshr_i16x8(i16x8, i32) -> i16x8 {
block0(v0: i16x8, v1: i32):
    v2 = sshr v0, v1
    return v2
}
; note: because of the shifted-in sign-bit, lane 0 remains -1 == 0xffff, whereas lane 4 has been shifted to -8 == 0xfff8
; run: %sshr_i16x8([-1 2 4 8 -16 32 64 128], 1) == [-1 1 2 4 -8 16 32 64]

function %sshr_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
    v2 = sshr v0, v1
    return v2
}
; note: shifting in the sign-bit repeatedly in lane 3 fills the result with 1s (-1 == 0xffff_ffff)
; run: %sshr_i32x4([1 2 4 -8], 33) == [0 0 0 0xffff_ffff]

function %sshr_i64x2(i64x2, i32) -> i64x2 {
block0(v0: i64x2, v1: i32):
    v2 = sshr v0, v1
    return v2
}
; run: %sshr_i64x2([1 -1], 0) == [1 -1]
; run: %sshr_i64x2([1 -1], 1) == [0 -1] ; note the -1 shift result
; run: %sshr_i64x2([2 -2], 1) == [1 -1]
; run: %sshr_i64x2([0x80000000_00000000 0x7FFFFFFF_FFFFFFFF], 63) == [0xFFFFFFFF_FFFFFFFF 0]

function %sshr_imm_i32x4(i32x4) -> i32x4 {
block0(v0: i32x4):
    v1 = sshr_imm v0, 1
    return v1
}
; run: %sshr_imm_i32x4([1 2 4 -8]) == [0 1 2 -4]

function %sshr_imm_i16x8(i16x8) -> i16x8 {
block0(v0: i16x8):
    v1 = sshr_imm v0, 1
    return v1
}
; run: %sshr_imm_i16x8([1 2 4 -8 0 0 0 0]) == [0 1 2 -4 0 0 0 0]