cranelift: port sshr to ISLE on x64 (#3681)

Nick Fitzgerald, 2022-01-12 07:13:58 -08:00, committed by GitHub
parent 1ef0abb12c, commit 7454f1f3af
13 changed files with 1003 additions and 563 deletions
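This commit moves the `sshr` (arithmetic shift right) lowerings for x64 out of the handwritten `lower_insn_to_regs` match and into ISLE rules. As a reminder of the semantics being ported, here is a minimal scalar reference sketch (not code from this commit; the masking follows the "mask the shift amount according to Cranelift's semantics" comment in the removed code below):

    // CLIF `sshr` on i64: the shift amount is masked to the type width,
    // so shifting by 64 behaves like shifting by 0, and the sign bit
    // fills the vacated upper bits.
    fn sshr_i64_ref(x: i64, amt: u64) -> i64 {
        x >> (amt & 63)
    }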


@@ -1538,13 +1538,18 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::Umin
| Opcode::Bnot
| Opcode::Bitselect
-| Opcode::Vselect => implemented_in_isle(ctx),
+| Opcode::Vselect
+| Opcode::Sshr => implemented_in_isle(ctx),
-Opcode::Ishl | Opcode::Ushr | Opcode::Sshr | Opcode::Rotl | Opcode::Rotr => {
+Opcode::Ishl | Opcode::Ushr | Opcode::Rotl | Opcode::Rotr => {
let dst_ty = ctx.output_ty(insn, 0);
debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);
if !dst_ty.is_vector() && dst_ty.bits() <= 64 {
+if op != Opcode::Rotr {
+implemented_in_isle(ctx);
+}
// Scalar shifts on x86 have various encodings:
// - shift by one bit, e.g. `SAL r/m8, 1` (not used here)
// - shift by an immediate amount, e.g. `SAL r/m8, imm8`
@@ -1557,10 +1562,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
OperandSize::Size32,
extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
),
-Opcode::Sshr => (
-OperandSize::Size32,
-extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32),
-),
Opcode::Rotl | Opcode::Rotr => (
OperandSize::from_ty(dst_ty),
put_input_in_reg(ctx, inputs[0]),
@@ -1590,7 +1591,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let shift_kind = match op {
Opcode::Ishl => ShiftKind::ShiftLeft,
Opcode::Ushr => ShiftKind::ShiftRightLogical,
-Opcode::Sshr => ShiftKind::ShiftRightArithmetic,
Opcode::Rotl => ShiftKind::RotateLeft,
Opcode::Rotr => ShiftKind::RotateRight,
_ => unreachable!(),
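The two deletions above capture the one place scalar `sshr` differed from `ushr`: before issuing a 32-bit shift for a narrow type, the input must be widened with an extension that matches the shift kind (sign-extend for arithmetic, zero-extend for logical; dynamic amounts go through CL, which is why `rcx` appears later in this diff). A hedged sketch of that invariant for i8, assuming Cranelift's amount masking:

    // Arithmetic vs. logical shift of an i8 performed in a 32-bit register:
    // the upper bits must agree with the shift kind for the result's low
    // byte to be correct.
    fn sshr_i8_via_i32(x: i8, amt: u32) -> i8 {
        let wide = x as i32; // sign-extend, as with MOVSX
        (wide >> (amt & 7)) as i8 // 32-bit SAR, then keep the low byte
    }

    fn ushr_i8_via_i32(x: u8, amt: u32) -> u8 {
        let wide = x as u32; // zero-extend, as with MOVZX
        (wide >> (amt & 7)) as u8 // 32-bit SHR
    }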
@@ -1608,50 +1608,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let dst = get_output_reg(ctx, outputs[0]);
match op {
-Opcode::Ishl => {
-emit_shl_i128(ctx, src, dst, amt_src);
-}
-Opcode::Ushr => {
-emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ false);
-}
-Opcode::Sshr => {
-emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ true);
-}
-Opcode::Rotl => {
-// (mov tmp, src)
-// (shl.i128 tmp, amt)
-// (mov dst, src)
-// (ushr.i128 dst, 128-amt)
-// (or dst, tmp)
-let tmp = ctx.alloc_tmp(types::I128);
-emit_shl_i128(ctx, src, tmp, amt_src);
-let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt));
-ctx.emit(Inst::alu_rmi_r(
-OperandSize::Size64,
-AluRmiROpcode::Sub,
-RegMemImm::reg(amt_src),
-inv_amt,
-));
-emit_shr_i128(
-ctx,
-src,
-dst,
-inv_amt.to_reg(),
-/* is_signed = */ false,
-);
-ctx.emit(Inst::alu_rmi_r(
-OperandSize::Size64,
-AluRmiROpcode::Or,
-RegMemImm::reg(tmp.regs()[0].to_reg()),
-dst.regs()[0],
-));
-ctx.emit(Inst::alu_rmi_r(
-OperandSize::Size64,
-AluRmiROpcode::Or,
-RegMemImm::reg(tmp.regs()[1].to_reg()),
-dst.regs()[1],
-));
+Opcode::Ishl | Opcode::Ushr | Opcode::Rotl => {
+implemented_in_isle(ctx);
}
Opcode::Rotr => {
// (mov tmp, src)
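The `Rotl` arm removed above implemented the identity spelled out in its comment: a 128-bit rotate-left is a left shift OR'd with a right shift by `128 - amt`. A plain-`u128` sketch of that identity (illustrative only; the emitted code works on a two-register pair and presumably relies on the i128 shift helpers' own amount handling):

    // rotl(x, amt) == (x << amt) | (x >> (128 - amt)), with the rotate
    // amount masked to the type width.
    fn rotl_i128_ref(x: u128, amt: u32) -> u128 {
        let amt = amt & 127;
        if amt == 0 {
            x // guard: a plain Rust shift by 128 would be out of range
        } else {
            (x << amt) | (x >> (128 - amt))
        }
    }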
@@ -1808,127 +1766,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
_ => SseOpcode::Pand,
};
ctx.emit(Inst::xmm_rm_r(sse_op, RegMem::from(mask_value), dst));
-} else if dst_ty == types::I8X16 && op == Opcode::Sshr {
-// Since the x86 instruction set does not have an 8x16 shift instruction and the approach used for
-// `ishl` and `ushr` cannot be easily used (the masks do not preserve the sign), we use a different
-// approach here: separate the low and high lanes, shift them separately, and merge them into the final
-// result. Visually, this looks like the following, where `src.i8x16 = [s0, s1, ..., s15]`:
-// low.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
-// shifted_low.i16x8 = shift each lane of `low`
-// high.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
-// shifted_high.i16x8 = shift each lane of `high`
-// dst.i8x16 = [s0'', s1'', ..., s15'']
-let src = put_input_in_reg(ctx, inputs[0]);
-let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
-let shift_by_ty = ctx.input_ty(insn, 1);
-let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-// In order for PACKSSWB later to only use the high byte of each 16x8 lane, we shift right an extra 8
-// bits, relying on PSRAW to fill in the upper bits appropriately.
-let bigger_shift_by = match shift_by {
-// When we know the shift amount at compile time, we add the extra shift amount statically.
-RegMemImm::Imm { simm32 } => RegMemImm::imm(simm32 + 8),
-// Otherwise we add instructions to add the extra shift amount and move the value into an XMM
-// register.
-RegMemImm::Reg { reg } => {
-let bigger_shift_by_gpr = ctx.alloc_tmp(shift_by_ty).only_reg().unwrap();
-ctx.emit(Inst::mov_r_r(OperandSize::Size64, reg, bigger_shift_by_gpr));
-let size = if shift_by_ty == types::I64 {
-OperandSize::Size64
-} else {
-OperandSize::Size32
-};
-let imm = RegMemImm::imm(8);
-ctx.emit(Inst::alu_rmi_r(
-size,
-AluRmiROpcode::Add,
-imm,
-bigger_shift_by_gpr,
-));
-let bigger_shift_by_xmm = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
-ctx.emit(Inst::gpr_to_xmm(
-SseOpcode::Movd,
-RegMem::from(bigger_shift_by_gpr),
-OperandSize::Size32,
-bigger_shift_by_xmm,
-));
-RegMemImm::reg(bigger_shift_by_xmm.to_reg())
-}
-RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
-};
-// Unpack and shift the lower lanes of `src` into the `dst` register.
-ctx.emit(Inst::gen_move(dst, src, dst_ty));
-ctx.emit(Inst::xmm_rm_r(SseOpcode::Punpcklbw, RegMem::from(dst), dst));
-ctx.emit(Inst::xmm_rmi_reg(
-SseOpcode::Psraw,
-bigger_shift_by.clone(),
-dst,
-));
-// Unpack and shift the upper lanes of `src` into a temporary register, `upper_lanes`.
-let upper_lanes = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
-ctx.emit(Inst::gen_move(upper_lanes, src, dst_ty));
-ctx.emit(Inst::xmm_rm_r(
-SseOpcode::Punpckhbw,
-RegMem::from(upper_lanes),
-upper_lanes,
-));
-ctx.emit(Inst::xmm_rmi_reg(
-SseOpcode::Psraw,
-bigger_shift_by,
-upper_lanes,
-));
-// Merge the upper and lower shifted lanes into `dst`.
-ctx.emit(Inst::xmm_rm_r(
-SseOpcode::Packsswb,
-RegMem::from(upper_lanes),
-dst,
-));
-} else if dst_ty == types::I64X2 && op == Opcode::Sshr {
-// The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older feature sets; newer ones
-// like AVX512VL + AVX512F include VPSRAQ, a 128-bit instruction that would fit here, but this backend
-// does not currently have support for EVEX encodings (TODO: when EVEX support is available, add an
-// alternate lowering here). To remedy this, we extract each 64-bit lane to a GPR, shift each using a
-// scalar instruction, and insert the shifted values back into the `dst` XMM register.
-let src = put_input_in_reg(ctx, inputs[0]);
-let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-ctx.emit(Inst::gen_move(dst, src, dst_ty));
-// Extract the upper and lower lanes into temporary GPRs.
-let lower_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-emit_extract_lane(ctx, src, lower_lane, 0, types::I64);
-let upper_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-emit_extract_lane(ctx, src, upper_lane, 1, types::I64);
-// Shift each value.
-let mut shift = |reg: Writable<Reg>| {
-let kind = ShiftKind::ShiftRightArithmetic;
-if let Some(shift_by) = ctx.get_input_as_source_or_const(insn, 1).constant {
-// Mask the shift amount according to Cranelift's semantics.
-let shift_by = (shift_by as u8) & (types::I64.bits() as u8 - 1);
-ctx.emit(Inst::shift_r(
-OperandSize::Size64,
-kind,
-Some(shift_by),
-reg,
-));
-} else {
-let dynamic_shift_by = put_input_in_reg(ctx, inputs[1]);
-let w_rcx = Writable::from_reg(regs::rcx());
-ctx.emit(Inst::mov_r_r(OperandSize::Size64, dynamic_shift_by, w_rcx));
-ctx.emit(Inst::shift_r(OperandSize::Size64, kind, None, reg));
-};
-};
-shift(lower_lane);
-shift(upper_lane);
-// Insert the scalar values back into the `dst` vector.
-emit_insert_lane(ctx, RegMem::from(lower_lane), dst, 0, types::I64);
-emit_insert_lane(ctx, RegMem::from(upper_lane), dst, 1, types::I64);
} else {
// For the remaining packed shifts not covered above, x86 has implementations that can either:
// - shift using an immediate
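A scalar model may make the removed `i8x16` unpack/shift/pack sequence easier to follow. This is a hedged sketch of the trick, not the emitted instructions; the `& 7` lane-width masking is an assumption mirroring the explicit `& 63` in the `i64x2` path above:

    // Model of PUNPCK{L,H}BW + PSRAW(amt + 8) + PACKSSWB: pair each byte
    // with itself in a 16-bit lane, shift by 8 extra bits so the result
    // lands in the low byte already sign-correct, then narrow. The final
    // pack's saturation never fires because the value fits in i8.
    fn sshr_i8x16_ref(src: [i8; 16], amt: u32) -> [i8; 16] {
        let amt = amt & 7; // assumed lane-width masking of the amount
        let mut dst = [0i8; 16];
        for i in 0..16 {
            let b = src[i];
            // (b, b) as one i16 lane: high byte b, low byte b.
            let widened = ((b as i16) << 8) | (b as u8 as i16);
            dst[i] = (widened >> (amt + 8)) as i8;
        }
        dst
    }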
@@ -1940,13 +1777,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
types::I16X8 => match op {
Opcode::Ishl => SseOpcode::Psllw,
Opcode::Ushr => SseOpcode::Psrlw,
-Opcode::Sshr => SseOpcode::Psraw,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
},
types::I32X4 => match op {
Opcode::Ishl => SseOpcode::Pslld,
Opcode::Ushr => SseOpcode::Psrld,
-Opcode::Sshr => SseOpcode::Psrad,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
},
types::I64X2 => match op {
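The surviving arms pick one packed-shift opcode per lane type. One quirk of this SSE family, and the reason the removed `i8x16` path could rely on "PSRAW to fill in the upper bits": packed shifts saturate oversized counts instead of masking them. A per-lane reference sketch:

    // PSRAW model: a count of 16 or more fills each i16 lane with its
    // sign bit, which is equivalent to shifting by 15.
    fn psraw_ref(lanes: [i16; 8], amt: u32) -> [i16; 8] {
        let amt = amt.min(15);
        lanes.map(|lane| lane >> amt)
    }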