cranelift: Port rotr lowering to ISLE on x64

Nick Fitzgerald
2022-01-13 13:22:48 -08:00
parent 4120e40318
commit a41fdb0303
6 changed files with 422 additions and 678 deletions


@@ -877,316 +877,6 @@ fn emit_bitrev<C: LowerCtx<I = Inst>>(ctx: &mut C, src: Reg, dst: Writable<Reg>,
    ctx.emit(Inst::gen_move(dst, tmp0.to_reg(), types::I64));
}
fn emit_shl_i128<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    src: ValueRegs<Reg>,
    dst: ValueRegs<Writable<Reg>>,
    amt_src: Reg,
) {
    let src_lo = src.regs()[0];
    let src_hi = src.regs()[1];
    let dst_lo = dst.regs()[0];
    let dst_hi = dst.regs()[1];

    // mov tmp1, src_lo
    // shl tmp1, amt_src
    // mov tmp2, src_hi
    // shl tmp2, amt_src
    // mov amt, 64
    // sub amt, amt_src
    // mov tmp3, src_lo
    // shr tmp3, amt
    // xor dst_lo, dst_lo
    // test amt_src, 127
    // cmovz tmp3, dst_lo
    // or tmp3, tmp2
    // mov amt, amt_src
    // and amt, 64
    // cmovz dst_hi, tmp3
    // cmovz dst_lo, tmp1
    // cmovnz dst_hi, tmp1
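    //
    // In words: x86 variable shifts mask a 64-bit count to 6 bits, so the
    // 128-bit shift is assembled from two 64-bit cases and selected with
    // cmovs. For amt < 64 the result is
    //   dst_hi = (src_hi << amt) | (src_lo >> (64 - amt)), dst_lo = src_lo << amt;
    // for amt >= 64 it is
    //   dst_hi = src_lo << (amt - 64), dst_lo = 0.
    // The `test amt_src, 127` guards the amt % 128 == 0 case, where
    // `src_lo >> (64 - 0)` would be masked to a shift by zero and would
    // otherwise leak src_lo's bits into the high half.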
    let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    ctx.emit(Inst::gen_move(tmp1, src_lo, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt_src,
        types::I64,
    ));
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftLeft,
        None,
        tmp1,
    ));
    ctx.emit(Inst::gen_move(tmp2, src_hi, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt_src,
        types::I64,
    ));
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftLeft,
        None,
        tmp2,
    ));
    ctx.emit(Inst::imm(OperandSize::Size64, 64, amt));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Sub,
        RegMemImm::reg(amt_src),
        amt,
    ));
    ctx.emit(Inst::gen_move(tmp3, src_lo, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt.to_reg(),
        types::I64,
    ));
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftRightLogical,
        None,
        tmp3,
    ));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Xor,
        RegMemImm::reg(dst_lo.to_reg()),
        dst_lo,
    ));
    ctx.emit(Inst::test_rmi_r(
        OperandSize::Size64,
        RegMemImm::imm(127),
        amt_src,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(dst_lo.to_reg()),
        tmp3,
    ));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Or,
        RegMemImm::reg(tmp2.to_reg()),
        tmp3,
    ));
    // This isn't semantically necessary, but it keeps the
    // register allocator happy, because it cannot otherwise
    // infer that cmovz + cmovnz always defines dst_hi.
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Xor,
        RegMemImm::reg(dst_hi.to_reg()),
        dst_hi,
    ));
    ctx.emit(Inst::gen_move(amt, amt_src, types::I64));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::And,
        RegMemImm::imm(64),
        amt,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(tmp3.to_reg()),
        dst_hi,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(tmp1.to_reg()),
        dst_lo,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::NZ,
        RegMem::reg(tmp1.to_reg()),
        dst_hi,
    ));
}
fn emit_shr_i128<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    src: ValueRegs<Reg>,
    dst: ValueRegs<Writable<Reg>>,
    amt_src: Reg,
    is_signed: bool,
) {
    let src_lo = src.regs()[0];
    let src_hi = src.regs()[1];
    let dst_lo = dst.regs()[0];
    let dst_hi = dst.regs()[1];

    // mov tmp1, src_hi
    // {u,s}shr tmp1, amt_src
    // mov tmp2, src_lo
    // ushr tmp2, amt_src
    // mov amt, 64
    // sub amt, amt_src
    // mov tmp3, src_hi
    // shl tmp3, amt
    // xor dst_lo, dst_lo
    // test amt_src, 127
    // cmovz tmp3, dst_lo
    // or tmp3, tmp2
    // if is_signed:
    //     mov dst_hi, src_hi
    //     sshr dst_hi, 63 // get the sign bit
    // else:
    //     xor dst_hi, dst_hi
    // mov amt, amt_src
    // and amt, 64
    // cmovz dst_hi, tmp1
    // cmovz dst_lo, tmp3
    // cmovnz dst_lo, tmp1
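    //
    // This mirrors the 128-bit left shift above. For amt < 64 the result is
    //   dst_lo = (src_lo >> amt) | (src_hi << (64 - amt)), dst_hi = src_hi >> amt
    // (arithmetic shift for sshr); for amt >= 64 it is
    //   dst_lo = src_hi >> (amt - 64),
    // with dst_hi set to the fill value (sign bits for sshr, zero for ushr).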
    let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();

    let shift_kind = if is_signed {
        ShiftKind::ShiftRightArithmetic
    } else {
        ShiftKind::ShiftRightLogical
    };

    ctx.emit(Inst::gen_move(tmp1, src_hi, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt_src,
        types::I64,
    ));
    ctx.emit(Inst::shift_r(OperandSize::Size64, shift_kind, None, tmp1));
    ctx.emit(Inst::gen_move(tmp2, src_lo, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt_src,
        types::I64,
    ));
    // N.B.: right-shift of *lower* half is *always* unsigned (its MSB is not a sign bit).
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftRightLogical,
        None,
        tmp2,
    ));
    ctx.emit(Inst::imm(OperandSize::Size64, 64, amt));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Sub,
        RegMemImm::reg(amt_src),
        amt,
    ));
    ctx.emit(Inst::gen_move(tmp3, src_hi, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt.to_reg(),
        types::I64,
    ));
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftLeft,
        None,
        tmp3,
    ));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Xor,
        RegMemImm::reg(dst_lo.to_reg()),
        dst_lo,
    ));
    ctx.emit(Inst::test_rmi_r(
        OperandSize::Size64,
        RegMemImm::imm(127),
        amt_src,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(dst_lo.to_reg()),
        tmp3,
    ));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Or,
        RegMemImm::reg(tmp2.to_reg()),
        tmp3,
    ));
    if is_signed {
        ctx.emit(Inst::gen_move(dst_hi, src_hi, types::I64));
        ctx.emit(Inst::shift_r(
            OperandSize::Size64,
            ShiftKind::ShiftRightArithmetic,
            Some(63),
            dst_hi,
        ));
    } else {
        ctx.emit(Inst::alu_rmi_r(
            OperandSize::Size64,
            AluRmiROpcode::Xor,
            RegMemImm::reg(dst_hi.to_reg()),
            dst_hi,
        ));
    }
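    // dst_hi now holds the amt >= 64 fill value: src_hi's sign bit
    // replicated for the signed case, zero otherwise. It survives as the
    // final dst_hi only when the `cmovz dst_hi, tmp1` below is not taken.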
    // This isn't semantically necessary, but it keeps the
    // register allocator happy, because it cannot otherwise
    // infer that cmovz + cmovnz always defines dst_lo.
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Xor,
        RegMemImm::reg(dst_lo.to_reg()),
        dst_lo,
    ));
    ctx.emit(Inst::gen_move(amt, amt_src, types::I64));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::And,
        RegMemImm::imm(64),
        amt,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(tmp1.to_reg()),
        dst_hi,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(tmp3.to_reg()),
        dst_lo,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::NZ,
        RegMem::reg(tmp1.to_reg()),
        dst_lo,
    ));
}
fn make_libcall_sig<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    insn: IRInst,
@@ -1542,99 +1232,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        | Opcode::Ushr
        | Opcode::Sshr
        | Opcode::Ishl
        | Opcode::Rotl => implemented_in_isle(ctx),

        Opcode::Rotr => {
            let dst_ty = ctx.output_ty(insn, 0);
            debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);

            if !dst_ty.is_vector() && dst_ty.bits() <= 64 {
                // Scalar shifts on x86 have various encodings:
                // - shift by one bit, e.g. `SAL r/m8, 1` (not used here);
                // - shift by an immediate amount, e.g. `SAL r/m8, imm8`;
                // - shift by a dynamic amount, but only from the CL register,
                //   e.g. `SAL r/m8, CL`.
                // This implementation uses the last two encodings.
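                // The dynamic case below moves the amount into CL and leaves
                // `count` as `None`, so the emitted `shift_r` uses the
                // CL-based encoding.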
                let (size, lhs) = match dst_ty {
                    types::I8 | types::I16 => match op {
                        Opcode::Rotr => (
                            OperandSize::from_ty(dst_ty),
                            put_input_in_reg(ctx, inputs[0]),
                        ),
                        _ => unreachable!(),
                    },
                    types::I32 | types::I64 => (
                        OperandSize::from_ty(dst_ty),
                        put_input_in_reg(ctx, inputs[0]),
                    ),
                    _ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty),
                };

                let (count, rhs) =
                    if let Some(cst) = ctx.get_input_as_source_or_const(insn, 1).constant {
                        // Mask the count, according to Cranelift's semantics.
                        let cst = (cst as u8) & (dst_ty.bits() as u8 - 1);
                        (Some(cst), None)
                    } else {
                        // We can ignore the upper register if the shift amount
                        // is multi-reg, because the amount is taken mod the
                        // lhs width anyway.
                        (None, Some(put_input_in_regs(ctx, inputs[1]).regs()[0]))
                    };
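                // E.g., a constant rotr.i16 amount of 35 is masked to
                // 35 & 15 == 3.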
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let shift_kind = match op {
                    Opcode::Rotr => ShiftKind::RotateRight,
                    _ => unreachable!(),
                };

                let w_rcx = Writable::from_reg(regs::rcx());
                ctx.emit(Inst::mov_r_r(OperandSize::Size64, lhs, dst));
                if count.is_none() {
                    ctx.emit(Inst::mov_r_r(OperandSize::Size64, rhs.unwrap(), w_rcx));
                }
                ctx.emit(Inst::shift_r(size, shift_kind, count, dst));
            } else if dst_ty == types::I128 {
                let amt_src = put_input_in_regs(ctx, inputs[1]).regs()[0];
                let src = put_input_in_regs(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]);

                match op {
                    Opcode::Rotr => {
                        // (mov tmp, src)
                        // (ushr.i128 tmp, amt)
                        // (mov dst, src)
                        // (shl.i128 dst, 128-amt)
                        // (or dst, tmp)
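                        //
                        // I.e., the rotate is decomposed into the two 128-bit
                        // shift helpers above: the logical right shift
                        // contributes the bits that stay in place, the left
                        // shift by 128 - amt contributes the bits that wrap
                        // around, and the two halves are OR'd together.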
                        let tmp = ctx.alloc_tmp(types::I128);
                        emit_shr_i128(ctx, src, tmp, amt_src, /* is_signed = */ false);

                        let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                        ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt));
                        ctx.emit(Inst::alu_rmi_r(
                            OperandSize::Size64,
                            AluRmiROpcode::Sub,
                            RegMemImm::reg(amt_src),
                            inv_amt,
                        ));
                        emit_shl_i128(ctx, src, dst, inv_amt.to_reg());

                        ctx.emit(Inst::alu_rmi_r(
                            OperandSize::Size64,
                            AluRmiROpcode::Or,
                            RegMemImm::reg(tmp.regs()[0].to_reg()),
                            dst.regs()[0],
                        ));
                        ctx.emit(Inst::alu_rmi_r(
                            OperandSize::Size64,
                            AluRmiROpcode::Or,
                            RegMemImm::reg(tmp.regs()[1].to_reg()),
                            dst.regs()[1],
                        ));
                    }
                    _ => unreachable!(),
                }
            } else {
                implemented_in_isle(ctx);
            }
        }
        | Opcode::Rotl
        | Opcode::Rotr => implemented_in_isle(ctx),
        Opcode::Ineg => {
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();