aarch64: Translate rot{r,l} to ISLE (#3614)

This commit translates the existing `rotl` and `rotr` lowerings to
ISLE. The port was relatively straightforward, with the biggest change
being the instructions generated around i128 rotl/rotr, primarily due
to register changes.
Alex Crichton
2021-12-17 12:37:17 -06:00
committed by GitHub
parent d8974ce6bc
commit e94ebc2263
9 changed files with 610 additions and 519 deletions
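For reference before the diff: the i128 lowering (in the removed Rust below, and preserved in shape by the ISLE port) builds a rotate from a 128-bit shift left, a 128-bit logical shift right by `128 - amt`, and a pair of `orr`s over the two register halves. The following is a minimal sketch of that decomposition in plain Rust, using `u128` arithmetic instead of the backend's 64-bit register pairs; the function name is illustrative and not part of Cranelift.

```rust
/// Illustrative sketch only: the i128 rotate-left decomposition used by the
/// lowering, written with u128 arithmetic rather than 64-bit register pairs.
fn rotl_i128_sketch(x: u128, amt: u32) -> u128 {
    // The 128-bit shift helpers only consume the low 7 bits of the amount,
    // which `& 127` models here.
    let hi = x.wrapping_shl(amt & 127);                       // (shl.i128 tmp, amt)
    let lo = x.wrapping_shr(128u32.wrapping_sub(amt) & 127);  // (ushr.i128 dst, 128 - amt)
    hi | lo                                                   // orr of both halves
}

fn main() {
    let x = 0x0123_4567_89ab_cdef_0011_2233_4455_6677u128;
    for amt in [0, 1, 63, 64, 127, 200] {
        assert_eq!(rotl_i128_sketch(x, amt), x.rotate_left(amt));
    }
}
```

The rotr case is the mirror image: shift right by `amt` and left by `128 - amt`, as the comments in the removed code note.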

@@ -90,248 +90,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => implemented_in_isle(ctx),
Opcode::Rotr | Opcode::Rotl => {
// aarch64 doesn't have a left-rotate instruction, but a left rotation of K places is
// effectively a right rotation of N - K places, if N is the integer's bit size. We
// implement left rotations with this trick.
//
// For a 32-bit or 64-bit rotate-right, we can use the ROR instruction directly.
//
// For a < 32-bit rotate-right, we synthesize this as:
//
// rotr rd, rn, rm
//
// =>
//
// zero-extend rn, <32-or-64>
// and tmp_masked_rm, rm, <bitwidth - 1>
// sub tmp1, tmp_masked_rm, <bitwidth>
// sub tmp1, zero, tmp1 ; neg
// lsr tmp2, rn, tmp_masked_rm
// lsl rd, rn, tmp1
// orr rd, rd, tmp2
//
// For a constant amount, we can instead do:
//
// zero-extend rn, <32-or-64>
// lsr tmp2, rn, #<shiftimm>
// lsl rd, rn, <bitwidth - shiftimm>
// orr rd, rd, tmp2
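// Worked example (illustrative, not from the original source): an i8
// rotate-right of rn = 0b1010_0011 by the constant #3 becomes:
//
// lsr tmp2, rn, #3 ; tmp2 = 0b0001_0100
// lsl rd, rn, #5 ; low 8 bits of rd = 0b0110_0000
// orr rd, rd, tmp2 ; low 8 bits of rd = 0b0111_0100 = rotr(0b1010_0011, 3)
//
// Bits above the low 8 may hold garbage, which this backend permits for
// narrow values; consumers request an extend (NarrowValueMode) when they
// need the upper bits.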
let is_rotl = op == Opcode::Rotl;
let ty = ty.unwrap();
let ty_bits_size = ty_bits(ty) as u8;
if ty.is_vector() {
return Err(CodegenError::Unsupported(format!(
"{}: Unsupported type: {:?}",
op, ty
)));
}
// TODO: We can do much better codegen if we have a constant amt
if ty == I128 {
let dst = get_output_reg(ctx, outputs[0]);
let src = put_input_in_regs(ctx, inputs[0]);
let amt_src = put_input_in_regs(ctx, inputs[1]).regs()[0];
let tmp = ctx.alloc_tmp(I128);
let inv_amt = ctx.alloc_tmp(I64).only_reg().unwrap();
lower_constant_u64(ctx, inv_amt, 128);
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Sub64,
rd: inv_amt,
rn: inv_amt.to_reg(),
rm: amt_src,
});
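// inv_amt now holds 128 - amt, the complementary amount fed to the
// opposite 128-bit shift below.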
if is_rotl {
// rotl
// (shl.i128 tmp, amt)
// (ushr.i128 dst, 128-amt)
emit_shl_i128(ctx, src, tmp, amt_src);
emit_shr_i128(
ctx,
src,
dst,
inv_amt.to_reg(),
/* is_signed = */ false,
);
} else {
// rotr
// (ushr.i128 tmp, amt)
// (shl.i128 dst, 128-amt)
emit_shr_i128(ctx, src, tmp, amt_src, /* is_signed = */ false);
emit_shl_i128(ctx, src, dst, inv_amt.to_reg());
}
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Orr64,
rd: dst.regs()[0],
rn: dst.regs()[0].to_reg(),
rm: tmp.regs()[0].to_reg(),
});
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Orr64,
rd: dst.regs()[1],
rn: dst.regs()[1].to_reg(),
rm: tmp.regs()[1].to_reg(),
});
return Ok(());
}
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let rn = put_input_in_reg(
ctx,
inputs[0],
if ty_bits_size <= 32 {
NarrowValueMode::ZeroExtend32
} else {
NarrowValueMode::ZeroExtend64
},
);
let rm = put_input_in_reg_immshift(ctx, inputs[1], ty_bits(ty));
if ty_bits_size == 32 || ty_bits_size == 64 {
let alu_op = choose_32_64(ty, ALUOp::RotR32, ALUOp::RotR64);
match rm {
ResultRegImmShift::ImmShift(mut immshift) => {
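// A left rotation by k is a right rotation by bits - k; the mask below
// keeps the immediate in range and folds k == 0 back to ror #0.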
if is_rotl {
immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
}
immshift.imm &= ty_bits_size - 1;
ctx.emit(Inst::AluRRImmShift {
alu_op,
rd,
rn,
immshift,
});
}
ResultRegImmShift::Reg(rm) => {
let rm = if is_rotl {
// Really ty_bits_size - rn, but the upper bits of the result are
// ignored (because of the implicit masking done by the instruction),
// so this is equivalent to negating the input.
let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
ctx.emit(Inst::AluRRR {
alu_op,
rd: tmp,
rn: zero_reg(),
rm,
});
tmp.to_reg()
} else {
rm
};
ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
}
}
} else {
debug_assert!(ty_bits_size < 32);
match rm {
ResultRegImmShift::Reg(reg) => {
let reg = if is_rotl {
// Really ty_bits_size - rn, but the upper bits of the result are
// ignored (because of the implicit masking done by the instruction),
// so this is equivalent to negating the input.
let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Sub32,
rd: tmp,
rn: zero_reg(),
rm: reg,
});
tmp.to_reg()
} else {
reg
};
// Explicitly mask the rotation count.
let tmp_masked_rm = ctx.alloc_tmp(I32).only_reg().unwrap();
ctx.emit(Inst::AluRRImmLogic {
alu_op: ALUOp::And32,
rd: tmp_masked_rm,
rn: reg,
imml: ImmLogic::maybe_from_u64((ty_bits_size - 1) as u64, I32).unwrap(),
});
let tmp_masked_rm = tmp_masked_rm.to_reg();
let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
let tmp2 = ctx.alloc_tmp(I32).only_reg().unwrap();
ctx.emit(Inst::AluRRImm12 {
alu_op: ALUOp::Sub32,
rd: tmp1,
rn: tmp_masked_rm,
imm12: Imm12::maybe_from_u64(ty_bits_size as u64).unwrap(),
});
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Sub32,
rd: tmp1,
rn: zero_reg(),
rm: tmp1.to_reg(),
});
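// tmp1 = bits - tmp_masked_rm, i.e. the complementary shift amount,
// built as a subtract followed by a negate.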
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Lsr32,
rd: tmp2,
rn,
rm: tmp_masked_rm,
});
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Lsl32,
rd,
rn,
rm: tmp1.to_reg(),
});
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Orr32,
rd,
rn: rd.to_reg(),
rm: tmp2.to_reg(),
});
}
ResultRegImmShift::ImmShift(mut immshift) => {
if is_rotl {
immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
}
immshift.imm &= ty_bits_size - 1;
let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsr32,
rd: tmp1,
rn,
immshift: immshift.clone(),
});
let amount = immshift.value() & (ty_bits_size - 1);
let opp_shift =
ImmShift::maybe_from_u64(ty_bits_size as u64 - amount as u64).unwrap();
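// opp_shift = bits - amount: the left shift that brings the bits rotated
// off the low end back to the top.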
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsl32,
rd,
rn,
immshift: opp_shift,
});
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Orr32,
rd,
rn: rd.to_reg(),
rm: tmp1.to_reg(),
});
}
}
}
}
Opcode::Rotr | Opcode::Rotl => implemented_in_isle(ctx),
Opcode::Bitrev | Opcode::Clz | Opcode::Cls | Opcode::Ctz => {
let ty = ty.unwrap();