aarch64: Translate rot{r,l} to ISLE (#3614)
This commit translates the existing `rotl` and `rotr` lowerings to ISLE. The port was relatively straightforward, with the biggest change being the instructions generated around i128 `rotl`/`rotr`, primarily due to register changes.
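For background on the i128 half of that change: the removed lowering below decomposes a 128-bit rotate into two 128-bit shifts whose results are OR'd together, where the second shift uses `128 - amt` as its amount. A minimal plain-Rust sketch of that identity follows (illustrative only; the function names are invented for this note and none of this code is part of the change):

/// Model of the i128 rotate decomposition used by the lowering:
/// rotl(x, amt) = (x << amt) | (x >> (128 - amt)), amount taken mod 128.
fn rotl_i128_model(x: u128, amt: u32) -> u128 {
    let amt = amt & 127;
    if amt == 0 {
        x
    } else {
        (x << amt) | (x >> (128 - amt))
    }
}

/// The mirror image for rotr: a right shift OR'd with a complementary left shift.
fn rotr_i128_model(x: u128, amt: u32) -> u128 {
    let amt = amt & 127;
    if amt == 0 {
        x
    } else {
        (x >> amt) | (x << (128 - amt))
    }
}

fn main() {
    assert_eq!(rotl_i128_model(1, 1), 2);
    assert_eq!(rotl_i128_model(1u128 << 127, 1), 1);
    assert_eq!(rotr_i128_model(1, 1), 1u128 << 127);
    assert_eq!(rotl_i128_model(0xdead, 0), 0xdead);
}

The `amt == 0` case is split out only because Rust rejects a 128-bit shift by 128; it is not meant to mirror how the generated aarch64 code handles that case.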
@@ -90,248 +90,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
         Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => implemented_in_isle(ctx),
 
-        Opcode::Rotr | Opcode::Rotl => {
-            // aarch64 doesn't have a left-rotate instruction, but a left rotation of K places is
-            // effectively a right rotation of N - K places, if N is the integer's bit size. We
-            // implement left rotations with this trick.
-            //
-            // For a 32-bit or 64-bit rotate-right, we can use the ROR instruction directly.
-            //
-            // For a < 32-bit rotate-right, we synthesize this as:
-            //
-            //    rotr rd, rn, rm
-            //
-            //       =>
-            //
-            //    zero-extend rn, <32-or-64>
-            //    and tmp_masked_rm, rm, <bitwidth - 1>
-            //    sub tmp1, tmp_masked_rm, <bitwidth>
-            //    sub tmp1, zero, tmp1 ; neg
-            //    lsr tmp2, rn, tmp_masked_rm
-            //    lsl rd, rn, tmp1
-            //    orr rd, rd, tmp2
-            //
-            // For a constant amount, we can instead do:
-            //
-            //    zero-extend rn, <32-or-64>
-            //    lsr tmp2, rn, #<shiftimm>
-            //    lsl rd, rn, <bitwidth - shiftimm>
-            //    orr rd, rd, tmp2
-
-            let is_rotl = op == Opcode::Rotl;
-
-            let ty = ty.unwrap();
-            let ty_bits_size = ty_bits(ty) as u8;
-
-            if ty.is_vector() {
-                return Err(CodegenError::Unsupported(format!(
-                    "{}: Unsupported type: {:?}",
-                    op, ty
-                )));
-            }
-
-            // TODO: We can do much better codegen if we have a constant amt
-            if ty == I128 {
-                let dst = get_output_reg(ctx, outputs[0]);
-                let src = put_input_in_regs(ctx, inputs[0]);
-                let amt_src = put_input_in_regs(ctx, inputs[1]).regs()[0];
-
-                let tmp = ctx.alloc_tmp(I128);
-                let inv_amt = ctx.alloc_tmp(I64).only_reg().unwrap();
-
-                lower_constant_u64(ctx, inv_amt, 128);
-                ctx.emit(Inst::AluRRR {
-                    alu_op: ALUOp::Sub64,
-                    rd: inv_amt,
-                    rn: inv_amt.to_reg(),
-                    rm: amt_src,
-                });
-
-                if is_rotl {
-                    // rotl
-                    // (shl.i128 tmp, amt)
-                    // (ushr.i128 dst, 128-amt)
-
-                    emit_shl_i128(ctx, src, tmp, amt_src);
-                    emit_shr_i128(
-                        ctx,
-                        src,
-                        dst,
-                        inv_amt.to_reg(),
-                        /* is_signed = */ false,
-                    );
-                } else {
-                    // rotr
-                    // (ushr.i128 tmp, amt)
-                    // (shl.i128 dst, 128-amt)
-
-                    emit_shr_i128(ctx, src, tmp, amt_src, /* is_signed = */ false);
-                    emit_shl_i128(ctx, src, dst, inv_amt.to_reg());
-                }
-
-                ctx.emit(Inst::AluRRR {
-                    alu_op: ALUOp::Orr64,
-                    rd: dst.regs()[0],
-                    rn: dst.regs()[0].to_reg(),
-                    rm: tmp.regs()[0].to_reg(),
-                });
-                ctx.emit(Inst::AluRRR {
-                    alu_op: ALUOp::Orr64,
-                    rd: dst.regs()[1],
-                    rn: dst.regs()[1].to_reg(),
-                    rm: tmp.regs()[1].to_reg(),
-                });
-
-                return Ok(());
-            }
-
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let rn = put_input_in_reg(
-                ctx,
-                inputs[0],
-                if ty_bits_size <= 32 {
-                    NarrowValueMode::ZeroExtend32
-                } else {
-                    NarrowValueMode::ZeroExtend64
-                },
-            );
-            let rm = put_input_in_reg_immshift(ctx, inputs[1], ty_bits(ty));
-
-            if ty_bits_size == 32 || ty_bits_size == 64 {
-                let alu_op = choose_32_64(ty, ALUOp::RotR32, ALUOp::RotR64);
-                match rm {
-                    ResultRegImmShift::ImmShift(mut immshift) => {
-                        if is_rotl {
-                            immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
-                        }
-                        immshift.imm &= ty_bits_size - 1;
-                        ctx.emit(Inst::AluRRImmShift {
-                            alu_op,
-                            rd,
-                            rn,
-                            immshift,
-                        });
-                    }
-
-                    ResultRegImmShift::Reg(rm) => {
-                        let rm = if is_rotl {
-                            // Really ty_bits_size - rn, but the upper bits of the result are
-                            // ignored (because of the implicit masking done by the instruction),
-                            // so this is equivalent to negating the input.
-                            let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
-                            let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
-                            ctx.emit(Inst::AluRRR {
-                                alu_op,
-                                rd: tmp,
-                                rn: zero_reg(),
-                                rm,
-                            });
-                            tmp.to_reg()
-                        } else {
-                            rm
-                        };
-                        ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
-                    }
-                }
-            } else {
-                debug_assert!(ty_bits_size < 32);
-
-                match rm {
-                    ResultRegImmShift::Reg(reg) => {
-                        let reg = if is_rotl {
-                            // Really ty_bits_size - rn, but the upper bits of the result are
-                            // ignored (because of the implicit masking done by the instruction),
-                            // so this is equivalent to negating the input.
-                            let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
-                            ctx.emit(Inst::AluRRR {
-                                alu_op: ALUOp::Sub32,
-                                rd: tmp,
-                                rn: zero_reg(),
-                                rm: reg,
-                            });
-                            tmp.to_reg()
-                        } else {
-                            reg
-                        };
-
-                        // Explicitly mask the rotation count.
-                        let tmp_masked_rm = ctx.alloc_tmp(I32).only_reg().unwrap();
-                        ctx.emit(Inst::AluRRImmLogic {
-                            alu_op: ALUOp::And32,
-                            rd: tmp_masked_rm,
-                            rn: reg,
-                            imml: ImmLogic::maybe_from_u64((ty_bits_size - 1) as u64, I32).unwrap(),
-                        });
-                        let tmp_masked_rm = tmp_masked_rm.to_reg();
-
-                        let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
-                        let tmp2 = ctx.alloc_tmp(I32).only_reg().unwrap();
-                        ctx.emit(Inst::AluRRImm12 {
-                            alu_op: ALUOp::Sub32,
-                            rd: tmp1,
-                            rn: tmp_masked_rm,
-                            imm12: Imm12::maybe_from_u64(ty_bits_size as u64).unwrap(),
-                        });
-                        ctx.emit(Inst::AluRRR {
-                            alu_op: ALUOp::Sub32,
-                            rd: tmp1,
-                            rn: zero_reg(),
-                            rm: tmp1.to_reg(),
-                        });
-                        ctx.emit(Inst::AluRRR {
-                            alu_op: ALUOp::Lsr32,
-                            rd: tmp2,
-                            rn,
-                            rm: tmp_masked_rm,
-                        });
-                        ctx.emit(Inst::AluRRR {
-                            alu_op: ALUOp::Lsl32,
-                            rd,
-                            rn,
-                            rm: tmp1.to_reg(),
-                        });
-                        ctx.emit(Inst::AluRRR {
-                            alu_op: ALUOp::Orr32,
-                            rd,
-                            rn: rd.to_reg(),
-                            rm: tmp2.to_reg(),
-                        });
-                    }
-
-                    ResultRegImmShift::ImmShift(mut immshift) => {
-                        if is_rotl {
-                            immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
-                        }
-                        immshift.imm &= ty_bits_size - 1;
-
-                        let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
-                        ctx.emit(Inst::AluRRImmShift {
-                            alu_op: ALUOp::Lsr32,
-                            rd: tmp1,
-                            rn,
-                            immshift: immshift.clone(),
-                        });
-
-                        let amount = immshift.value() & (ty_bits_size - 1);
-                        let opp_shift =
-                            ImmShift::maybe_from_u64(ty_bits_size as u64 - amount as u64).unwrap();
-                        ctx.emit(Inst::AluRRImmShift {
-                            alu_op: ALUOp::Lsl32,
-                            rd,
-                            rn,
-                            immshift: opp_shift,
-                        });
-
-                        ctx.emit(Inst::AluRRR {
-                            alu_op: ALUOp::Orr32,
-                            rd,
-                            rn: rd.to_reg(),
-                            rm: tmp1.to_reg(),
-                        });
-                    }
-                }
-            }
-        }
+        Opcode::Rotr | Opcode::Rotl => implemented_in_isle(ctx),
 
         Opcode::Bitrev | Opcode::Clz | Opcode::Cls | Opcode::Ctz => {
            let ty = ty.unwrap();
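For reference, the comment block in the removed arm above describes the two tricks the lowering is built on: a left rotation by K places is a right rotation by N - K places, and a rotate-right on a type narrower than 32 bits is synthesized from a zero-extension followed by an lsr, lsl and orr. Below is a hedged plain-Rust model of both identities; the helper names are invented for this note and are not part of Cranelift:

/// rotl expressed through rotr: a left rotation by `amt` is a right
/// rotation by `64 - amt`; masking with 63 keeps `amt == 0` safe and
/// mirrors the implicit masking ROR applies to its shift amount.
fn rotl_via_rotr_u64(x: u64, amt: u32) -> u64 {
    x.rotate_right(64u32.wrapping_sub(amt) & 63)
}

/// A rotate-right for a type narrower than 32 bits (here u8), synthesized
/// on a zero-extended value with the lsr + lsl + orr sequence from the
/// removed comments.
fn rotr_u8_model(x: u8, amt: u32) -> u8 {
    let bits = 8;
    let masked = amt & (bits - 1); // and tmp_masked_rm, rm, <bitwidth - 1>
    if masked == 0 {
        return x;
    }
    let zext = x as u32; // zero-extend rn
    let lo = zext >> masked; // lsr tmp2, rn, tmp_masked_rm
    let hi = zext << (bits - masked); // lsl rd, rn, <bitwidth - amt>
    (hi | lo) as u8 // orr rd, rd, tmp2; the cast discards the high bits
}

fn main() {
    assert_eq!(rotl_via_rotr_u64(0x8000_0000_0000_0001, 1), 0x3);
    assert_eq!(rotr_u8_model(0b1000_0001, 1), 0b1100_0000);
    assert_eq!(rotr_u8_model(0xab, 0), 0xab);
    assert_eq!(rotr_u8_model(0x01, 9), 0b1000_0000);
}

Because ROR masks its shift amount to the register width, negating the amount is enough to turn a left rotate into a right rotate for 32- and 64-bit types, which is why the removed code only needed an extra negating `sub` in the register-amount case.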