Allocate temporary intermediates when loading constants on aarch64 (#5366)

As loading constants on aarch64 can take up to 4 instructions, we need to plumb through some additional registers. Rather than pass a fixed list of registers in, pass an allocation function.
Trevor Elliott
2022-12-01 14:29:36 -08:00
committed by GitHub
parent 03715dda9d
commit d54a27d0ea
11 changed files with 158 additions and 126 deletions
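
Before the diff, a self-contained sketch of the shape of the change may help. Everything here is an illustrative stand-in, not Cranelift's real types or implementation: `Reg` and `Type` are toy versions, and the body models only the MOVZ/MOVK path (the real code can also pick MOVN-based sequences, logical immediates, or a constant pool). The point is the third parameter: the loader asks a caller-supplied callback for each intermediate register.

    #[derive(Clone, Copy, Debug)]
    struct Reg(u32);

    #[derive(Clone, Copy, Debug)]
    enum Type {
        I16,
    }

    fn load_constant(rd: Reg, value: u64, alloc_tmp: &mut dyn FnMut(Type) -> Reg) -> Vec<String> {
        // Collect the non-zero 16-bit halfwords; each one costs an instruction.
        let halfwords: Vec<(u32, u64)> = (0..4u32)
            .map(|i| (i, (value >> (16 * i)) & 0xffff))
            .filter(|&(_, hw)| hw != 0)
            .collect();
        let last = halfwords.last().map(|&(i, _)| i);
        let mut insts = Vec::new();
        let mut prev: Option<Reg> = None;
        for (i, hw) in halfwords {
            // Only the final instruction writes the requested destination; every
            // intermediate result goes to a register the callback hands out.
            let dst = if Some(i) == last { rd } else { alloc_tmp(Type::I16) };
            insts.push(match prev {
                None => format!("movz {dst:?}, #{hw:#x}, LSL #{}", 16 * i),
                Some(rn) => format!("movk {dst:?}, {rn:?}, #{hw:#x}, LSL #{}", 16 * i),
            });
            prev = Some(dst);
        }
        insts
    }

    fn main() {
        // Lowering-time caller: mint a fresh virtual register per intermediate.
        let mut next = 0;
        let mut fresh = |_: Type| {
            next += 1;
            Reg(next)
        };
        for inst in load_constant(Reg(0), 0x1234_5678_9abc_def0, &mut fresh) {
            println!("{inst}");
        }

        // Post-regalloc caller: no fresh virtual registers can be created any
        // more, so reuse one fixed scratch register for every intermediate.
        let scratch = Reg(27);
        let _ = load_constant(scratch, 0xdead_beef, &mut |_| scratch);
    }

The diff below uses exactly these two styles: lowering code passes `|ty| ctx.alloc_tmp(ty).only_reg().unwrap()`, while post-regalloc helpers such as `gen_add_imm` pass `&mut |_| scratch`.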


@@ -430,7 +430,10 @@ impl ABIMachineSpec for AArch64MachineDeps {
} else {
let scratch2 = writable_tmp2_reg();
assert_ne!(scratch2.to_reg(), from_reg);
insts.extend(Inst::load_constant(scratch2, imm.into()));
// `gen_add_imm` is only ever called after register allocation has taken place, and as a
// result it's ok to reuse the scratch2 register here. If that changes, we'll need to
// plumb through a way to allocate temporary virtual registers.
insts.extend(Inst::load_constant(scratch2, imm.into(), &mut |_| scratch2));
insts.push(Inst::AluRRRExtend {
alu_op: ALUOp::Add,
size: OperandSize::Size64,
@@ -515,7 +518,9 @@ impl ABIMachineSpec for AArch64MachineDeps {
ret.push(adj_inst);
} else {
let tmp = writable_spilltmp_reg();
let const_inst = Inst::load_constant(tmp, amount);
// `gen_sp_reg_adjust` is called after regalloc2, so it's acceptable to reuse `tmp` for
// intermediates in `load_constant`.
let const_inst = Inst::load_constant(tmp, amount, &mut |_| tmp);
let adj_inst = Inst::AluRRRExtend {
alu_op,
size: OperandSize::Size64,
@@ -673,8 +678,10 @@ impl ABIMachineSpec for AArch64MachineDeps {
// itself is not allowed to use the registers.
let start = writable_spilltmp_reg();
let end = writable_tmp2_reg();
insts.extend(Inst::load_constant(start, 0));
insts.extend(Inst::load_constant(end, frame_size.into()));
// `gen_inline_probestack` is called after regalloc2, so it's acceptable to reuse
// `start` and `end` as temporaries in `load_constant`.
insts.extend(Inst::load_constant(start, 0, &mut |_| start));
insts.extend(Inst::load_constant(end, frame_size.into(), &mut |_| end));
insts.push(Inst::StackProbeLoop {
start,
end: end.to_reg(),
@@ -1019,19 +1026,19 @@ impl ABIMachineSpec for AArch64MachineDeps {
insts
}
fn gen_memcpy(
fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
call_conv: isa::CallConv,
dst: Reg,
src: Reg,
tmp: Writable<Reg>,
_tmp2: Writable<Reg>,
size: usize,
mut alloc_tmp: F,
) -> SmallVec<[Self::I; 8]> {
let mut insts = SmallVec::new();
let arg0 = writable_xreg(0);
let arg1 = writable_xreg(1);
let arg2 = writable_xreg(2);
insts.extend(Inst::load_constant(tmp, size as u64).into_iter());
let tmp = alloc_tmp(Self::word_type());
insts.extend(Inst::load_constant(tmp, size as u64, &mut alloc_tmp));
insts.push(Inst::Call {
info: Box::new(CallInfo {
dest: ExternalName::LibCall(LibCall::Memcpy),


@@ -64,7 +64,7 @@ pub fn mem_finalize(
} else {
let tmp = writable_spilltmp_reg();
(
Inst::load_constant(tmp, off as u64),
Inst::load_constant(tmp, off as u64, &mut |_| tmp),
AMode::RegExtended {
rn: basereg,
rm: tmp.to_reg(),
@@ -3333,7 +3333,7 @@ impl MachInstEmit for Inst {
debug_assert!(rd.to_reg() != tmp2_reg());
debug_assert!(reg != tmp2_reg());
let tmp = writable_tmp2_reg();
for insn in Inst::load_constant(tmp, abs_offset).into_iter() {
for insn in Inst::load_constant(tmp, abs_offset, &mut |_| tmp).into_iter() {
insn.emit(&[], sink, emit_info, state);
}
let add = Inst::AluRRR {


@@ -130,7 +130,11 @@ fn inst_size_test() {
impl Inst {
/// Create instructions that load a constant, using one of several options (MOVZ, MOVN,
/// logical immediate, or constant pool).
pub fn load_constant(rd: Writable<Reg>, value: u64) -> SmallVec<[Inst; 4]> {
pub fn load_constant<F: FnMut(Type) -> Writable<Reg>>(
rd: Writable<Reg>,
value: u64,
alloc_tmp: &mut F,
) -> SmallVec<[Inst; 4]> {
// NB: this is duplicated in `lower/isle.rs` and `inst.isle` right now; if
// modifications are made here before this code is deleted in the move to
// ISLE, those locations should be updated as well.
@@ -169,23 +173,40 @@ impl Inst {
} else {
(4, OperandSize::Size64, !value)
};
// If the number of 0xffff half words is greater than the number of 0x0000 half words,
// it is more efficient to use `movn` for the first instruction.
let first_is_inverted = count_zero_half_words(negated, num_half_words)
> count_zero_half_words(value, num_half_words);
// Either 0xffff or 0x0000 half words can be skipped, depending on the first
// instruction used.
let ignored_halfword = if first_is_inverted { 0xffff } else { 0 };
let mut first_mov_emitted = false;
for i in 0..num_half_words {
let halfwords: SmallVec<[_; 4]> = (0..num_half_words)
.filter_map(|i| {
let imm16 = (value >> (16 * i)) & 0xffff;
if imm16 != ignored_halfword {
if !first_mov_emitted {
first_mov_emitted = true;
if imm16 == ignored_halfword {
None
} else {
Some((i, imm16))
}
})
.collect();
let mut prev_result = None;
let last_index = halfwords.last().unwrap().0;
for (i, imm16) in halfwords {
let shift = i * 16;
let rd = if i == last_index { rd } else { alloc_tmp(I16) };
if let Some(rn) = prev_result {
let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
insts.push(Inst::MovK { rd, rn, imm, size });
} else {
if first_is_inverted {
let imm =
MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, i * 16)
MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, shift)
.unwrap();
insts.push(Inst::MovWide {
op: MoveWideOp::MovN,
@@ -194,8 +215,7 @@ impl Inst {
size,
});
} else {
let imm =
MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
insts.push(Inst::MovWide {
op: MoveWideOp::MovZ,
rd,
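As a worked example of the `first_is_inverted` heuristic above, here is a small runnable sketch with `count_zero_half_words` re-implemented for illustration (the real helper lives in this file; the signature here is an approximation):

    fn count_zero_half_words(mut value: u64, num_half_words: u32) -> usize {
        let mut count = 0;
        for _ in 0..num_half_words {
            if value & 0xffff == 0 {
                count += 1;
            }
            value >>= 16;
        }
        count
    }

    fn main() {
        // Halfwords of 0xffff_ffff_0000_1234 are [0x1234, 0x0000, 0xffff, 0xffff]:
        // two are 0xffff and only one is 0x0000, so a MOVN-based sequence wins.
        let value: u64 = 0xffff_ffff_0000_1234;
        assert!(count_zero_half_words(!value, 4) > count_zero_half_words(value, 4));
        // movn rd, #0xedcb          ; rd = 0xffff_ffff_ffff_1234 (!0x1234 = 0xedcb)
        // movk rd, #0x0000, LSL #16 ; rd = 0xffff_ffff_0000_1234
        // Two instructions; a MOVZ-based sequence would need three (movz + two movk).
    }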
@@ -203,26 +223,23 @@ impl Inst {
size,
});
}
} else {
let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
insts.push(Inst::MovK {
rd,
rn: rd.to_reg(), // Redef the same virtual register.
imm,
size,
});
}
}
}
assert!(first_mov_emitted);
prev_result = Some(rd.to_reg());
}
assert!(prev_result.is_some());
insts
}
}
/// Create instructions that load a 128-bit constant.
pub fn load_constant128(to_regs: ValueRegs<Writable<Reg>>, value: u128) -> SmallVec<[Inst; 4]> {
pub fn load_constant128<F: FnMut(Type) -> Writable<Reg>>(
to_regs: ValueRegs<Writable<Reg>>,
value: u128,
mut alloc_tmp: F,
) -> SmallVec<[Inst; 4]> {
assert_eq!(to_regs.len(), 2, "Expected to load i128 into two registers");
let lower = value as u64;
@@ -231,8 +248,8 @@ impl Inst {
let lower_reg = to_regs.regs()[0];
let upper_reg = to_regs.regs()[1];
let mut load_ins = Inst::load_constant(lower_reg, lower);
let load_upper = Inst::load_constant(upper_reg, upper);
let mut load_ins = Inst::load_constant(lower_reg, lower, &mut alloc_tmp);
let load_upper = Inst::load_constant(upper_reg, upper, &mut alloc_tmp);
load_ins.extend(load_upper.into_iter());
load_ins
@@ -264,7 +281,7 @@ impl Inst {
}]
} else {
let tmp = alloc_tmp(I32);
let mut insts = Inst::load_constant(tmp, const_data as u64);
let mut insts = Inst::load_constant(tmp, const_data as u64, &mut alloc_tmp);
insts.push(Inst::MovToFpu {
rd,
@@ -304,7 +321,7 @@ impl Inst {
Inst::load_fp_constant32(rd, const_data, alloc_tmp)
} else if const_data & (u32::MAX as u64) == 0 {
let tmp = alloc_tmp(I64);
let mut insts = Inst::load_constant(tmp, const_data);
let mut insts = Inst::load_constant(tmp, const_data, &mut alloc_tmp);
insts.push(Inst::MovToFpu {
rd,
@@ -426,7 +443,7 @@ impl Inst {
smallvec![Inst::VecDupFPImm { rd, imm, size }]
} else {
let tmp = alloc_tmp(I64);
let mut insts = SmallVec::from(&Inst::load_constant(tmp, pattern)[..]);
let mut insts = SmallVec::from(&Inst::load_constant(tmp, pattern, &mut alloc_tmp)[..]);
insts.push(Inst::VecDup {
rd,
@@ -1212,14 +1229,16 @@ impl MachInst for Inst {
to_regs: ValueRegs<Writable<Reg>>,
value: u128,
ty: Type,
alloc_tmp: F,
mut alloc_tmp: F,
) -> SmallVec<[Inst; 4]> {
let to_reg = to_regs.only_reg();
match ty {
F64 => Inst::load_fp_constant64(to_reg.unwrap(), value as u64, alloc_tmp),
F32 => Inst::load_fp_constant32(to_reg.unwrap(), value as u32, alloc_tmp),
I8 | I16 | I32 | I64 | R32 | R64 => Inst::load_constant(to_reg.unwrap(), value as u64),
I128 => Inst::load_constant128(to_regs, value),
I8 | I16 | I32 | I64 | R32 | R64 => {
Inst::load_constant(to_reg.unwrap(), value as u64, &mut alloc_tmp)
}
I128 => Inst::load_constant128(to_regs, value, alloc_tmp),
_ => panic!("Cannot generate constant for type: {}", ty),
}
}
@@ -2837,7 +2856,7 @@ impl Inst {
);
} else {
let tmp = writable_spilltmp_reg();
for inst in Inst::load_constant(tmp, abs_offset).into_iter() {
for inst in Inst::load_constant(tmp, abs_offset, &mut |_| tmp).into_iter() {
ret.push_str(
&inst.print_with_state(&mut EmitState::default(), &mut empty_allocs),
);


@@ -575,7 +575,7 @@ fn lower_add_immediate(ctx: &mut Lower<Inst>, dst: Writable<Reg>, src: Reg, imm:
}
pub(crate) fn lower_constant_u64(ctx: &mut Lower<Inst>, rd: Writable<Reg>, value: u64) {
for inst in Inst::load_constant(rd, value) {
for inst in Inst::load_constant(rd, value, &mut |ty| ctx.alloc_tmp(ty).only_reg().unwrap()) {
ctx.emit(inst);
}
}


@@ -3,6 +3,7 @@
// Pull in the ISLE generated code.
pub mod generated_code;
use generated_code::Context;
use smallvec::SmallVec;
// Types that the generated ISLE code uses via `use super::*`.
use super::{
@@ -217,7 +218,6 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
} else {
value
};
let rd = self.temp_writable_reg(I64);
let size = OperandSize::Size64;
// If the top 32 bits are zero, use 32-bit `mov` operations.
@@ -226,6 +226,7 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
let lower_halfword = value as u16;
let upper_halfword = (value >> 16) as u16;
let rd = self.temp_writable_reg(I64);
if upper_halfword == u16::MAX {
self.emit(&MInst::MovWide {
op: MoveWideOp::MovN,
@@ -242,17 +243,20 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
});
if upper_halfword != 0 {
let tmp = self.temp_writable_reg(I64);
self.emit(&MInst::MovK {
rd,
rd: tmp,
rn: rd.to_reg(),
imm: MoveWideConst::maybe_with_shift(upper_halfword, 16).unwrap(),
size,
});
return tmp.to_reg();
}
}
};
return rd.to_reg();
} else if value == u64::MAX {
let rd = self.temp_writable_reg(I64);
self.emit(&MInst::MovWide {
op: MoveWideOp::MovN,
rd,
@@ -265,20 +269,34 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
// If the number of 0xffff half words is greater than the number of 0x0000 half words,
// it is more efficient to use `movn` for the first instruction.
let first_is_inverted = count_zero_half_words(!value) > count_zero_half_words(value);
// Either 0xffff or 0x0000 half words can be skipped, depending on the first
// instruction used.
let ignored_halfword = if first_is_inverted { 0xffff } else { 0 };
let mut first_mov_emitted = false;
for i in 0..4 {
let halfwords: SmallVec<[_; 4]> = (0..4)
.filter_map(|i| {
let imm16 = (value >> (16 * i)) & 0xffff;
if imm16 != ignored_halfword {
if !first_mov_emitted {
first_mov_emitted = true;
if imm16 == ignored_halfword {
None
} else {
Some((i, imm16))
}
})
.collect();
let mut prev_result = None;
for (i, imm16) in halfwords {
let shift = i * 16;
let rd = self.temp_writable_reg(I64);
if let Some(rn) = prev_result {
let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
self.emit(&MInst::MovK { rd, rn, imm, size });
} else {
if first_is_inverted {
let imm =
MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, i * 16)
.unwrap();
MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, shift).unwrap();
self.emit(&MInst::MovWide {
op: MoveWideOp::MovN,
rd,
@@ -286,7 +304,7 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
size,
});
} else {
let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
self.emit(&MInst::MovWide {
op: MoveWideOp::MovZ,
rd,
@@ -294,21 +312,14 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
size,
});
}
} else {
let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
self.emit(&MInst::MovK {
rd,
rn: rd.to_reg(),
imm,
size,
});
}
}
}
assert!(first_mov_emitted);
prev_result = Some(rd.to_reg());
}
return self.writable_reg_to_reg(rd);
assert!(prev_result.is_some());
return prev_result.unwrap();
fn count_zero_half_words(mut value: u64) -> usize {
let mut count = 0;
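
The ISLE helper's restructuring also changes what it returns: instead of repeatedly redefining one `rd` (the removed `rn: rd.to_reg()` MovK), it threads `prev_result` through freshly minted temps and returns the last one, presumably so each virtual register is defined exactly once for regalloc2. A toy model of that chaining, with all names invented:

    fn lower_constant(
        insts: &mut Vec<String>,
        mut fresh_temp: impl FnMut() -> u32,
        halfwords: &[(u32, u16)],
    ) -> u32 {
        let mut prev_result: Option<u32> = None;
        for &(i, imm16) in halfwords {
            let rd = fresh_temp(); // a new virtual register per instruction
            match prev_result {
                None => insts.push(format!("movz v{rd}, #{imm16:#x}, LSL #{}", 16 * i)),
                Some(rn) => insts.push(format!("movk v{rd}, v{rn}, #{imm16:#x}, LSL #{}", 16 * i)),
            }
            prev_result = Some(rd);
        }
        prev_result.expect("constant has at least one non-ignored halfword")
    }

    fn main() {
        let mut insts = Vec::new();
        let mut next = 0;
        // 0x0011_1111 decomposes into halfwords 0x1111 (shift 0) and 0x0011 (shift 16).
        let result = lower_constant(&mut insts, || { next += 1; next }, &[(0, 0x1111), (1, 0x0011)]);
        assert_eq!(result, 2); // the helper returns the last temp, not a fixed rd
        insts.iter().for_each(|i| println!("{i}"));
    }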


@@ -525,18 +525,18 @@ impl ABIMachineSpec for Riscv64MachineDeps {
insts
}
fn gen_memcpy(
fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
call_conv: isa::CallConv,
dst: Reg,
src: Reg,
tmp: Writable<Reg>,
_tmp2: Writable<Reg>,
size: usize,
mut alloc_tmp: F,
) -> SmallVec<[Self::I; 8]> {
let mut insts = SmallVec::new();
let arg0 = Writable::from_reg(x_reg(10));
let arg1 = Writable::from_reg(x_reg(11));
let arg2 = Writable::from_reg(x_reg(12));
let tmp = alloc_tmp(Self::word_type());
insts.extend(Inst::load_constant_u64(tmp, size as u64).into_iter());
insts.push(Inst::Call {
info: Box::new(CallInfo {


@@ -752,13 +752,12 @@ impl ABIMachineSpec for S390xMachineDeps {
unreachable!();
}
fn gen_memcpy(
fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
_call_conv: isa::CallConv,
_dst: Reg,
_src: Reg,
_tmp1: Writable<Reg>,
_tmp2: Writable<Reg>,
_size: usize,
_alloc: F,
) -> SmallVec<[Self::I; 8]> {
unimplemented!("StructArgs not implemented for S390X yet");
}


@@ -624,18 +624,19 @@ impl ABIMachineSpec for X64ABIMachineSpec {
insts
}
fn gen_memcpy(
fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
call_conv: isa::CallConv,
dst: Reg,
src: Reg,
temp: Writable<Reg>,
temp2: Writable<Reg>,
size: usize,
mut alloc_tmp: F,
) -> SmallVec<[Self::I; 8]> {
let mut insts = SmallVec::new();
let arg0 = get_intreg_for_arg(&call_conv, 0, 0).unwrap();
let arg1 = get_intreg_for_arg(&call_conv, 1, 1).unwrap();
let arg2 = get_intreg_for_arg(&call_conv, 2, 2).unwrap();
let temp = alloc_tmp(Self::word_type());
let temp2 = alloc_tmp(Self::word_type());
insts.extend(
Inst::gen_constant(ValueRegs::one(temp), size as u128, I64, |_| {
panic!("tmp should not be needed")


@@ -567,16 +567,14 @@ pub trait ABIMachineSpec {
) -> SmallVec<[Self::I; 2]>;
/// Generate a memcpy invocation. Used to set up struct
/// args. Takes `src`, `dst` as read-only inputs and requires two
/// temporaries to generate the call (for the size immediate and
/// possibly for the address of `memcpy` itself).
fn gen_memcpy(
/// args. Takes `src`, `dst` as read-only inputs and passes a temporary
/// allocator.
fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
call_conv: isa::CallConv,
dst: Reg,
src: Reg,
tmp1: Writable<Reg>,
tmp2: Writable<Reg>,
size: usize,
alloc_tmp: F,
) -> SmallVec<[Self::I; 8]>;
/// Get the number of spillslots required for the given register-class.
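The bound `F: FnMut(Type) -> Writable<Reg>` (rather than `Fn`) is what lets one `gen_memcpy` call mint several distinct temporaries: the allocator mutates lowering state on every call. A toy illustration, with invented names standing in for the real types:

    fn gen_memcpy_like<F: FnMut(&'static str) -> u32>(mut alloc_tmp: F) -> (u32, u32) {
        // e.g. x64 below allocates two word-sized temps; aarch64 and riscv64 one.
        (alloc_tmp("word"), alloc_tmp("word"))
    }

    fn main() {
        let mut next_vreg = 0;
        let (t1, t2) = gen_memcpy_like(|_ty| {
            next_vreg += 1;
            next_vreg
        });
        assert_ne!(t1, t2); // each call yields a distinct virtual register
    }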
@@ -2152,15 +2150,12 @@ impl<M: ABIMachineSpec> Caller<M> {
// arg regs.
let memcpy_call_conv =
isa::CallConv::for_libcall(&self.flags, ctx.sigs()[self.sig].call_conv);
let tmp1 = ctx.alloc_tmp(M::word_type()).only_reg().unwrap();
let tmp2 = ctx.alloc_tmp(M::word_type()).only_reg().unwrap();
for insn in M::gen_memcpy(
memcpy_call_conv,
dst_ptr.to_reg(),
src_ptr,
tmp1,
tmp2,
size as usize,
|ty| ctx.alloc_tmp(ty).only_reg().unwrap(),
)
.into_iter()
{


@@ -37,9 +37,9 @@ block0(v0: i32):
}
; block0:
; movz w2, #4369
; movk w2, w2, #17, LSL #16
; subs wzr, w0, w2
; movz w3, #4369
; movk w3, w3, #17, LSL #16
; subs wzr, w0, w3
; cset x0, hs
; ret
@@ -51,9 +51,9 @@ block0(v0: i32):
}
; block0:
; movz w2, #4368
; movk w2, w2, #17, LSL #16
; subs wzr, w0, w2
; movz w3, #4368
; movk w3, w3, #17, LSL #16
; subs wzr, w0, w3
; cset x0, hs
; ret
@@ -89,9 +89,9 @@ block0(v0: i32):
}
; block0:
; movz w2, #4369
; movk w2, w2, #17, LSL #16
; subs wzr, w0, w2
; movz w3, #4369
; movk w3, w3, #17, LSL #16
; subs wzr, w0, w3
; cset x0, ge
; ret
@@ -103,9 +103,9 @@ block0(v0: i32):
}
; block0:
; movz w2, #4368
; movk w2, w2, #17, LSL #16
; subs wzr, w0, w2
; movz w3, #4368
; movk w3, w3, #17, LSL #16
; subs wzr, w0, w3
; cset x0, ge
; ret


@@ -14,11 +14,11 @@ block0(v0: i8x16):
; movk x5, x5, #8208, LSL #32
; movk x5, x5, #32832, LSL #48
; dup v16.2d, x5
; and v19.16b, v2.16b, v16.16b
; ext v21.16b, v19.16b, v19.16b, #8
; zip1 v23.16b, v19.16b, v21.16b
; addv h25, v23.8h
; umov w0, v25.h[0]
; and v22.16b, v2.16b, v16.16b
; ext v24.16b, v22.16b, v22.16b, #8
; zip1 v26.16b, v22.16b, v24.16b
; addv h28, v26.8h
; umov w0, v28.h[0]
; ret
function %f2(i8x16) -> i16 {
@@ -34,11 +34,11 @@ block0(v0: i8x16):
; movk x5, x5, #8208, LSL #32
; movk x5, x5, #32832, LSL #48
; dup v16.2d, x5
; and v19.16b, v2.16b, v16.16b
; ext v21.16b, v19.16b, v19.16b, #8
; zip1 v23.16b, v19.16b, v21.16b
; addv h25, v23.8h
; umov w0, v25.h[0]
; and v22.16b, v2.16b, v16.16b
; ext v24.16b, v22.16b, v22.16b, #8
; zip1 v26.16b, v22.16b, v24.16b
; addv h28, v26.8h
; umov w0, v28.h[0]
; ret
function %f3(i16x8) -> i8 {