Handle spilling i128 arguments into the stack in aarch64

Author: Afonso Bordado
Date: 2021-05-12 12:51:45 +01:00
parent ac624da8d9
commit fbcfffdeab
3 changed files with 184 additions and 42 deletions

View File

@@ -187,7 +187,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
  let is_baldrdash = call_conv.extends_baldrdash();
  let has_baldrdash_tls = call_conv == isa::CallConv::Baldrdash2020;

- // See AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), sections 5.4.
+ // See AArch64 ABI (https://github.com/ARM-software/abi-aa/blob/2021Q1/aapcs64/aapcs64.rst#64parameter-passing), sections 6.4.
  //
  // MacOS aarch64 is slightly different, see also
  // https://developer.apple.com/documentation/xcode/writing_arm64_code_for_apple_platforms.
@@ -265,7 +265,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
      param.value_type
  );
- let (rcs, _) = Inst::rc_for_type(param.value_type)?;
+ let (rcs, reg_types) = Inst::rc_for_type(param.value_type)?;
  if let Some(param) = try_fill_baldrdash_reg(call_conv, param) {
      assert!(rcs[0] == RegClass::I64);
@@ -288,7 +288,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
  // Handle multi register params
  //
- // See AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), (Section 5.4 Stage C).
+ // See AArch64 ABI (https://github.com/ARM-software/abi-aa/blob/2021Q1/aapcs64/aapcs64.rst#642parameter-passing-rules), (Section 6.4.2 Stage C).
  //
  // For arguments with an alignment of 16 we round up the register number
  // to the next even value. So we can never allocate for example an i128
@@ -301,7 +301,11 @@ impl ABIMachineSpec for AArch64MachineDeps {
  // restriction of passing the lower half in Xn and the upper half in Xn+1
  // (Stage C.9)
  //
- // For examples of how llvm handles this: https://godbolt.org/z/bhd3vvEfh
+ // For examples of how LLVM handles this: https://godbolt.org/z/bhd3vvEfh
+ //
+ // On the Apple ABI it is unspecified whether we can spill half the value onto the stack,
+ // i.e. pass the lower half in x7 and the upper half on the stack.
+ // LLVM does not seem to do this, so we replicate that behaviour.
  let is_multi_reg = rcs.len() >= 2;
  if is_multi_reg {
      assert!(
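To make the rounding rule above concrete, here is a minimal standalone sketch in plain Rust (the function and parameter names are illustrative, not the actual Cranelift implementation) of how the starting register for a 16-byte-aligned argument such as i128 is chosen:

    // Returns the register pair (Xn, Xn+1) used for an i128 argument, or None if
    // the whole value must be spilled to the stack. `next_xreg` is the index of
    // the next free integer argument register (x0..=x7).
    fn i128_reg_pair(next_xreg: usize, is_apple: bool) -> Option<(usize, usize)> {
        // Standard AAPCS64: round the register number up to an even value, so the
        // pair is always (Xn, Xn+1) with n even. The Apple ABI skips this rounding.
        let start = if is_apple { next_xreg } else { (next_xreg + 1) & !1 };
        // Both halves must fit in x0..x7; splitting the value between x7 and the
        // stack is never done, matching LLVM's behaviour.
        if start + 1 <= 7 {
            Some((start, start + 1))
        } else {
            None
        }
    }

With x0 already taken (next_xreg = 1), the standard ABI yields x2/x3 and the Apple ABI yields x1/x2, matching the %f12 and %f13 tests below; with only x7 free (next_xreg = 7), both ABIs spill the whole i128, as the %f14 and %f15 tests check.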
@@ -348,10 +352,8 @@ impl ABIMachineSpec for AArch64MachineDeps {
      remaining_reg_vals -= 2;
      continue;
  }
- }
+ } else {
  // Single Register parameters
- if !is_multi_reg {
  let rc = rcs[0];
  let next_reg = match rc {
      RegClass::I64 => &mut next_xreg,
@@ -400,12 +402,28 @@ impl ABIMachineSpec for AArch64MachineDeps {
  debug_assert!(size.is_power_of_two());
  next_stack = align_to(next_stack, size);
- ret.push(ABIArg::stack(
-     next_stack as i64,
-     param.value_type,
-     param.extension,
-     param.purpose,
- ));
+ let slots = reg_types
+     .iter()
+     .copied()
+     // Build the stack locations from each slot
+     .scan(next_stack, |next_stack, ty| {
+         let slot_offset = *next_stack as i64;
+         *next_stack += (ty_bits(ty) / 8) as u64;
+         Some((ty, slot_offset))
+     })
+     .map(|(ty, offset)| ABIArgSlot::Stack {
+         offset,
+         ty,
+         extension: param.extension,
+     })
+     .collect();
+ ret.push(ABIArg::Slots {
+     slots,
+     purpose: param.purpose,
+ });
  next_stack += size;
  }
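The slot construction above walks the per-register types of the value and gives each half its own stack offset. A standalone illustration of the same scan pattern in plain Rust (simplified to byte sizes and offsets, not the actual ABIArgSlot types):

    // Given the starting stack offset and the byte size of each half of the value,
    // produce one offset per slot, advancing the cursor by the slot size each time.
    fn slot_offsets(next_stack: u64, half_sizes: &[u64]) -> Vec<i64> {
        half_sizes
            .iter()
            .scan(next_stack, |next_stack, size| {
                let offset = *next_stack as i64;
                *next_stack += *size;
                Some(offset)
            })
            .collect()
    }

    fn main() {
        // An i128 spilled at the start of the outgoing argument area is split into
        // two i64 slots at offsets 0 and 8, which is what the %f14_call test below
        // materializes with `stur x7, [sp]` and `stur x8, [sp, #8]`.
        assert_eq!(slot_offsets(0, &[8, 8]), vec![0, 8]);
    }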

View File

@@ -158,18 +158,18 @@ impl NarrowValueMode {
      }
  }

- /// Emits instruction(s) to generate the given 64-bit constant value into a newly-allocated
- /// temporary register, returning that register.
- fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u64) -> ValueRegs<Reg> {
+ /// Emits instruction(s) to generate the given constant value into newly-allocated
+ /// temporary registers, returning these registers.
+ fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u128) -> ValueRegs<Reg> {
      let from_bits = ty_bits(ty);
-     let masked = if from_bits < 64 {
-         c & ((1u64 << from_bits) - 1)
+     let masked = if from_bits < 128 {
+         c & ((1u128 << from_bits) - 1)
      } else {
          c
      };
      let cst_copy = ctx.alloc_tmp(ty);
-     for inst in Inst::gen_constant(cst_copy, masked as u128, ty, |ty| {
+     for inst in Inst::gen_constant(cst_copy, masked, ty, |ty| {
          ctx.alloc_tmp(ty).only_reg().unwrap()
      })
      .into_iter()
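A quick standalone illustration of the widened masking above, in plain Rust (detached from the LowerCtx API): constants narrower than 128 bits are truncated to their type's width before being materialized, while a full 128-bit constant passes through unchanged.

    fn mask_to_width(c: u128, from_bits: u32) -> u128 {
        if from_bits < 128 {
            c & ((1u128 << from_bits) - 1)
        } else {
            c
        }
    }

    fn main() {
        // An i32 constant keeps only its low 32 bits...
        assert_eq!(mask_to_width(0x1_2345_6789, 32), 0x2345_6789);
        // ...while an i128 constant is left untouched.
        assert_eq!(mask_to_width(u128::MAX, 128), u128::MAX);
    }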
@@ -181,7 +181,7 @@ fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u64) -> Va
  /// Extends a register according to `narrow_mode`.
  /// If extended, the value is always extended to 64 bits, for simplicity.
- fn narrow_reg<C: LowerCtx<I = Inst>>(
+ fn extend_reg<C: LowerCtx<I = Inst>>(
      ctx: &mut C,
      ty: Type,
      in_reg: Reg,
@@ -252,6 +252,26 @@ fn narrow_reg<C: LowerCtx<I = Inst>>(
      }
  }

+ /// Lowers an instruction input to multiple regs
+ fn lower_input_to_regs<C: LowerCtx<I = Inst>>(
+     ctx: &mut C,
+     input: InsnInput,
+ ) -> (ValueRegs<Reg>, Type, bool) {
+     debug!("lower_input_to_regs: input {:?}", input);
+     let ty = ctx.input_ty(input.insn, input.input);
+     let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
+     let is_const = inputs.constant.is_some();
+     let in_regs = if let Some(c) = inputs.constant {
+         // Generate constants fresh at each use to minimize long-range register pressure.
+         generate_constant(ctx, ty, c as u128)
+     } else {
+         ctx.put_input_in_regs(input.insn, input.input)
+     };
+     (in_regs, ty, is_const)
+ }

  /// Lower an instruction input to a register
  ///
  /// The given register will be extended appropriately, according to
@@ -262,17 +282,12 @@ pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
      input: InsnInput,
      narrow_mode: NarrowValueMode,
  ) -> Reg {
-     let reg = put_input_in_regs(ctx, input)
+     let (in_regs, ty, is_const) = lower_input_to_regs(ctx, input);
+     let reg = in_regs
          .only_reg()
          .expect("Multi-register value not expected");
-     let is_const = ctx
-         .get_input_as_source_or_const(input.insn, input.input)
-         .constant
-         .is_some();
-     let ty = ctx.input_ty(input.insn, input.input);
-     narrow_reg(ctx, ty, reg, is_const, narrow_mode)
+     extend_reg(ctx, ty, reg, is_const, narrow_mode)
  }

  /// Lower an instruction input to multiple regs
@@ -280,17 +295,7 @@ pub(crate) fn put_input_in_regs<C: LowerCtx<I = Inst>>(
      ctx: &mut C,
      input: InsnInput,
  ) -> ValueRegs<Reg> {
-     debug!("put_input_in_reg: input {:?}", input);
-     let ty = ctx.input_ty(input.insn, input.input);
-     let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
-     let in_regs = if let Some(c) = inputs.constant {
-         // Generate constants fresh at each use to minimize long-range register pressure.
-         generate_constant(ctx, ty, c)
-     } else {
-         ctx.put_input_in_regs(input.insn, input.input)
-     };
+     let (in_regs, _, _) = lower_input_to_regs(ctx, input);
      in_regs
  }

View File

@@ -287,7 +287,7 @@ block0(v0: i64):
  ; nextln: ret

- ; The aarch64 abi requires that the i128 argument be aligned
+ ; The AArch64 ABI requires that the i128 argument be aligned
  ; and to be passed in x2 and x3
  function %f12(i64, i128) -> i64 {
  block0(v0: i64, v1: i128):
@@ -325,7 +325,7 @@ block0(v0: i64):
- ; aarch64 allows the i128 argument to not be aligned
+ ; The Apple AArch64 ABI allows the i128 argument to not be aligned
  ; and to be passed in x1 and x2
  function %f13(i64, i128) -> i64 apple_aarch64 {
  block0(v0: i64, v1: i128):
@@ -360,3 +360,122 @@ block0(v0: i64):
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

; We only have 8 registers (x0-x7) to pass integer arguments in;
; make sure we spill the last i128 argument even though one register (x7) is still free
function %f14(i128, i128, i128, i64, i128) -> i128 {
block0(v0: i128, v1: i128, v2: i128, v3: i64, v4: i128):
return v4
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: ldur x0, [fp, #16]
; nextln: ldur x1, [fp, #24]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f14_call(i128, i64) -> i128 {
fn0 = %f14(i128, i128, i128, i64, i128) -> i128
block0(v0: i128, v1: i64):
v2 = call fn0(v0, v0, v0, v1, v0)
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; TODO: Some codegen optimization is possible here; x0/x1 move to x7/x8 and then move back
; nextln: mov x7, x0
; nextln: mov x8, x1
; nextln: mov x6, x2
; nextln: sub sp, sp, #16
; nextln: virtual_sp_offset_adjust 16
; nextln: mov x0, x7
; nextln: mov x1, x8
; nextln: mov x2, x7
; nextln: mov x3, x8
; nextln: mov x4, x7
; nextln: mov x5, x8
; nextln: stur x7, [sp]
; nextln: stur x8, [sp, #8]
; nextln: ldr x7, 8 ; b 12 ; data
; nextln: blr x7
; nextln: add sp, sp, #16
; nextln: virtual_sp_offset_adjust -16
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
; We have one register slot available (similar to %f14); however, the Apple ABI
; allows us to start an i128 at a non-even-numbered register (x7 in this case).
;
; It is unspecified whether we can split the i128 between x7 and the stack.
; In practice LLVM does not do this, so we follow the same behaviour.
function %f15(i128, i128, i128, i64, i128) -> i128 apple_aarch64 {
block0(v0: i128, v1: i128, v2: i128, v3: i64, v4: i128):
return v4
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: ldur x0, [fp, #16]
; nextln: ldur x1, [fp, #24]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f15_call(i128, i64) -> i128 apple_aarch64 {
fn0 = %f15(i128, i128, i128, i64, i128) -> i128 apple_aarch64
block0(v0: i128, v1: i64):
v2 = call fn0(v0, v0, v0, v1, v0)
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: mov x7, x0
; nextln: mov x8, x1
; nextln: mov x6, x2
; nextln: sub sp, sp, #16
; nextln: virtual_sp_offset_adjust 16
; nextln: mov x0, x7
; nextln: mov x1, x8
; nextln: mov x2, x7
; nextln: mov x3, x8
; nextln: mov x4, x7
; nextln: mov x5, x8
; nextln: stur x7, [sp]
; nextln: stur x8, [sp, #8]
; nextln: ldr x7, 8 ; b 12 ; data
; nextln: blr x7
; nextln: add sp, sp, #16
; nextln: virtual_sp_offset_adjust -16
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f16() -> i32, i32 wasmtime_system_v {
block0:
v0 = iconst.i32 0
v1 = iconst.i32 1
return v0, v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: mov x1, x0
; nextln: movz x0, #0
; nextln: movz x2, #1
; nextln: stur w2, [x1]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret