From fbcfffdeab81274fea944e866713872f0d72b2b5 Mon Sep 17 00:00:00 2001
From: Afonso Bordado
Date: Wed, 12 May 2021 12:51:45 +0100
Subject: [PATCH] Handle spilling i128 arguments into the stack in aarch64

---
 cranelift/codegen/src/isa/aarch64/abi.rs      |  46 +++++--
 cranelift/codegen/src/isa/aarch64/lower.rs    |  57 ++++----
 .../filetests/filetests/isa/aarch64/call.clif | 123 +++++++++++++++++-
 3 files changed, 184 insertions(+), 42 deletions(-)

diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs
index f9e0de1498..f521087565 100644
--- a/cranelift/codegen/src/isa/aarch64/abi.rs
+++ b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -187,7 +187,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
         let is_baldrdash = call_conv.extends_baldrdash();
         let has_baldrdash_tls = call_conv == isa::CallConv::Baldrdash2020;
 
-        // See AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), sections 5.4.
+        // See AArch64 ABI (https://github.com/ARM-software/abi-aa/blob/2021Q1/aapcs64/aapcs64.rst#64parameter-passing), sections 6.4.
         //
         // MacOS aarch64 is slightly different, see also
         // https://developer.apple.com/documentation/xcode/writing_arm64_code_for_apple_platforms.
@@ -265,7 +265,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
                 param.value_type
             );
 
-            let (rcs, _) = Inst::rc_for_type(param.value_type)?;
+            let (rcs, reg_types) = Inst::rc_for_type(param.value_type)?;
 
             if let Some(param) = try_fill_baldrdash_reg(call_conv, param) {
                 assert!(rcs[0] == RegClass::I64);
@@ -288,7 +288,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
             // Handle multi register params
             //
-            // See AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), (Section 5.4 Stage C).
+            // See AArch64 ABI (https://github.com/ARM-software/abi-aa/blob/2021Q1/aapcs64/aapcs64.rst#642parameter-passing-rules), (Section 6.4.2 Stage C).
             //
             // For arguments with alignment of 16 we round up the the register number
            // to the next even value.
            // So we can never allocate for example an i128
@@ -301,7 +301,11 @@ impl ABIMachineSpec for AArch64MachineDeps {
             // restriction of passing the lower half in Xn and the upper half in Xn+1
             // (Stage C.9)
             //
-            // For examples of how llvm handles this: https://godbolt.org/z/bhd3vvEfh
+            // For examples of how LLVM handles this: https://godbolt.org/z/bhd3vvEfh
+            //
+            // On the Apple ABI it is unspecified whether we can spill half the value onto the stack,
+            // i.e. load the lower half into x7 and the upper half onto the stack.
+            // LLVM does not seem to do this, so we replicate that behaviour.
             let is_multi_reg = rcs.len() >= 2;
             if is_multi_reg {
                 assert!(
@@ -348,10 +352,8 @@ impl ABIMachineSpec for AArch64MachineDeps {
                     remaining_reg_vals -= 2;
                     continue;
                 }
-            }
-
-            // Single Register parameters
-            if !is_multi_reg {
+            } else {
+                // Single Register parameters
                 let rc = rcs[0];
                 let next_reg = match rc {
                     RegClass::I64 => &mut next_xreg,
@@ -400,12 +402,28 @@ impl ABIMachineSpec for AArch64MachineDeps {
             debug_assert!(size.is_power_of_two());
             next_stack = align_to(next_stack, size);
 
-            ret.push(ABIArg::stack(
-                next_stack as i64,
-                param.value_type,
-                param.extension,
-                param.purpose,
-            ));
+            let slots = reg_types
+                .iter()
+                .copied()
+                // Build the stack locations from each slot
+                .scan(next_stack, |next_stack, ty| {
+                    let slot_offset = *next_stack as i64;
+                    *next_stack += (ty_bits(ty) / 8) as u64;
+
+                    Some((ty, slot_offset))
+                })
+                .map(|(ty, offset)| ABIArgSlot::Stack {
+                    offset,
+                    ty,
+                    extension: param.extension,
+                })
+                .collect();
+
+            ret.push(ABIArg::Slots {
+                slots,
+                purpose: param.purpose,
+            });
+
             next_stack += size;
         }
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index 7c1cd5f055..d07311159e 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -158,18 +158,18 @@ impl NarrowValueMode {
     }
 }
 
-/// Emits instruction(s) to generate the given 64-bit constant value into a newly-allocated
-/// temporary register, returning that register.
-fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u64) -> ValueRegs<Reg> {
+/// Emits instruction(s) to generate the given constant value into newly-allocated
+/// temporary registers, returning those registers.
+fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u128) -> ValueRegs<Reg> {
     let from_bits = ty_bits(ty);
-    let masked = if from_bits < 64 {
-        c & ((1u64 << from_bits) - 1)
+    let masked = if from_bits < 128 {
+        c & ((1u128 << from_bits) - 1)
     } else {
         c
     };
 
     let cst_copy = ctx.alloc_tmp(ty);
-    for inst in Inst::gen_constant(cst_copy, masked as u128, ty, |ty| {
+    for inst in Inst::gen_constant(cst_copy, masked, ty, |ty| {
         ctx.alloc_tmp(ty).only_reg().unwrap()
     })
     .into_iter()
@@ -181,7 +181,7 @@ fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u64) -> Va
 
 /// Extends a register according to `narrow_mode`.
 /// If extended, the value is always extended to 64 bits, for simplicity.
-fn narrow_reg<C: LowerCtx<I = Inst>>(
+fn extend_reg<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     ty: Type,
     in_reg: Reg,
@@ -252,6 +252,26 @@ fn narrow_reg<C: LowerCtx<I = Inst>>(
     }
 }
 
+/// Lowers an instruction input to multiple regs
+fn lower_input_to_regs<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    input: InsnInput,
+) -> (ValueRegs<Reg>, Type, bool) {
+    debug!("lower_input_to_regs: input {:?}", input);
+    let ty = ctx.input_ty(input.insn, input.input);
+    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
+    let is_const = inputs.constant.is_some();
+
+    let in_regs = if let Some(c) = inputs.constant {
+        // Generate constants fresh at each use to minimize long-range register pressure.
+        generate_constant(ctx, ty, c as u128)
+    } else {
+        ctx.put_input_in_regs(input.insn, input.input)
+    };
+
+    (in_regs, ty, is_const)
+}
+
 /// Lower an instruction input to a register
 ///
 /// The given register will be extended appropriately, according to
@@ -262,17 +282,12 @@ pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
     input: InsnInput,
     narrow_mode: NarrowValueMode,
 ) -> Reg {
-    let reg = put_input_in_regs(ctx, input)
+    let (in_regs, ty, is_const) = lower_input_to_regs(ctx, input);
+    let reg = in_regs
         .only_reg()
         .expect("Multi-register value not expected");
 
-    let is_const = ctx
-        .get_input_as_source_or_const(input.insn, input.input)
-        .constant
-        .is_some();
-
-    let ty = ctx.input_ty(input.insn, input.input);
-    narrow_reg(ctx, ty, reg, is_const, narrow_mode)
+    extend_reg(ctx, ty, reg, is_const, narrow_mode)
 }
 
 /// Lower an instruction input to multiple regs
@@ -280,17 +295,7 @@ pub(crate) fn put_input_in_regs<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     input: InsnInput,
 ) -> ValueRegs<Reg> {
-    debug!("put_input_in_reg: input {:?}", input);
-    let ty = ctx.input_ty(input.insn, input.input);
-    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
-
-    let in_regs = if let Some(c) = inputs.constant {
-        // Generate constants fresh at each use to minimize long-range register pressure.
-        generate_constant(ctx, ty, c)
-    } else {
-        ctx.put_input_in_regs(input.insn, input.input)
-    };
-
+    let (in_regs, _, _) = lower_input_to_regs(ctx, input);
     in_regs
 }
 
diff --git a/cranelift/filetests/filetests/isa/aarch64/call.clif b/cranelift/filetests/filetests/isa/aarch64/call.clif
index 090c220167..9491c73b75 100644
--- a/cranelift/filetests/filetests/isa/aarch64/call.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/call.clif
@@ -287,7 +287,7 @@ block0(v0: i64):
 ; nextln: ret
 
 
-; The aarch64 abi requires that the i128 argument be aligned
+; The AArch64 ABI requires that the i128 argument be aligned
 ; and to be passed in x2 and x3
 function %f12(i64, i128) -> i64 {
 block0(v0: i64, v1: i128):
@@ -325,7 +325,7 @@ block0(v0: i64):
 
 
 
-; aarch64 allows the i128 argument to not be aligned
+; The Apple AArch64 ABI allows the i128 argument to not be aligned
 ; and to be passed in x1 and x2
 function %f13(i64, i128) -> i64 apple_aarch64 {
 block0(v0: i64, v1: i128):
@@ -360,3 +360,122 @@ block0(v0: i64):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 
+
+
+; We only have 8 registers to pass data in
+; make sure we spill the last argument even though there is one slot available
+function %f14(i128, i128, i128, i64, i128) -> i128 {
+block0(v0: i128, v1: i128, v2: i128, v3: i64, v4: i128):
+    return v4
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: ldur x0, [fp, #16]
+; nextln: ldur x1, [fp, #24]
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f14_call(i128, i64) -> i128 {
+    fn0 = %f14(i128, i128, i128, i64, i128) -> i128
+
+block0(v0: i128, v1: i64):
+    v2 = call fn0(v0, v0, v0, v1, v0)
+    return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+
+; TODO: Some codegen optimization possible here with x0,x1 moving to x7,x8 and then moving back
+; nextln: mov x7, x0
+; nextln: mov x8, x1
+; nextln: mov x6, x2
+; nextln: sub sp, sp, #16
+; nextln: virtual_sp_offset_adjust 16
+; nextln: mov x0, x7
+; nextln: mov x1, x8
+; nextln: mov x2, x7
+; nextln: mov x3, x8
+; nextln: mov x4, x7
+; nextln: mov x5, x8
+; nextln: stur x7, [sp]
+; nextln: stur x8, [sp, #8]
+
+; nextln: ldr x7, 8 ; b 12 ; data
+; nextln: blr x7
+; nextln: add sp, sp, #16
+; nextln: virtual_sp_offset_adjust -16
+
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+
+; We have one register slot available (similar to %f14), however Apple
+; allows us to start an i128 in a non-even-numbered register (x7 in this case).
+;
+; It is unspecified if we can split the i128 into x7 + the stack.
+; In practice LLVM does not do this, so we do the same.
+function %f15(i128, i128, i128, i64, i128) -> i128 apple_aarch64 {
+block0(v0: i128, v1: i128, v2: i128, v3: i64, v4: i128):
+    return v4
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: ldur x0, [fp, #16]
+; nextln: ldur x1, [fp, #24]
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f15_call(i128, i64) -> i128 apple_aarch64 {
+    fn0 = %f15(i128, i128, i128, i64, i128) -> i128 apple_aarch64
+
+block0(v0: i128, v1: i64):
+    v2 = call fn0(v0, v0, v0, v1, v0)
+    return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+
+; nextln: mov x7, x0
+; nextln: mov x8, x1
+; nextln: mov x6, x2
+; nextln: sub sp, sp, #16
+; nextln: virtual_sp_offset_adjust 16
+; nextln: mov x0, x7
+; nextln: mov x1, x8
+; nextln: mov x2, x7
+; nextln: mov x3, x8
+; nextln: mov x4, x7
+; nextln: mov x5, x8
+; nextln: stur x7, [sp]
+; nextln: stur x8, [sp, #8]
+
+; nextln: ldr x7, 8 ; b 12 ; data
+; nextln: blr x7
+; nextln: add sp, sp, #16
+; nextln: virtual_sp_offset_adjust -16
+
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %f16() -> i32, i32 wasmtime_system_v {
+block0:
+    v0 = iconst.i32 0
+    v1 = iconst.i32 1
+    return v0, v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x1, x0
+; nextln: movz x0, #0
+; nextln: movz x2, #1
+; nextln: stur w2, [x1]
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
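Below is a small, self-contained Rust sketch (not part of the patch itself) of the stack-slot layout the new compute_arg_locs code builds: each register-class slot of a multi-register value, such as the two 64-bit halves of an i128, gets its own stack offset computed by walking a running offset forward. SlotTy and StackSlot are hypothetical stand-ins for Cranelift's Type and ABIArgSlot::Stack.

// Sketch only; assumes a simplified 64-bit-slot model instead of Cranelift's types.
#[derive(Clone, Copy, Debug)]
enum SlotTy {
    I64,
}

fn slot_bytes(ty: SlotTy) -> u64 {
    match ty {
        SlotTy::I64 => 8,
    }
}

#[derive(Debug)]
struct StackSlot {
    offset: i64,
    ty: SlotTy,
}

fn stack_slots_for(reg_types: &[SlotTy], next_stack: u64) -> Vec<StackSlot> {
    reg_types
        .iter()
        .copied()
        // Build the stack locations from each slot, like the `.scan` in the
        // patch: record the current offset, then advance it by the slot size.
        .scan(next_stack, |next_stack, ty| {
            let slot_offset = *next_stack as i64;
            *next_stack += slot_bytes(ty);
            Some((ty, slot_offset))
        })
        .map(|(ty, offset)| StackSlot { offset, ty })
        .collect()
}

fn main() {
    // An i128 argument is two I64 register-class slots; starting at stack
    // offset 0 they land at offsets 0 and 8, which is why the spilled fifth
    // argument of %f14 is reloaded with `ldur x0, [fp, #16]` and
    // `ldur x1, [fp, #24]` once the 16-byte fp/lr pair has been pushed.
    let slots = stack_slots_for(&[SlotTy::I64, SlotTy::I64], 0);
    println!("{:?}", slots);
}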