From eec60c9b06bed1a853b1ef1805c9731377bbffbf Mon Sep 17 00:00:00 2001
From: Joey Gouly
Date: Fri, 2 Oct 2020 16:22:55 +0100
Subject: [PATCH] arm64: Use SignedOffset rather than PreIndexed addressing mode for callee-saved registers

This also passes `fixed_frame_storage_size` (previously `total_sp_adjust`)
into `gen_clobber_save` so that it can be combined with other stack
adjustments.

Copyright (c) 2020, Arm Limited.
---
 cranelift/codegen/src/isa/aarch64/abi.rs      | 98 +++++++++++--------
 cranelift/codegen/src/isa/x64/abi.rs          |  6 +-
 cranelift/codegen/src/machinst/abi_impl.rs    | 19 ++--
 .../filetests/filetests/isa/aarch64/call.clif |  3 -
 .../filetests/isa/aarch64/reftypes.clif       |  6 +-
 5 files changed, 73 insertions(+), 59 deletions(-)

diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs
index 5d883f1f30..03c6622f12 100644
--- a/cranelift/codegen/src/isa/aarch64/abi.rs
+++ b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -129,6 +129,18 @@ impl Into<AMode> for StackAMode {
     }
 }
 
+// Returns the size of the stack space needed to store the
+// registers in `int_reg` and `vec_reg`.
+fn saved_reg_stack_size(
+    int_reg: &[Writable<RealReg>],
+    vec_reg: &[Writable<RealReg>],
+) -> (usize, usize) {
+    // Round up to multiple of 2, to keep 16-byte stack alignment.
+    let int_save_bytes = (int_reg.len() + (int_reg.len() & 1)) * 8;
+    let vec_save_bytes = vec_reg.len() * 16;
+    (int_save_bytes, vec_save_bytes)
+}
+
 /// AArch64-specific ABI behavior. This struct just serves as an implementation
 /// point for the trait; it is never actually instantiated.
 pub(crate) struct AArch64MachineDeps;
@@ -495,11 +507,18 @@ impl ABIMachineSpec for AArch64MachineDeps {
         call_conv: isa::CallConv,
         _: &settings::Flags,
         clobbers: &Set<Writable<RealReg>>,
+        fixed_frame_storage_size: u32,
     ) -> (u64, SmallVec<[Inst; 16]>) {
         let mut insts = SmallVec::new();
         let (clobbered_int, clobbered_vec) = get_callee_saves(call_conv, clobbers);
-        let mut clobber_size = 0;
-        for reg_pair in clobbered_int.chunks(2) {
+
+        let (int_save_bytes, vec_save_bytes) = saved_reg_stack_size(&clobbered_int, &clobbered_vec);
+        let total_save_bytes = (vec_save_bytes + int_save_bytes) as i32;
+        insts.extend(Self::gen_sp_reg_adjust(
+            -(total_save_bytes + fixed_frame_storage_size as i32),
+        ));
+
+        for (i, reg_pair) in clobbered_int.chunks(2).enumerate() {
             let (r1, r2) = if reg_pair.len() == 2 {
                 // .to_reg().to_reg(): Writable<RealReg> --> RealReg --> Reg
                 (reg_pair[0].to_reg().to_reg(), reg_pair[1].to_reg().to_reg())
@@ -510,36 +529,30 @@ impl ABIMachineSpec for AArch64MachineDeps {
             debug_assert!(r1.get_class() == RegClass::I64);
             debug_assert!(r2.get_class() == RegClass::I64);
 
-            // stp r1, r2, [sp, #-16]!
+            // stp r1, r2, [sp, #(i * 16)]
             insts.push(Inst::StoreP64 {
                 rt: r1,
                 rt2: r2,
-                mem: PairAMode::PreIndexed(
-                    writable_stack_reg(),
-                    SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(),
+                mem: PairAMode::SignedOffset(
+                    stack_reg(),
+                    SImm7Scaled::maybe_from_i64((i * 16) as i64, types::I64).unwrap(),
                 ),
             });
-            clobber_size += 16;
-        }
-        let vec_save_bytes = clobbered_vec.len() * 16;
-        if vec_save_bytes != 0 {
-            insts.push(Inst::AluRRImm12 {
-                alu_op: ALUOp::Sub64,
-                rd: writable_stack_reg(),
-                rn: stack_reg(),
-                imm12: Imm12::maybe_from_u64(vec_save_bytes as u64).unwrap(),
-            });
-            clobber_size += vec_save_bytes;
         }
+
+        let vec_offset = int_save_bytes;
         for (i, reg) in clobbered_vec.iter().enumerate() {
             insts.push(Inst::FpuStore128 {
                 rd: reg.to_reg().to_reg(),
-                mem: AMode::Unscaled(stack_reg(), SImm9::maybe_from_i64((i * 16) as i64).unwrap()),
+                mem: AMode::Unscaled(
+                    stack_reg(),
+                    SImm9::maybe_from_i64((vec_offset + (i * 16)) as i64).unwrap(),
+                ),
                 srcloc: None,
             });
         }
 
-        (clobber_size as u64, insts)
+        (total_save_bytes as u64, insts)
     }
 
     fn gen_clobber_restore(
@@ -550,24 +563,8 @@
         let mut insts = SmallVec::new();
         let (clobbered_int, clobbered_vec) = get_callee_saves(call_conv, clobbers);
 
-        for (i, reg) in clobbered_vec.iter().enumerate() {
-            insts.push(Inst::FpuLoad128 {
-                rd: Writable::from_reg(reg.to_reg().to_reg()),
-                mem: AMode::Unscaled(stack_reg(), SImm9::maybe_from_i64((i * 16) as i64).unwrap()),
-                srcloc: None,
-            });
-        }
-        let vec_save_bytes = clobbered_vec.len() * 16;
-        if vec_save_bytes != 0 {
-            insts.push(Inst::AluRRImm12 {
-                alu_op: ALUOp::Add64,
-                rd: writable_stack_reg(),
-                rn: stack_reg(),
-                imm12: Imm12::maybe_from_u64(vec_save_bytes as u64).unwrap(),
-            });
-        }
-
-        for reg_pair in clobbered_int.chunks(2).rev() {
+        let (int_save_bytes, vec_save_bytes) = saved_reg_stack_size(&clobbered_int, &clobbered_vec);
+        for (i, reg_pair) in clobbered_int.chunks(2).enumerate() {
             let (r1, r2) = if reg_pair.len() == 2 {
                 (
                     reg_pair[0].map(|r| r.to_reg()),
@@ -580,17 +577,36 @@
             debug_assert!(r1.to_reg().get_class() == RegClass::I64);
             debug_assert!(r2.to_reg().get_class() == RegClass::I64);
 
-            // ldp r1, r2, [sp], #16
+            // ldp r1, r2, [sp, #(i * 16)]
             insts.push(Inst::LoadP64 {
                 rt: r1,
                 rt2: r2,
-                mem: PairAMode::PostIndexed(
-                    writable_stack_reg(),
-                    SImm7Scaled::maybe_from_i64(16, types::I64).unwrap(),
+                mem: PairAMode::SignedOffset(
+                    stack_reg(),
+                    SImm7Scaled::maybe_from_i64((i * 16) as i64, types::I64).unwrap(),
                 ),
             });
         }
 
+        for (i, reg) in clobbered_vec.iter().enumerate() {
+            insts.push(Inst::FpuLoad128 {
+                rd: Writable::from_reg(reg.to_reg().to_reg()),
+                mem: AMode::Unscaled(
+                    stack_reg(),
+                    SImm9::maybe_from_i64(((i * 16) + int_save_bytes) as i64).unwrap(),
+                ),
+                srcloc: None,
+            });
+        }
+
+        // For non-baldrdash calling conventions, the epilogue restores the
+        // stack pointer from the frame pointer, so only baldrdash needs this
+        // explicit `add` to pop the clobber-save area.
+        if call_conv.extends_baldrdash() {
+            let total_save_bytes = (int_save_bytes + vec_save_bytes) as i32;
+            insts.extend(Self::gen_sp_reg_adjust(total_save_bytes));
+        }
+
         // If this is Baldrdash-2020, restore the callee (i.e., our) TLS
         // register. We may have allocated it for something else and clobbered
         // it, but the ABI expects us to leave the TLS register unchanged.
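The net effect of the aarch64 change is visible in the reftypes.clif expectations
further down: a single `sub sp, sp, #48` followed by `stp x19, x20, [sp]` replaces
the old `sub sp, sp, #32` plus a pre-indexed `stp x19, x20, [sp, #-16]!`. Below is a
minimal sketch of the offset arithmetic with illustrative names (not the Cranelift
API; only the formulas mirror `saved_reg_stack_size` and `gen_clobber_save` above):

    // Illustrative sketch, not Cranelift code: the save-area layout that
    // gen_clobber_save now emits with a single up-front SP adjustment.
    fn sketch_layout(num_int: usize, num_vec: usize, fixed_frame_storage_size: u32) {
        // Same rounding as saved_reg_stack_size: an odd count of 8-byte
        // integer saves is padded so the area stays 16-byte aligned.
        let int_save_bytes = (num_int + (num_int & 1)) * 8;
        let vec_save_bytes = num_vec * 16;
        let total = int_save_bytes + vec_save_bytes + fixed_frame_storage_size as usize;

        println!("sub sp, sp, #{}", total); // one combined adjustment
        for i in 0..(num_int + 1) / 2 {
            // SignedOffset pair stores; SP is already at its final value.
            println!("stp ..., [sp, #{}]", i * 16);
        }
        for i in 0..num_vec {
            // Vector saves live just above the integer save area.
            println!("str q_, [sp, #{}]", int_save_bytes + i * 16);
        }
    }

    fn main() {
        // Two integer callee-saves and 32 bytes of stack slots, matching the
        // reftypes.clif test below: prints `sub sp, sp, #48` then
        // `stp ..., [sp, #0]`.
        sketch_layout(2, 0, 32);
    }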
diff --git a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs
index ffa14c218a..2c1404937c 100644
--- a/cranelift/codegen/src/isa/x64/abi.rs
+++ b/cranelift/codegen/src/isa/x64/abi.rs
@@ -389,13 +389,15 @@ impl ABIMachineSpec for X64ABIMachineSpec {
         call_conv: isa::CallConv,
         _: &settings::Flags,
         clobbers: &Set<Writable<RealReg>>,
+        fixed_frame_storage_size: u32,
     ) -> (u64, SmallVec<[Self::I; 16]>) {
         let mut insts = SmallVec::new();
         // Find all clobbered registers that are callee-save. These are only I64
         // registers (all XMM registers are caller-save) so we can compute the
         // total size of the needed stack space easily.
         let clobbered = get_callee_saves(&call_conv, clobbers);
-        let stack_size = 8 * clobbered.len() as u32;
+        let clobbered_size = 8 * clobbered.len() as u32;
+        let stack_size = clobbered_size + fixed_frame_storage_size;
         // Align to 16 bytes.
         let stack_size = (stack_size + 15) & !15;
         // Adjust the stack pointer downward with one `sub rsp, IMM`
@@ -428,7 +430,7 @@
             }
         }
 
-        (stack_size as u64, insts)
+        (clobbered_size as u64, insts)
     }
 
     fn gen_clobber_restore(
diff --git a/cranelift/codegen/src/machinst/abi_impl.rs b/cranelift/codegen/src/machinst/abi_impl.rs
index 632fb44063..ed7a8f1448 100644
--- a/cranelift/codegen/src/machinst/abi_impl.rs
+++ b/cranelift/codegen/src/machinst/abi_impl.rs
@@ -325,6 +325,7 @@ pub trait ABIMachineSpec {
         call_conv: isa::CallConv,
         flags: &settings::Flags,
         clobbers: &Set<Writable<RealReg>>,
+        fixed_frame_storage_size: u32,
     ) -> (u64, SmallVec<[Self::I; 16]>);
 
     /// Generate a clobber-restore sequence. This sequence should perform the
@@ -931,7 +932,7 @@ impl<M: ABIMachineSpec> ABICallee for ABICalleeImpl<M> {
         let mask = 2 * bytes - 1;
         let total_stacksize = (total_stacksize + mask) & !mask; // 16-align the stack.
 
-        let mut total_sp_adjust = 0;
+        let mut fixed_frame_storage_size = 0;
 
         if !self.call_conv.extends_baldrdash() {
             // Leaf functions with zero stack don't need a stack check if one's
@@ -943,7 +944,7 @@
                 }
             }
             if total_stacksize > 0 {
-                total_sp_adjust += total_stacksize as u64;
+                fixed_frame_storage_size += total_stacksize;
             }
         }
 
@@ -957,15 +958,13 @@
         // [crate::machinst::abi_impl](this module) for more details on
         // stackframe layout and nominal SP maintenance.
 
-        if total_sp_adjust > 0 {
-            // sub sp, sp, #total_stacksize
-            let adj = total_sp_adjust as i32;
-            insts.extend(M::gen_sp_reg_adjust(-adj));
-        }
-
         // Save clobbered registers.
- let (clobber_size, clobber_insts) = - M::gen_clobber_save(self.call_conv, &self.flags, &self.clobbered); + let (clobber_size, clobber_insts) = M::gen_clobber_save( + self.call_conv, + &self.flags, + &self.clobbered, + fixed_frame_storage_size, + ); insts.extend(clobber_insts); if clobber_size > 0 { diff --git a/cranelift/filetests/filetests/isa/aarch64/call.clif b/cranelift/filetests/filetests/isa/aarch64/call.clif index 579c993b19..f08b7c23fc 100644 --- a/cranelift/filetests/filetests/isa/aarch64/call.clif +++ b/cranelift/filetests/filetests/isa/aarch64/call.clif @@ -179,7 +179,6 @@ block0: ; nextln: ldr q8, [sp] ; nextln: ldr q9, [sp, #16] ; nextln: ldr q10, [sp, #32] -; nextln: add sp, sp, #48 ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret @@ -230,7 +229,6 @@ block0: ; nextln: ldr q8, [sp] ; nextln: ldr q9, [sp, #16] ; nextln: ldr q10, [sp, #32] -; nextln: add sp, sp, #48 ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret @@ -285,7 +283,6 @@ block0: ; nextln: ldr q8, [sp] ; nextln: ldr q9, [sp, #16] ; nextln: ldr q10, [sp, #32] -; nextln: add sp, sp, #48 ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret diff --git a/cranelift/filetests/filetests/isa/aarch64/reftypes.clif b/cranelift/filetests/filetests/isa/aarch64/reftypes.clif index 97234a7da0..c1775802fc 100644 --- a/cranelift/filetests/filetests/isa/aarch64/reftypes.clif +++ b/cranelift/filetests/filetests/isa/aarch64/reftypes.clif @@ -77,8 +77,8 @@ block3(v7: r64, v8: r64): ; check: Block 0: ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: sub sp, sp, #32 -; nextln: stp x19, x20, [sp, #-16]! +; nextln: sub sp, sp, #48 +; nextln: stp x19, x20, [sp] ; nextln: virtual_sp_offset_adjust 16 ; nextln: mov x19, x0 ; nextln: mov x20, x1 @@ -111,7 +111,7 @@ block3(v7: r64, v8: r64): ; nextln: ldr x1, [x1] ; nextln: mov x2, x1 ; nextln: mov x1, x19 -; nextln: ldp x19, x20, [sp], #16 +; nextln: ldp x19, x20, [sp] ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret
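The call.clif updates above show the matching epilogue simplification: the explicit
`add sp, sp, #48` disappears because `mov sp, fp` already discards the clobber-save
area along with the rest of the frame. A toy model of that equivalence, with
illustrative values only (not Cranelift code):

    // Old epilogue: `add sp, sp, #48` then `mov sp, fp`.
    fn old_epilogue(fp: u64, sp: u64) -> u64 {
        let sp = sp + 48; // add sp, sp, #48: pops the clobber-save area...
        let _ = sp;       // ...but the very next instruction overwrites it:
        fp                // mov sp, fp
    }

    // New epilogue: `mov sp, fp` alone reaches the same SP.
    fn new_epilogue(fp: u64, _sp: u64) -> u64 {
        fp
    }

    fn main() {
        let fp = 0x7fff_f000u64;
        let sp = fp - 48; // SP after the prologue's combined `sub`
        assert_eq!(old_epilogue(fp, sp), new_epilogue(fp, sp));
    }

Baldrdash is the exception: it does not restore SP from FP, which is why
gen_clobber_restore keeps an explicit `add` (via `gen_sp_reg_adjust`) for that
calling convention only.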