From 78c3091e84d1c3c5fa4ac4a34c404ed6ea0125e6 Mon Sep 17 00:00:00 2001
From: Peter Huene
Date: Wed, 20 May 2020 00:42:44 -0700
Subject: [PATCH 1/4] Fix FPR saving and shadow space allocation for Windows
 x64.

This commit fixes both how FPR callee-saved registers are saved and how the
shadow space allocation occurs when laying out the stack for the Windows x64
calling convention.

Importantly, this commit removes the compiler limitation on stack size for
Windows x64 that was imposed because FPR saves previously couldn't always be
represented in the unwind information.

The FPR saves are now performed without using stack slots, much like how the
callee-saved GPRs are saved. The total CSR space is given to `layout_stack`
so that it is included in the frame size and to offset the layout of spills
and explicit slots. The FPR saves are now done via an RSP offset (post
adjustment) and they always follow the GPR saves on the stack. A simpler
calculation can now be made to determine the proper offsets of the FPR saves
for representing the unwind information.

Additionally, the shadow space is no longer treated as an incoming argument,
but as an explicit stack slot that gets laid out at the lowest address
possible in the local frame. This prevents `layout_stack` from putting a
spill or explicit slot in this reserved space. In the future, `layout_stack`
should take advantage of the *caller-provided* shadow space for spills, but
this commit does not attempt to address that.

The shadow space is now omitted from the local frame for leaf functions.

Fixes #1728.
Fixes #1587.
Fixes #1475.
---
 cranelift/codegen/src/isa/x86/abi.rs          | 250 +++++++++---------
 .../codegen/src/isa/x86/unwind/winx64.rs      | 151 ++++-------
 .../isa/x86/windows_fastcall_x64.clif         | 188 +++++++++++--
 .../isa/x86/windows_fastcall_x64_unwind.clif  |  54 ++--
 4 files changed, 380 insertions(+), 263 deletions(-)

diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs
index 85ecb0c2a9..bc7eb94f0d 100644
--- a/cranelift/codegen/src/isa/x86/abi.rs
+++ b/cranelift/codegen/src/isa/x86/abi.rs
@@ -6,7 +6,6 @@ use super::settings as isa_settings;
 use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion};
 use crate::cursor::{Cursor, CursorPosition, EncCursor};
 use crate::ir;
-use crate::ir::entities::StackSlot;
 use crate::ir::immediates::Imm64;
 use crate::ir::stackslot::{StackOffset, StackSize};
 use crate::ir::types;
@@ -19,7 +18,6 @@ use crate::regalloc::RegisterSet;
 use crate::result::CodegenResult;
 use crate::stack_layout::layout_stack;
 use alloc::borrow::Cow;
-use alloc::vec::Vec;
 use core::i32;
 use target_lexicon::{PointerWidth, Triple};
@@ -44,7 +42,7 @@ static RET_GPRS_WIN_FASTCALL_X64: [RU; 1] = [RU::rax];
 ///
 /// [2] https://blogs.msdn.microsoft.com/oldnewthing/20110302-00/?p=11333 "Although the x64 calling
 /// convention reserves spill space for parameters, you don’t have to use them as such"
-const WIN_SHADOW_STACK_SPACE: i32 = 32;
+const WIN_SHADOW_STACK_SPACE: StackSize = 32;
 
 /// Stack alignment requirement for functions.
 ///
@@ -87,7 +85,7 @@ impl Args {
             WIN_SHADOW_STACK_SPACE
         } else {
             0
-        } as u32;
+        };
 
         Self {
             pointer_bytes: bits / 8,
@@ -501,7 +499,7 @@ fn baldrdash_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) ->
     let word_size = StackSize::from(isa.pointer_bytes());
     let shadow_store_size = if func.signature.call_conv.extends_windows_fastcall() {
-        WIN_SHADOW_STACK_SPACE as u32
+        WIN_SHADOW_STACK_SPACE
     } else {
         0
     };
 
@@ -525,50 +523,60 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
         panic!("TODO: windows-fastcall: x86-32 not implemented yet");
     }
 
-    let csrs = callee_saved_regs_used(isa, func);
-
     // The reserved stack area is composed of:
-    //   return address + frame pointer + all callee-saved registers + shadow space
+    //   return address + frame pointer + all callee-saved registers
     //
     // Pushing the return address is an implicit function of the `call`
     // instruction. Each of the others we will then push explicitly. Then we
     // will adjust the stack pointer to make room for the rest of the required
     // space for this frame.
-    let word_size = isa.pointer_bytes() as usize;
-    let num_fprs = csrs.iter(FPR).len();
-    let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32;
+    let csrs = callee_saved_regs_used(isa, func);
+    let gpsr_stack_size = ((csrs.iter(GPR).len() + 2) * isa.pointer_bytes() as usize) as u32;
+    let fpsr_stack_size = (csrs.iter(FPR).len() * types::F64X2.bytes() as usize) as u32;
+    let mut csr_stack_size = gpsr_stack_size + fpsr_stack_size;
 
-    // Only create an FPR stack slot if we're going to save FPRs.
-    let fpr_slot = if num_fprs > 0 {
-        // Create a stack slot for FPRs to be preserved in. This is an `ExplicitSlot` because it
-        // seems to most closely map to it as a `StackSlotKind`: FPR preserve/restore should be
-        // through `stack_load` and `stack_store` (see later comment about issue #1198). Even
-        // though in a certain light FPR preserve/restore is "spilling" an argument, regalloc
-        // implies that `SpillSlot` may be eligible for certain optimizations, and we know with
-        // certainty that this space may not be reused in the function, nor moved around.
-        Some(func.create_stack_slot(ir::StackSlotData {
-            kind: ir::StackSlotKind::ExplicitSlot,
-            size: (num_fprs * types::F64X2.bytes() as usize) as u32,
-            offset: None,
-        }))
-    } else {
-        None
-    };
+    // FPRs must be saved with 16-byte alignment; because they follow the GPRs on the stack, align if needed
+    if fpsr_stack_size > 0 {
+        csr_stack_size += gpsr_stack_size & isa.pointer_bytes() as u32;
+    }
 
-    // TODO: eventually use the 32 bytes (shadow store) as spill slot. This currently doesn't work
-    // since cranelift does not support spill slots before incoming args
     func.create_stack_slot(ir::StackSlotData {
         kind: ir::StackSlotKind::IncomingArg,
-        size: csr_stack_size as u32,
-        offset: Some(-(WIN_SHADOW_STACK_SPACE + csr_stack_size)),
+        size: csr_stack_size,
+        offset: Some(-(csr_stack_size as StackOffset)),
     });
 
     let is_leaf = func.is_leaf();
+
+    // If not a leaf function, allocate an explicit stack slot at the end of the space for the callee's shadow space
+    if !is_leaf {
+        // TODO: eventually use the caller-provided shadow store as spill slot space when laying out the stack
+        func.create_stack_slot(ir::StackSlotData {
+            kind: ir::StackSlotKind::ExplicitSlot,
+            size: WIN_SHADOW_STACK_SPACE,
+            offset: None,
+        });
+    }
+
     let total_stack_size = layout_stack(&mut func.stack_slots, is_leaf, STACK_ALIGNMENT)? as i32;
-    let local_stack_size = i64::from(total_stack_size - csr_stack_size);
+
+    // Subtract the GPR saved register size from the local size because pushes are used for the saves
+    let local_stack_size = i64::from(total_stack_size - gpsr_stack_size as i32);
 
     // Add CSRs to function signature
     let reg_type = isa.pointer_type();
+    let sp_arg_index = if fpsr_stack_size > 0 {
+        let sp_arg = ir::AbiParam::special_reg(
+            reg_type,
+            ir::ArgumentPurpose::CalleeSaved,
+            RU::rsp as RegUnit,
+        );
+        let index = func.signature.params.len();
+        func.signature.params.push(sp_arg);
+        Some(index)
+    } else {
+        None
+    };
     let fp_arg = ir::AbiParam::special_reg(
         reg_type,
         ir::ArgumentPurpose::FramePointer,
@@ -601,7 +609,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
         local_stack_size,
         reg_type,
         &csrs,
-        fpr_slot.as_ref(),
+        sp_arg_index.is_some(),
         isa,
     );
 
@@ -612,7 +620,8 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
         local_stack_size,
         reg_type,
         &csrs,
-        fpr_slot.as_ref(),
+        sp_arg_index,
+        isa,
     );
 
     Ok(())
@@ -649,14 +658,20 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
     // Add CSRs to function signature
     let reg_type = ir::Type::int(u16::from(pointer_width.bits())).unwrap();
-    if isa.pointer_bits() == 32 {
+    // On X86-32 all parameters, including vmctx, are passed on stack, and we need
+    // to extract vmctx from the stack before we can save the frame pointer.
+    let sp_arg_index = if isa.pointer_bits() == 32 {
         let sp_arg = ir::AbiParam::special_reg(
             reg_type,
             ir::ArgumentPurpose::CalleeSaved,
             RU::rsp as RegUnit,
         );
+        let index = func.signature.params.len();
         func.signature.params.push(sp_arg);
-    }
+        Some(index)
+    } else {
+        None
+    };
     let fp_arg = ir::AbiParam::special_reg(
         reg_type,
         ir::ArgumentPurpose::FramePointer,
@@ -674,11 +689,25 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
     // Set up the cursor and insert the prologue
     let entry_block = func.layout.entry_block().expect("missing entry block");
     let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block);
-    insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, None, isa);
+    insert_common_prologue(
+        &mut pos,
+        local_stack_size,
+        reg_type,
+        &csrs,
+        sp_arg_index.is_some(),
+        isa,
+    );
 
     // Reset the cursor and insert the epilogue
     let mut pos = pos.at_position(CursorPosition::Nowhere);
-    insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, None);
+    insert_common_epilogues(
+        &mut pos,
+        local_stack_size,
+        reg_type,
+        &csrs,
+        sp_arg_index,
+        isa,
+    );
 
     Ok(())
 }
@@ -690,12 +719,10 @@ fn insert_common_prologue(
     stack_size: i64,
     reg_type: ir::types::Type,
     csrs: &RegisterSet,
-    fpr_slot: Option<&StackSlot>,
+    has_sp_param: bool,
     isa: &dyn TargetIsa,
 ) {
-    // On X86-32 all parameters, including vmctx, are passed on stack, and we need
-    // to extract vmctx from the stack before we can save the frame pointer.
-    let sp = if isa.pointer_bits() == 32 {
+    let sp = if has_sp_param {
         let block = pos.current_block().expect("missing block under cursor");
         let sp = pos.func.dfg.append_block_param(block, reg_type);
         pos.func.locations[sp] = ir::ValueLoc::Reg(RU::rsp as RegUnit);
@@ -799,38 +826,27 @@ fn insert_common_prologue(
         }
     }
 
-    // Now that RSP is prepared for the function, we can use stack slots:
+    // With the stack pointer adjusted, save any callee-saved floating point registers via offset
+    // FPR saves are at the highest addresses of the local frame allocation, immediately following the GPR pushes
     let mut last_fpr_save = None;
 
-    if let Some(fpr_slot) = fpr_slot {
-        debug_assert!(csrs.iter(FPR).len() != 0);
-
-        // `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base
-        // address. We are well after postopt could run, so load the CSR region base once here,
-        // instead of hoping that the addr/store will be combined later.
-        // See also: https://github.com/bytecodealliance/wasmtime/pull/1198
-        let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0);
+    for (i, reg) in csrs.iter(FPR).enumerate() {
+        // Append param to entry block
+        let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
 
-        // Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at
-        // function entry.
-        pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16);
+        // Since regalloc has already run, we must assign a location.
+        pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
 
-        let mut fpr_offset = 0;
+        // Offset to where the register is saved relative to RSP, accounting for FPR save alignment
+        let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
+            + (stack_size & isa.pointer_bytes() as i64);
 
-        for reg in csrs.iter(FPR) {
-            // Append param to entry Block
-            let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
-
-            // Since regalloc has already run, we must assign a location.
-            pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
-
-            last_fpr_save =
-                Some(
-                    pos.ins()
-                        .store(ir::MemFlags::trusted(), csr_arg, stack_addr, fpr_offset),
-                );
-
-            fpr_offset += types::F64X2.bytes() as i32;
-        }
+        last_fpr_save = Some(pos.ins().store(
+            ir::MemFlags::trusted(),
+            csr_arg,
+            sp.expect("FPR save requires SP param"),
+            (stack_size - offset) as i32,
+        ));
     }
 
     pos.func.prologue_end = Some(
@@ -966,13 +982,14 @@ fn insert_common_epilogues(
     stack_size: i64,
     reg_type: ir::types::Type,
     csrs: &RegisterSet,
-    fpr_slot: Option<&StackSlot>,
+    sp_arg_index: Option<usize>,
+    isa: &dyn TargetIsa,
 ) {
     while let Some(block) = pos.next_block() {
         pos.goto_last_inst(block);
         if let Some(inst) = pos.current_inst() {
             if pos.func.dfg[inst].opcode().is_return() {
-                insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, fpr_slot);
+                insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, sp_arg_index, isa);
             }
         }
     }
@@ -986,56 +1003,9 @@ fn insert_common_epilogue(
     pos: &mut EncCursor,
     reg_type: ir::types::Type,
     csrs: &RegisterSet,
-    fpr_slot: Option<&StackSlot>,
+    sp_arg_index: Option<usize>,
+    isa: &dyn TargetIsa,
 ) {
-    // Even though instructions to restore FPRs are inserted first, we have to append them after
-    // restored GPRs to satisfy parameter order in the return.
-    let mut restored_fpr_values = Vec::new();
-
-    // Restore FPRs before we move RSP and invalidate stack slots.
-    let mut first_fpr_load = None;
-    if let Some(fpr_slot) = fpr_slot {
-        debug_assert!(csrs.iter(FPR).len() != 0);
-
-        // `stack_load` is not directly encodable in x86_64 at the moment, so we'll need a base
-        // address. We are well after postopt could run, so load the CSR region base once here,
-        // instead of hoping that the addr/store will be combined later.
-        //
-        // See also: https://github.com/bytecodealliance/wasmtime/pull/1198
-        let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0);
-
-        first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
-
-        // Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at
-        // function exit.
-        pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16);
-
-        let mut fpr_offset = 0;
-
-        for reg in csrs.iter(FPR) {
-            let value = pos.ins().load(
-                types::F64X2,
-                ir::MemFlags::trusted(),
-                stack_addr,
-                fpr_offset,
-            );
-            fpr_offset += types::F64X2.bytes() as i32;
-
-            // Unlike GPRs before, we don't need to step back after reach restoration because FPR
-            // restoration is order-insensitive. Furthermore: we want GPR restoration to begin
-            // after FPR restoration, so that stack adjustments occur after we're done relying on
-            // StackSlot validity.
-
-            pos.func.locations[value] = ir::ValueLoc::Reg(reg);
-            restored_fpr_values.push(value);
-        }
-    }
-
-    let mut sp_adjust_inst = None;
-    if stack_size > 0 {
-        sp_adjust_inst = Some(pos.ins().adjust_sp_up_imm(Imm64::new(stack_size)));
-    }
-
     // Insert the pop of the frame pointer
     let fp_pop = pos.ins().x86_pop(reg_type);
     let fp_pop_inst = pos.prev_inst().unwrap();
@@ -1046,13 +1016,47 @@ fn insert_common_epilogue(
     let mut first_csr_pop_inst = None;
     for reg in csrs.iter(GPR) {
         let csr_pop = pos.ins().x86_pop(reg_type);
-        first_csr_pop_inst = Some(pos.prev_inst().unwrap());
+        first_csr_pop_inst = pos.prev_inst();
+        assert!(first_csr_pop_inst.is_some());
         pos.func.locations[csr_pop] = ir::ValueLoc::Reg(reg);
         pos.func.dfg.append_inst_arg(inst, csr_pop);
     }
 
-    for value in restored_fpr_values.into_iter() {
-        pos.func.dfg.append_inst_arg(inst, value);
+    // Insert the adjustment of SP
+    let mut sp_adjust_inst = None;
+    if stack_size > 0 {
+        pos.ins().adjust_sp_up_imm(Imm64::new(stack_size));
+        sp_adjust_inst = pos.prev_inst();
+        assert!(sp_adjust_inst.is_some());
+    }
+
+    let mut first_fpr_load = None;
+    if let Some(index) = sp_arg_index {
+        let sp = pos
+            .func
+            .dfg
+            .block_params(pos.func.layout.entry_block().unwrap())[index];
+
+        // Insert the FPR loads (unlike the GPRs, which are stack pops, these are in-order loads)
+        for (i, reg) in csrs.iter(FPR).enumerate() {
+            // Offset to where the register is saved relative to RSP, accounting for FPR save alignment
+            let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
+                + (stack_size & isa.pointer_bytes() as i64);
+
+            let value = pos.ins().load(
+                types::F64X2,
+                ir::MemFlags::trusted(),
+                sp,
+                (stack_size - offset) as i32,
+            );
+
+            first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
+
+            pos.func.locations[value] = ir::ValueLoc::Reg(reg);
+            pos.func.dfg.append_inst_arg(inst, value);
+        }
+    } else {
+        assert!(csrs.iter(FPR).len() == 0);
     }
 
     pos.func.epilogues_start.push(
diff --git a/cranelift/codegen/src/isa/x86/unwind/winx64.rs b/cranelift/codegen/src/isa/x86/unwind/winx64.rs
index 60aff23f19..4a9af95be9 100644
--- a/cranelift/codegen/src/isa/x86/unwind/winx64.rs
+++ b/cranelift/codegen/src/isa/x86/unwind/winx64.rs
@@ -28,22 +28,7 @@ pub(crate) fn create_unwind_info(
     let mut prologue_size = 0;
     let mut unwind_codes = Vec::new();
     let mut found_end = false;
-
-    // Have we saved at least one FPR? if so, we might have to check additional constraints.
-    let mut saved_fpr = false;
-
-    // In addition to the min offset for a callee-save, we need to know the offset from the
-    // frame base to the stack pointer, so that we can record an unwind offset that spans only
-    // to the end of callee-save space.
-    let mut static_frame_allocation_size = 0u32;
-
-    // For the time being, FPR preservation is split into a stack_addr and later store/load.
-    // Store the register used for stack store and ensure it is the same register with no
-    // intervening changes to the frame size.
-    let mut callee_save_region_reg = None;
-    // Also record the callee-save region's offset from RSP, because it must be added to FPR
-    // save offsets to compute an offset from the frame base.
-    let mut callee_save_offset = None;
+    let mut xmm_save_count: u8 = 0;
 
     for (offset, inst, size) in func.inst_offsets(entry_block, &isa.encoding_info()) {
         // x64 ABI prologues cannot exceed 255 bytes in length
@@ -60,8 +45,6 @@ pub(crate) fn create_unwind_info(
             InstructionData::Unary { opcode, arg } => {
                 match opcode {
                     Opcode::X86Push => {
-                        static_frame_allocation_size += 8;
-
                         unwind_codes.push(UnwindCode::PushRegister {
                             offset: unwind_offset,
                             reg: GPR.index_of(func.locations[arg].unwrap_reg()) as u8,
@@ -70,7 +53,6 @@ pub(crate) fn create_unwind_info(
                     Opcode::AdjustSpDown => {
                         let stack_size =
                             stack_size.expect("expected a previous stack size instruction");
-                        static_frame_allocation_size += stack_size;
 
                         // This is used when calling a stack check function
                         // We need to track the assignment to RAX which has the size of the stack
@@ -85,10 +67,6 @@ pub(crate) fn create_unwind_info(
             InstructionData::CopySpecial { src, dst, .. } => {
                 if let Some(frame_register) = frame_register {
                     if src == (RU::rsp as RegUnit) && dst == frame_register {
-                        // Constructing an rbp-based stack frame, so the static frame
-                        // allocation restarts at 0 from here.
-                        static_frame_allocation_size = 0;
-
                         unwind_codes.push(UnwindCode::SetFramePointer {
                             offset: unwind_offset,
                             sp_offset: 0,
@@ -113,7 +91,7 @@ pub(crate) fn create_unwind_info(
                         let imm: i64 = imm.into();
                         assert!(imm <= core::u32::MAX as i64);
 
-                        static_frame_allocation_size += imm as u32;
+                        stack_size = Some(imm as u32);
 
                         unwind_codes.push(UnwindCode::StackAlloc {
                             offset: unwind_offset,
@@ -123,52 +101,27 @@ pub(crate) fn create_unwind_info(
                     _ => {}
                 }
             }
-            InstructionData::StackLoad {
-                opcode: Opcode::StackAddr,
-                stack_slot,
-                offset: _,
-            } => {
-                let result = func.dfg.inst_results(inst).get(0).unwrap();
-                if let ValueLoc::Reg(frame_reg) = func.locations[*result] {
-                    callee_save_region_reg = Some(frame_reg);
-
-                    // Figure out the offset in the call frame that `frame_reg` will have.
-                    let frame_size = func
-                        .stack_slots
-                        .layout_info
-                        .expect("func's stack slots have layout info if stack operations exist")
-                        .frame_size;
-                    // Because we're well after the prologue has been constructed, stack slots
-                    // must have been laid out...
-                    let slot_offset = func.stack_slots[stack_slot]
-                        .offset
-                        .expect("callee-save slot has an offset computed");
-                    let frame_offset = frame_size as i32 + slot_offset;
-
-                    callee_save_offset = Some(frame_offset as u32);
-                }
-            }
             InstructionData::Store {
                 opcode: Opcode::Store,
                 args: [arg1, arg2],
-                flags: _flags,
                 offset,
+                ..
             } => {
-                if let (ValueLoc::Reg(ru), ValueLoc::Reg(base_ru)) =
+                if let (ValueLoc::Reg(src), ValueLoc::Reg(dst)) =
                     (func.locations[arg1], func.locations[arg2])
                 {
-                    if Some(base_ru) == callee_save_region_reg {
-                        let offset_int: i32 = offset.into();
-                        assert!(offset_int >= 0, "negative fpr offset would store outside the stack frame, and is almost certainly an error");
-                        let offset_int: u32 = offset_int as u32 + callee_save_offset.expect("FPR presevation requires an FPR save region, which has some stack offset");
-                        if FPR.contains(ru) {
-                            saved_fpr = true;
-                            unwind_codes.push(UnwindCode::SaveXmm {
-                                offset: unwind_offset,
-                                reg: ru as u8,
-                                stack_offset: offset_int,
-                            });
-                        }
+                    // If this is a save of an FPR, record an unwind operation
+                    // Note: the stack_offset here is relative to an adjusted SP
+                    // This will be fixed up later to be based on the frame pointer offset
+                    if dst == (RU::rsp as RegUnit) && FPR.contains(src) {
+                        let offset: i32 = offset.into();
+                        unwind_codes.push(UnwindCode::SaveXmm {
+                            offset: unwind_offset,
+                            reg: src as u8,
+                            stack_offset: offset as u32,
+                        });
+
+                        xmm_save_count += 1;
                     }
                 }
             }
@@ -183,41 +136,41 @@ pub(crate) fn create_unwind_info(
 
     assert!(found_end);
 
-    if saved_fpr {
-        if static_frame_allocation_size > 240 && saved_fpr {
-            warn!("stack frame is too large ({} bytes) to use with Windows x64 SEH when preserving FPRs. \
-                This is a Cranelift implementation limit, see \
-                https://github.com/bytecodealliance/wasmtime/issues/1475",
-                static_frame_allocation_size);
-            return Err(CodegenError::ImplLimitExceeded);
+    let mut frame_register_offset = 0;
+    if xmm_save_count > 0 {
+        // If there are XMM saves, determine the number of 16-byte slots used for all CSRs (including GPRs)
+        // The "frame register offset" will point at the last slot used (i.e. the last saved FPR)
+        // Assumption: each FPR is stored at a lower address than the previous one
+        let mut last_stack_offset = None;
+        let mut fpr_save_count: u8 = 0;
+        let mut gpr_push_count: u8 = 0;
+        for code in unwind_codes.iter_mut() {
+            match code {
+                UnwindCode::SaveXmm { stack_offset, .. } => {
+                    if let Some(last) = last_stack_offset {
+                        assert!(last > *stack_offset);
+                    }
+                    last_stack_offset = Some(*stack_offset);
+                    fpr_save_count += 1;
+                    *stack_offset = (xmm_save_count - fpr_save_count) as u32 * 16;
+                }
+                UnwindCode::PushRegister { .. } => {
+                    gpr_push_count += 1;
+                }
+                _ => {}
+            }
         }
-        // Only test static frame size is 16-byte aligned when an FPR is saved to avoid
-        // panicking when alignment is elided because no FPRs are saved and no child calls are
-        // made.
-        assert!(
-            static_frame_allocation_size % 16 == 0,
-            "static frame allocation must be a multiple of 16"
-        );
-    }
+        assert_eq!(fpr_save_count, xmm_save_count);
 
-    // Hack to avoid panicking unnecessarily. Because Cranelift generates prologues with RBP at
-    // one end of the call frame, and RSP at the other, required offsets are arbitrarily large.
-    // Windows x64 SEH only allows this offset be up to 240 bytes, however, meaning large
-    // frames are inexpressible, and we cannot actually compile the function. In case there are
-    // no preserved FPRs, we can lie without error and claim the offset to RBP is 0 - nothing
-    // will actually check it. This, then, avoids panics when compiling functions with large
-    // call frames.
-    let reported_frame_offset = if saved_fpr {
-        (static_frame_allocation_size / 16) as u8
-    } else {
-        0
-    };
+        // Account for alignment space when there's an odd number of GPR pushes
+        frame_register_offset = fpr_save_count + ((gpr_push_count + 1) / 2);
+    }
 
     Ok(Some(UnwindInfo {
         flags: 0, // this assumes cranelift functions have no SEH handlers
         prologue_size: prologue_size as u8,
         frame_register: frame_register.map(|r| GPR.index_of(r) as u8),
-        frame_register_offset: reported_frame_offset,
+        frame_register_offset,
         unwind_codes,
     }))
 }
@@ -284,7 +237,7 @@ mod tests {
                     },
                     UnwindCode::StackAlloc {
                         offset: 9,
-                        size: 64 + 32
+                        size: 64
                     }
                 ]
             }
@@ -303,7 +256,7 @@ mod tests {
             0x03, // Unwind code count (1 for stack alloc, 1 for save frame reg, 1 for push reg)
             0x05, // Frame register + offset (RBP with 0 offset)
             0x09, // Prolog offset
-            0xB2, // Operation 2 (small stack alloc), size = 0xB slots (e.g. (0xB * 8) + 8 = 96 (64 + 32) bytes)
+            0x72, // Operation 2 (small stack alloc), size = 0x7 slots (e.g. (0x7 * 8) + 8 = 64 bytes)
             0x05, // Prolog offset
             0x03, // Operation 3 (save frame register), stack pointer offset = 0
             0x02, // Prolog offset
@@ -349,7 +302,7 @@ mod tests {
                     },
                     UnwindCode::StackAlloc {
                         offset: 27,
-                        size: 10000 + 32
+                        size: 10000
                     }
                 ]
             }
@@ -369,8 +322,8 @@ mod tests {
             0x05, // Frame register + offset (RBP with 0 offset)
             0x1B, // Prolog offset
             0x01, // Operation 1 (large stack alloc), size is scaled 16-bits (info = 0)
-            0xE6, // Low size byte
-            0x04, // High size byte (e.g. 0x04E6 * 8 = 10032 (10000 + 32) bytes)
+            0xE2, // Low size byte
+            0x04, // High size byte (e.g. 0x04E2 * 8 = 10000 bytes)
             0x05, // Prolog offset
             0x03, // Operation 3 (save frame register), stack pointer offset = 0
             0x02, // Prolog offset
@@ -414,7 +367,7 @@ mod tests {
                     },
                     UnwindCode::StackAlloc {
                         offset: 27,
-                        size: 1000000 + 32
+                        size: 1000000
                     }
                 ]
             }
@@ -434,10 +387,10 @@ mod tests {
             0x05, // Frame register + offset (RBP with 0 offset)
             0x1B, // Prolog offset
             0x11, // Operation 1 (large stack alloc), size is unscaled 32-bits (info = 1)
-            0x60, // Byte 1 of size
+            0x40, // Byte 1 of size
             0x42, // Byte 2 of size
             0x0F, // Byte 3 of size
-            0x00, // Byte 4 of size (size is 0xF4260 = 1000032 (1000000 + 32) bytes)
+            0x00, // Byte 4 of size (size is 0xF4240 = 1000000 bytes)
             0x05, // Prolog offset
             0x03, // Operation 3 (save frame register), stack pointer offset = 0
             0x02, // Prolog offset
diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif
index bf77c0baef..8e8d356479 100644
--- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif
+++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif
@@ -8,29 +8,57 @@ function %one_arg(i64) windows_fastcall {
 block0(v0: i64):
     return
 }
-; check: function %one_arg(i64 [%rcx], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
-; nextln: ss0 = incoming_arg 16, offset -48
+; check: function %one_arg(i64 [%rcx], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: i64 [%rbp]):
+; nextln: x86_push v1
+; nextln: copy_special %rsp -> %rbp
+; nextln: v2 = x86_pop.i64
+; nextln: return v2
+; nextln: }
 
 ; check if we still use registers for 4 arguments
 function %four_args(i64, i64, i64, i64) windows_fastcall {
 block0(v0: i64, v1: i64, v2: i64, v3: i64):
     return
 }
-; check: function %four_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; check: function %four_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: i64 [%rdx], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: v5 = x86_pop.i64
+; nextln: return v5
+; nextln: }
 
 ; check if float arguments are passed through XMM registers
 function %four_float_args(f64, f64, f64, f64) windows_fastcall {
 block0(v0: f64, v1: f64, v2: f64, v3: f64):
     return
 }
-; check: function %four_float_args(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; check: function %four_float_args(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: v5 = x86_pop.i64
+; nextln: return v5
+; nextln: }
 
 ; check if we use stack space for > 4 arguments
 function %five_args(i64, i64, i64, i64, i64) windows_fastcall {
 block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64):
     return
 }
-; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 8, offset 32
+; nextln: ss1 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: i64 [%rdx], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [ss0], v5: i64 [%rbp]):
+; nextln: x86_push v5
+; nextln: copy_special %rsp -> %rbp
+; nextln: v6 = x86_pop.i64
+; nextln: return v6
+; nextln: }
 
 ; check that we preserve xmm6 and above if we're using them locally
 function %float_callee_saves(f64, f64, f64, f64) windows_fastcall {
@@ -40,38 +68,51 @@ block0(v0: f64, v1: f64, v2: f64, v3: f64):
     [-, %xmm7] v5 = fadd v0, v1
     return
 }
-; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall {
-; nextln: ss0 = explicit_slot 32, offset -80
-; nextln: ss1 = incoming_arg 16, offset -48
-; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]):
-; nextln: x86_push v6
-; nextln: copy_special %rsp -> %rbp
-; nextln: adjust_sp_down_imm 64
-; nextln: v7 = stack_addr.i64 ss0
-; nextln: store notrap aligned v8, v7
-; nextln: store notrap aligned v9, v7+16
-; check: v10 = stack_addr.i64 ss0
-; nextln: v11 = load.f64x2 notrap aligned v10
-; nextln: v12 = load.f64x2 notrap aligned v10+16
-; nextln: adjust_sp_up_imm 64
-; nextln: v13 = x86_pop.i64
-; nextln: return v13, v11, v12
+; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 csr [%rsp], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall {
+; nextln: ss0 = incoming_arg 48, offset -48
+; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rsp], v7: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]):
+; nextln: x86_push v7
+; nextln: copy_special %rsp -> %rbp
+; nextln: adjust_sp_down_imm 32
+; nextln: store notrap aligned v8, v6+16
+; nextln: store notrap aligned v9, v6
+; nextln: v11 = load.f64x2 notrap aligned v6+16
+; nextln: v12 = load.f64x2 notrap aligned v6
+; nextln: adjust_sp_up_imm 32
+; nextln: v10 = x86_pop.i64
+; nextln: return v10, v11, v12
+; nextln: }
 
 function %mixed_int_float(i64, f64, i64, f32) windows_fastcall {
 block0(v0: i64, v1: f64, v2: i64, v3: f32):
     return
 }
-; check: function %mixed_int_float(i64 [%rcx], f64 [%xmm1], i64 [%r8], f32 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; check: function %mixed_int_float(i64 [%rcx], f64 [%xmm1], i64 [%r8], f32 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: f64 [%xmm1], v2: i64 [%r8], v3: f32 [%xmm3], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: v5 = x86_pop.i64
+; nextln: return v5
+; nextln: }
 
 function %ret_val_float(f32, f64, i64, i64) -> f64 windows_fastcall {
 block0(v0: f32, v1: f64, v2: i64, v3: i64):
     return v1
 }
-; check: function %ret_val_float(f32 [%xmm0], f64 [%xmm1], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> f64 [%xmm0], i64 fp [%rbp] windows_fastcall {
+; check: function %ret_val_float(f32 [%xmm0], f64 [%xmm1], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> f64 [%xmm0], i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: f32 [%xmm0], v1: f64 [%xmm1], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: regmove v1, %xmm1 -> %xmm0
+; nextln: v5 = x86_pop.i64
+; nextln: return v1, v5
+; nextln: }
 
 function %internal_stack_arg_function_call(i64) -> i64 windows_fastcall {
-    fn0 = %foo(i64, i64, i64, i64) -> i64
-    fn1 = %foo2(i64, i64, i64, i64) -> i64
+    fn0 = %foo(i64, i64, i64, i64) -> i64 windows_fastcall
+    fn1 = %foo2(i64, i64, i64, i64) -> i64 windows_fastcall
 block0(v0: i64):
     v1 = load.i64 v0+0
     v2 = load.i64 v0+8
@@ -94,3 +135,100 @@ block0(v0: i64):
     store.i64 v9, v0+72
     return v10
 }
+; check: function %internal_stack_a(i64 [%rcx], i64 fp [%rbp], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15]) -> i64 [%rax], i64 fp [%rbp], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15] windows_fastcall {
+; nextln: ss0 = spill_slot 8, offset -56
+; nextln: ss1 = spill_slot 8, offset -64
+; nextln: ss2 = spill_slot 8, offset -72
+; nextln: ss3 = spill_slot 8, offset -80
+; nextln: ss4 = spill_slot 8, offset -88
+; nextln: ss5 = spill_slot 8, offset -96
+; nextln: ss6 = spill_slot 8, offset -104
+; nextln: ss7 = spill_slot 8, offset -112
+; nextln: ss8 = spill_slot 8, offset -120
+; nextln: ss9 = spill_slot 8, offset -128
+; nextln: ss10 = incoming_arg 48, offset -48
+; nextln: ss11 = explicit_slot 32, offset -160
+; nextln: sig0 = (i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9]) -> i64 [%rax] windows_fastcall
+; nextln: sig1 = (i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9]) -> i64 [%rax] windows_fastcall
+; nextln: fn0 = %foo sig0
+; nextln: fn1 = %foo2 sig1
+; check: block0(v11: i64 [%rcx], v52: i64 [%rbp], v53: i64 [%r12], v54: i64 [%r13], v55: i64 [%r14], v56: i64 [%r15]):
+; nextln: x86_push v52
+; nextln: copy_special %rsp -> %rbp
+; nextln: x86_push v53
+; nextln: x86_push v54
+; nextln: x86_push v55
+; nextln: x86_push v56
+; nextln: adjust_sp_down_imm 112
+; nextln: v0 = spill v11
+; nextln: v12 = copy_to_ssa.i64 %rcx
+; nextln: v13 = load.i64 v12
+; nextln: v1 = spill v13
+; nextln: v14 = fill_nop v0
+; nextln: v15 = load.i64 v14+8
+; nextln: v2 = spill v15
+; nextln: v16 = fill_nop v0
+; nextln: v17 = load.i64 v16+16
+; nextln: v3 = spill v17
+; nextln: v18 = fill_nop v0
+; nextln: v19 = load.i64 v18+24
+; nextln: v4 = spill v19
+; nextln: v20 = fill_nop v0
+; nextln: v21 = load.i64 v20+32
+; nextln: v5 = spill v21
+; nextln: v22 = fill_nop v0
+; nextln: v23 = load.i64 v22+40
+; nextln: v6 = spill v23
+; nextln: v24 = fill_nop v0
+; nextln: v25 = load.i64 v24+48
+; nextln: v7 = spill v25
+; nextln: v26 = fill_nop v0
+; nextln: v27 = load.i64 v26+56
+; nextln: v8 = spill v27
+; nextln: v28 = fill_nop v0
+; nextln: v29 = load.i64 v28+64
+; nextln: v9 = spill v29
+; nextln: v30 = fill v1
+; nextln: v31 = fill v2
+; nextln: v32 = fill v3
+; nextln: v33 = fill v4
+; nextln: regmove v30, %r15 -> %rcx
+; nextln: regmove v31, %r14 -> %rdx
+; nextln: regmove v32, %r13 -> %r8
+; nextln: regmove v33, %r12 -> %r9
+; nextln: v10 = call fn0(v30, v31, v32, v33)
+; nextln: v34 = fill v1
+; nextln: v35 = fill v0
+; nextln: store v34, v35+8
+; nextln: v36 = fill v2
+; nextln: v37 = fill_nop v0
+; nextln: store v36, v37+16
+; nextln: v38 = fill v3
+; nextln: v39 = fill_nop v0
+; nextln: store v38, v39+24
+; nextln: v40 = fill v4
+; nextln: v41 = fill_nop v0
+; nextln: store v40, v41+32
+; nextln: v42 = fill v5
+; nextln: v43 = fill_nop v0
+; nextln: store v42, v43+40
+; nextln: v44 = fill v6
+; nextln: v45 = fill_nop v0
+; nextln: store v44, v45+48
+; nextln: v46 = fill v7
+; nextln: v47 = fill_nop v0
+; nextln: store v46, v47+56
+; nextln: v48 = fill v8
+; nextln: v49 = fill_nop v0
+; nextln: store v48, v49+64
+; nextln: v50 = fill v9
+; nextln: v51 = fill_nop v0
+; nextln: store v50, v51+72
+; nextln: adjust_sp_up_imm 112
+; nextln: v61 = x86_pop.i64
+; nextln: v60 = x86_pop.i64
+; nextln: v59 = x86_pop.i64
+; nextln: v58 = x86_pop.i64
+; nextln: v57 = x86_pop.i64
+; nextln: return v10, v57, v58, v59, v60, v61
+; nextln: }
\ No newline at end of file
diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif
index 6997238bfb..4e5d4f18f3 100644
--- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif
+++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif
@@ -3,13 +3,35 @@ set opt_level=speed_and_size
 set is_pic
 target x86_64 haswell
 
-; check the unwind information with a function with no args
-function %no_args() windows_fastcall {
+; check the unwind information with a leaf function with no args
+function %no_args_leaf() windows_fastcall {
 block0:
     return
 }
 ; sameln: version: 1
 ; nextln: flags: 0
+; nextln: prologue size: 4
+; nextln: frame register: 5
+; nextln: frame register offset: 0
+; nextln: unwind codes: 2
+; nextln:
+; nextln: offset: 1
+; nextln: op: PushNonvolatileRegister
+; nextln: info: 5
+; nextln:
+; nextln: offset: 4
+; nextln: op: SetFramePointer
+; nextln: info: 0
+
+; check the unwind information with a non-leaf function with no args
+function %no_args() windows_fastcall {
+    fn0 = %foo()
+block0:
+    call fn0()
+    return
+}
+; sameln: version: 1
+; nextln: flags: 0
 ; nextln: prologue size: 8
 ; nextln: frame register: 5
 ; nextln: frame register offset: 0
@@ -51,7 +73,7 @@ block0:
 ; nextln: offset: 17
 ; nextln: op: LargeStackAlloc
 ; nextln: info: 0
-; nextln: value: 12504 (u16)
+; nextln: value: 12500 (u16)
 
 ; check a function with large-sized stack alloc
 function %large_stack() windows_fastcall {
@@ -77,7 +99,7 @@ block0:
 ; nextln: offset: 17
 ; nextln: op: LargeStackAlloc
 ; nextln: info: 1
-; nextln: value: 524320 (u32)
+; nextln: value: 524288 (u32)
 
 function %fpr_with_function_call(i64, i64) windows_fastcall {
     fn0 = %foo(f64, f64, i64, i64, i64) windows_fastcall;
@@ -113,9 +135,9 @@ block0(v0: i64, v1: i64):
 ;
 ; sameln: version: 1
 ; nextln: flags: 0
-; nextln: prologue size: 25
+; nextln: prologue size: 22
 ; nextln: frame register: 5
-; nextln: frame register offset: 12
+; nextln: frame register offset: 2
 ; nextln: unwind codes: 5
 ; nextln:
 ; nextln: offset: 1
@@ -135,10 +157,10 @@ block0(v0: i64, v1: i64):
 ; nextln: info: 0
 ; nextln: value: 23 (u16)
 ; nextln:
-; nextln: offset: 25
+; nextln: offset: 22
 ; nextln: op: SaveXmm128
 ; nextln: info: 15
-; nextln: value: 3 (u16)
+; nextln: value: 0 (u16)
 
 ; check a function that has CSRs
 function %lots_of_registers(i64, i64) windows_fastcall {
@@ -191,9 +213,9 @@ block0(v0: i64, v1: i64):
 }
 ; sameln: version: 1
 ; nextln: flags: 0
-; nextln: prologue size: 41
+; nextln: prologue size: 35
 ; nextln: frame register: 5
-; nextln: frame register offset: 10
+; nextln: frame register offset: 7
 ; nextln: unwind codes: 13
 ; nextln:
 ; nextln: offset: 1
@@ -234,19 +256,19 @@ block0(v0: i64, v1: i64):
 ; nextln:
 ; nextln: offset: 19
 ; nextln: op: SmallStackAlloc
-; nextln: info: 12
+; nextln: info: 8
 ; nextln:
-; nextln: offset: 31
+; nextln: offset: 24
 ; nextln: op: SaveXmm128
 ; nextln: info: 6
-; nextln: value: 0 (u16)
+; nextln: value: 2 (u16)
 ; nextln:
-; nextln: offset: 36
+; nextln: offset: 29
 ; nextln: op: SaveXmm128
 ; nextln: info: 7
 ; nextln: value: 1 (u16)
 ; nextln:
-; nextln: offset: 41
+; nextln: offset: 35
 ; nextln: op: SaveXmm128
 ; nextln: info: 8
-; nextln: value: 2 (u16)
+; nextln: value: 0 (u16)

From 2cd5ed1880a2c63a833b8943164c7069c506837b Mon Sep 17 00:00:00 2001
From: Peter Huene
Date: Thu, 21 May 2020 15:51:55 -0700
Subject: [PATCH 2/4] Address code review feedback.
---
 cranelift/codegen/src/isa/x86/abi.rs          | 28 ++++---------
 .../codegen/src/isa/x86/unwind/winx64.rs      |  1 +
 2 files changed, 7 insertions(+), 22 deletions(-)

diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs
index bc7eb94f0d..c072594b61 100644
--- a/cranelift/codegen/src/isa/x86/abi.rs
+++ b/cranelift/codegen/src/isa/x86/abi.rs
@@ -537,7 +537,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
 
     // FPRs must be saved with 16-byte alignment; because they follow the GPRs on the stack, align if needed
     if fpsr_stack_size > 0 {
-        csr_stack_size += gpsr_stack_size & isa.pointer_bytes() as u32;
+        csr_stack_size = (csr_stack_size + 15) & !15;
     }
 
     func.create_stack_slot(ir::StackSlotData {
@@ -615,14 +615,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
 
     // Reset the cursor and insert the epilogue
     let mut pos = pos.at_position(CursorPosition::Nowhere);
-    insert_common_epilogues(
-        &mut pos,
-        local_stack_size,
-        reg_type,
-        &csrs,
-        sp_arg_index,
-        isa,
-    );
+    insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, sp_arg_index);
 
     Ok(())
 }
@@ -700,14 +693,7 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
 
     // Reset the cursor and insert the epilogue
     let mut pos = pos.at_position(CursorPosition::Nowhere);
-    insert_common_epilogues(
-        &mut pos,
-        local_stack_size,
-        reg_type,
-        &csrs,
-        sp_arg_index,
-        isa,
-    );
+    insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, sp_arg_index);
 
     Ok(())
 }
@@ -839,7 +825,7 @@ fn insert_common_prologue(
 
         // Offset to where the register is saved relative to RSP, accounting for FPR save alignment
         let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
-            + (stack_size & isa.pointer_bytes() as i64);
+            + (stack_size % types::F64X2.bytes() as i64);
 
         last_fpr_save = Some(pos.ins().store(
             ir::MemFlags::trusted(),
@@ -983,13 +969,12 @@ fn insert_common_epilogues(
     reg_type: ir::types::Type,
     csrs: &RegisterSet,
     sp_arg_index: Option<usize>,
-    isa: &dyn TargetIsa,
 ) {
     while let Some(block) = pos.next_block() {
         pos.goto_last_inst(block);
         if let Some(inst) = pos.current_inst() {
             if pos.func.dfg[inst].opcode().is_return() {
-                insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, sp_arg_index, isa);
+                insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, sp_arg_index);
             }
         }
     }
@@ -1004,7 +989,6 @@ fn insert_common_epilogue(
     reg_type: ir::types::Type,
     csrs: &RegisterSet,
     sp_arg_index: Option<usize>,
-    isa: &dyn TargetIsa,
 ) {
     // Insert the pop of the frame pointer
     let fp_pop = pos.ins().x86_pop(reg_type);
@@ -1041,7 +1025,7 @@ fn insert_common_epilogue(
         for (i, reg) in csrs.iter(FPR).enumerate() {
             // Offset to where the register is saved relative to RSP, accounting for FPR save alignment
             let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
-                + (stack_size & isa.pointer_bytes() as i64);
+                + (stack_size % types::F64X2.bytes() as i64);
 
             let value = pos.ins().load(
                 types::F64X2,
diff --git a/cranelift/codegen/src/isa/x86/unwind/winx64.rs b/cranelift/codegen/src/isa/x86/unwind/winx64.rs
index 4a9af95be9..af7d7b12f4 100644
--- a/cranelift/codegen/src/isa/x86/unwind/winx64.rs
+++ b/cranelift/codegen/src/isa/x86/unwind/winx64.rs
@@ -163,6 +163,7 @@ pub(crate) fn create_unwind_info(
         assert_eq!(fpr_save_count, xmm_save_count);
 
         // Account for alignment space when there's an odd number of GPR pushes
+        // Assumption: an FPR (16 bytes) is twice the size of a GPR (8 bytes), hence the (rounded-up) integer division
         frame_register_offset = fpr_save_count + ((gpr_push_count + 1) / 2);
     }
 

From 54f9cd255f17a2c212967ca6fc1dfe2d7330b13a Mon Sep 17 00:00:00 2001
From: Peter Huene
Date: Thu, 21 May 2020 15:53:47 -0700
Subject: [PATCH 3/4] Re-enable tests that were excluded due to the previous
 limitation.

---
 build.rs | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/build.rs b/build.rs
index 5f9231f3b5..a6e2b7d8bc 100644
--- a/build.rs
+++ b/build.rs
@@ -206,18 +206,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
             ("reference_types", "table_copy_on_imported_tables") => return false,
             ("reference_types", _) => return true,
 
-            ("misc_testsuite", "export_large_signature")
-            | ("spec_testsuite", "call")
-            | ("spec_testsuite", "func")
-            | ("multi_value", "call")
-            | ("multi_value", "func") => {
-                // FIXME These involves functions with very large stack frames that Cranelift currently
-                // cannot compile using the fastcall (Windows) calling convention.
-                // See https://github.com/bytecodealliance/wasmtime/pull/1216.
-                #[cfg(windows)]
-                return true;
-            }
-
             _ => {}
         },
         _ => panic!("unrecognized strategy"),

From ce5f3e153b117e1aee1832c6e6e0a698f030e57f Mon Sep 17 00:00:00 2001
From: Peter Huene
Date: Thu, 21 May 2020 16:46:30 -0700
Subject: [PATCH 4/4] Only update XMM save unwind operation offsets when using
 an FP.

This commit prevents updating the XMM save unwind operation offsets when a
frame pointer is not used, even though currently Cranelift always uses a
frame pointer.

This will prevent incorrect unwind information in the future when we start
omitting frame pointers.
---
 cranelift/codegen/src/isa/x86/unwind/winx64.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/cranelift/codegen/src/isa/x86/unwind/winx64.rs b/cranelift/codegen/src/isa/x86/unwind/winx64.rs
index af7d7b12f4..de35fc6964 100644
--- a/cranelift/codegen/src/isa/x86/unwind/winx64.rs
+++ b/cranelift/codegen/src/isa/x86/unwind/winx64.rs
@@ -136,9 +136,12 @@ pub(crate) fn create_unwind_info(
 
     assert!(found_end);
 
+    // When using a frame register, certain unwind operations, such as XMM saves, are relative to the frame
+    // register minus some offset, forming a "base address". This attempts to calculate the frame register offset
+    // while updating the XMM save offsets to be relative from this "base address" rather than RSP.
     let mut frame_register_offset = 0;
-    if xmm_save_count > 0 {
-        // If there are XMM saves, determine the number of 16-byte slots used for all CSRs (including GPRs)
+    if frame_register.is_some() && xmm_save_count > 0 {
+        // Determine the number of 16-byte slots used for all CSRs (including GPRs)
         // The "frame register offset" will point at the last slot used (i.e. the last saved FPR)
         // Assumption: each FPR is stored at a lower address than the previous one
         let mut last_stack_offset = None;