diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs
index 85ecb0c2a9..bc7eb94f0d 100644
--- a/cranelift/codegen/src/isa/x86/abi.rs
+++ b/cranelift/codegen/src/isa/x86/abi.rs
@@ -6,7 +6,6 @@ use super::settings as isa_settings;
 use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion};
 use crate::cursor::{Cursor, CursorPosition, EncCursor};
 use crate::ir;
-use crate::ir::entities::StackSlot;
 use crate::ir::immediates::Imm64;
 use crate::ir::stackslot::{StackOffset, StackSize};
 use crate::ir::types;
@@ -19,7 +18,6 @@ use crate::regalloc::RegisterSet;
 use crate::result::CodegenResult;
 use crate::stack_layout::layout_stack;
 use alloc::borrow::Cow;
-use alloc::vec::Vec;
 use core::i32;
 use target_lexicon::{PointerWidth, Triple};
 
@@ -44,7 +42,7 @@ static RET_GPRS_WIN_FASTCALL_X64: [RU; 1] = [RU::rax];
 ///
 /// [2] https://blogs.msdn.microsoft.com/oldnewthing/20110302-00/?p=11333 "Although the x64 calling
 /// convention reserves spill space for parameters, you don’t have to use them as such"
-const WIN_SHADOW_STACK_SPACE: i32 = 32;
+const WIN_SHADOW_STACK_SPACE: StackSize = 32;
 
 /// Stack alignment requirement for functions.
 ///
@@ -87,7 +85,7 @@ impl Args {
             WIN_SHADOW_STACK_SPACE
         } else {
             0
-        } as u32;
+        };
 
         Self {
             pointer_bytes: bits / 8,
@@ -501,7 +499,7 @@ fn baldrdash_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) ->
     let word_size = StackSize::from(isa.pointer_bytes());
     let shadow_store_size = if func.signature.call_conv.extends_windows_fastcall() {
-        WIN_SHADOW_STACK_SPACE as u32
+        WIN_SHADOW_STACK_SPACE
     } else {
         0
     };
@@ -525,50 +523,60 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
         panic!("TODO: windows-fastcall: x86-32 not implemented yet");
     }
 
-    let csrs = callee_saved_regs_used(isa, func);
-
     // The reserved stack area is composed of:
-    //   return address + frame pointer + all callee-saved registers + shadow space
+    //   return address + frame pointer + all callee-saved registers
     //
     // Pushing the return address is an implicit function of the `call`
     // instruction. Each of the others we will then push explicitly. Then we
     // will adjust the stack pointer to make room for the rest of the required
     // space for this frame.
-    let word_size = isa.pointer_bytes() as usize;
-    let num_fprs = csrs.iter(FPR).len();
-    let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32;
+    let csrs = callee_saved_regs_used(isa, func);
+    let gpsr_stack_size = ((csrs.iter(GPR).len() + 2) * isa.pointer_bytes() as usize) as u32;
+    let fpsr_stack_size = (csrs.iter(FPR).len() * types::F64X2.bytes() as usize) as u32;
+    let mut csr_stack_size = gpsr_stack_size + fpsr_stack_size;
 
-    // Only create an FPR stack slot if we're going to save FPRs.
-    let fpr_slot = if num_fprs > 0 {
-        // Create a stack slot for FPRs to be preserved in. This is an `ExplicitSlot` because it
-        // seems to most closely map to it as a `StackSlotKind`: FPR preserve/restore should be
-        // through `stack_load` and `stack_store` (see later comment about issue #1198). Even
-        // though in a certain light FPR preserve/restore is "spilling" an argument, regalloc
-        // implies that `SpillSlot` may be eligible for certain optimizations, and we know with
-        // certainty that this space may not be reused in the function, nor moved around.
-        Some(func.create_stack_slot(ir::StackSlotData {
-            kind: ir::StackSlotKind::ExplicitSlot,
-            size: (num_fprs * types::F64X2.bytes() as usize) as u32,
-            offset: None,
-        }))
-    } else {
-        None
-    };
+    // FPRs must be saved with 16-byte alignment; because they follow the GPRs on the stack, align if needed
+    if fpsr_stack_size > 0 {
+        csr_stack_size += gpsr_stack_size & isa.pointer_bytes() as u32;
+    }
 
-    // TODO: eventually use the 32 bytes (shadow store) as spill slot. This currently doesn't work
-    // since cranelift does not support spill slots before incoming args
     func.create_stack_slot(ir::StackSlotData {
         kind: ir::StackSlotKind::IncomingArg,
-        size: csr_stack_size as u32,
-        offset: Some(-(WIN_SHADOW_STACK_SPACE + csr_stack_size)),
+        size: csr_stack_size,
+        offset: Some(-(csr_stack_size as StackOffset)),
     });
 
     let is_leaf = func.is_leaf();
+
+    // If not a leaf function, allocate an explicit stack slot at the end of the space for the callee's shadow space
+    if !is_leaf {
+        // TODO: eventually use the caller-provided shadow store as spill slot space when laying out the stack
+        func.create_stack_slot(ir::StackSlotData {
+            kind: ir::StackSlotKind::ExplicitSlot,
+            size: WIN_SHADOW_STACK_SPACE,
+            offset: None,
+        });
+    }
+
     let total_stack_size = layout_stack(&mut func.stack_slots, is_leaf, STACK_ALIGNMENT)? as i32;
-    let local_stack_size = i64::from(total_stack_size - csr_stack_size);
+
+    // Subtract the GPR saved register size from the local size because pushes are used for the saves
+    let local_stack_size = i64::from(total_stack_size - gpsr_stack_size as i32);
 
     // Add CSRs to function signature
     let reg_type = isa.pointer_type();
+    let sp_arg_index = if fpsr_stack_size > 0 {
+        let sp_arg = ir::AbiParam::special_reg(
+            reg_type,
+            ir::ArgumentPurpose::CalleeSaved,
+            RU::rsp as RegUnit,
+        );
+        let index = func.signature.params.len();
+        func.signature.params.push(sp_arg);
+        Some(index)
+    } else {
+        None
+    };
     let fp_arg = ir::AbiParam::special_reg(
         reg_type,
         ir::ArgumentPurpose::FramePointer,
@@ -601,7 +609,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
         local_stack_size,
         reg_type,
         &csrs,
-        fpr_slot.as_ref(),
+        sp_arg_index.is_some(),
        isa,
    );
 
@@ -612,7 +620,8 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
         local_stack_size,
         reg_type,
         &csrs,
-        fpr_slot.as_ref(),
+        sp_arg_index,
+        isa,
     );
 
     Ok(())
@@ -649,14 +658,20 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
 
     // Add CSRs to function signature
     let reg_type = ir::Type::int(u16::from(pointer_width.bits())).unwrap();
-    if isa.pointer_bits() == 32 {
+    // On X86-32 all parameters, including vmctx, are passed on stack, and we need
+    // to extract vmctx from the stack before we can save the frame pointer.
+    let sp_arg_index = if isa.pointer_bits() == 32 {
         let sp_arg = ir::AbiParam::special_reg(
             reg_type,
             ir::ArgumentPurpose::CalleeSaved,
             RU::rsp as RegUnit,
         );
+        let index = func.signature.params.len();
         func.signature.params.push(sp_arg);
-    }
+        Some(index)
+    } else {
+        None
+    };
     let fp_arg = ir::AbiParam::special_reg(
         reg_type,
         ir::ArgumentPurpose::FramePointer,
@@ -674,11 +689,25 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
     // Set up the cursor and insert the prologue
     let entry_block = func.layout.entry_block().expect("missing entry block");
     let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block);
-    insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, None, isa);
+    insert_common_prologue(
+        &mut pos,
+        local_stack_size,
+        reg_type,
+        &csrs,
+        sp_arg_index.is_some(),
+        isa,
+    );
 
     // Reset the cursor and insert the epilogue
     let mut pos = pos.at_position(CursorPosition::Nowhere);
-    insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, None);
+    insert_common_epilogues(
+        &mut pos,
+        local_stack_size,
+        reg_type,
+        &csrs,
+        sp_arg_index,
+        isa,
+    );
 
     Ok(())
 }
@@ -690,12 +719,10 @@ fn insert_common_prologue(
     stack_size: i64,
     reg_type: ir::types::Type,
     csrs: &RegisterSet,
-    fpr_slot: Option<&StackSlot>,
+    has_sp_param: bool,
     isa: &dyn TargetIsa,
 ) {
-    // On X86-32 all parameters, including vmctx, are passed on stack, and we need
-    // to extract vmctx from the stack before we can save the frame pointer.
-    let sp = if isa.pointer_bits() == 32 {
+    let sp = if has_sp_param {
         let block = pos.current_block().expect("missing block under cursor");
         let sp = pos.func.dfg.append_block_param(block, reg_type);
         pos.func.locations[sp] = ir::ValueLoc::Reg(RU::rsp as RegUnit);
@@ -799,38 +826,27 @@ fn insert_common_prologue(
         }
     }
 
-    // Now that RSP is prepared for the function, we can use stack slots:
+    // With the stack pointer adjusted, save any callee-saved floating point registers via offset
+    // FPR saves are at the highest addresses of the local frame allocation, immediately following the GPR pushes
     let mut last_fpr_save = None;
-    if let Some(fpr_slot) = fpr_slot {
-        debug_assert!(csrs.iter(FPR).len() != 0);
-        // `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base
-        // address. We are well after postopt could run, so load the CSR region base once here,
-        // instead of hoping that the addr/store will be combined later.
-        // See also: https://github.com/bytecodealliance/wasmtime/pull/1198
-        let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0);
+    for (i, reg) in csrs.iter(FPR).enumerate() {
+        // Append param to entry block
+        let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
 
-        // Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at
-        // function entry.
-        pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16);
+        // Since regalloc has already run, we must assign a location.
+        pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
 
-        let mut fpr_offset = 0;
+        // Offset to where the register is saved relative to RSP, accounting for FPR save alignment
+        let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
+            + (stack_size & isa.pointer_bytes() as i64);
 
-        for reg in csrs.iter(FPR) {
-            // Append param to entry Block
-            let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
-
-            // Since regalloc has already run, we must assign a location.
-            pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
-
-            last_fpr_save =
-                Some(
-                    pos.ins()
-                        .store(ir::MemFlags::trusted(), csr_arg, stack_addr, fpr_offset),
-                );
-
-            fpr_offset += types::F64X2.bytes() as i32;
-        }
+        last_fpr_save = Some(pos.ins().store(
+            ir::MemFlags::trusted(),
+            csr_arg,
+            sp.expect("FPR save requires SP param"),
+            (stack_size - offset) as i32,
+        ));
     }
 
     pos.func.prologue_end = Some(
@@ -966,13 +982,14 @@ fn insert_common_epilogues(
     stack_size: i64,
     reg_type: ir::types::Type,
     csrs: &RegisterSet,
-    fpr_slot: Option<&StackSlot>,
+    sp_arg_index: Option<usize>,
+    isa: &dyn TargetIsa,
 ) {
     while let Some(block) = pos.next_block() {
         pos.goto_last_inst(block);
         if let Some(inst) = pos.current_inst() {
             if pos.func.dfg[inst].opcode().is_return() {
-                insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, fpr_slot);
+                insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, sp_arg_index, isa);
             }
         }
     }
@@ -986,56 +1003,9 @@ fn insert_common_epilogue(
     pos: &mut EncCursor,
     reg_type: ir::types::Type,
     csrs: &RegisterSet,
-    fpr_slot: Option<&StackSlot>,
+    sp_arg_index: Option<usize>,
+    isa: &dyn TargetIsa,
 ) {
-    // Even though instructions to restore FPRs are inserted first, we have to append them after
-    // restored GPRs to satisfy parameter order in the return.
-    let mut restored_fpr_values = Vec::new();
-
-    // Restore FPRs before we move RSP and invalidate stack slots.
-    let mut first_fpr_load = None;
-    if let Some(fpr_slot) = fpr_slot {
-        debug_assert!(csrs.iter(FPR).len() != 0);
-
-        // `stack_load` is not directly encodable in x86_64 at the moment, so we'll need a base
-        // address. We are well after postopt could run, so load the CSR region base once here,
-        // instead of hoping that the addr/store will be combined later.
-        //
-        // See also: https://github.com/bytecodealliance/wasmtime/pull/1198
-        let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0);
-
-        first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
-
-        // Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at
-        // function exit.
-        pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16);
-
-        let mut fpr_offset = 0;
-
-        for reg in csrs.iter(FPR) {
-            let value = pos.ins().load(
-                types::F64X2,
-                ir::MemFlags::trusted(),
-                stack_addr,
-                fpr_offset,
-            );
-            fpr_offset += types::F64X2.bytes() as i32;
-
-            // Unlike GPRs before, we don't need to step back after reach restoration because FPR
-            // restoration is order-insensitive. Furthermore: we want GPR restoration to begin
-            // after FPR restoration, so that stack adjustments occur after we're done relying on
-            // StackSlot validity.
-            pos.func.locations[value] = ir::ValueLoc::Reg(reg);
-            restored_fpr_values.push(value);
-        }
-    }
-
-    let mut sp_adjust_inst = None;
-    if stack_size > 0 {
-        sp_adjust_inst = Some(pos.ins().adjust_sp_up_imm(Imm64::new(stack_size)));
-    }
-
     // Insert the pop of the frame pointer
     let fp_pop = pos.ins().x86_pop(reg_type);
     let fp_pop_inst = pos.prev_inst().unwrap();
@@ -1046,13 +1016,47 @@ fn insert_common_epilogue(
     let mut first_csr_pop_inst = None;
     for reg in csrs.iter(GPR) {
         let csr_pop = pos.ins().x86_pop(reg_type);
-        first_csr_pop_inst = Some(pos.prev_inst().unwrap());
+        first_csr_pop_inst = pos.prev_inst();
+        assert!(first_csr_pop_inst.is_some());
         pos.func.locations[csr_pop] = ir::ValueLoc::Reg(reg);
         pos.func.dfg.append_inst_arg(inst, csr_pop);
     }
 
-    for value in restored_fpr_values.into_iter() {
-        pos.func.dfg.append_inst_arg(inst, value);
+    // Insert the adjustment of SP
+    let mut sp_adjust_inst = None;
+    if stack_size > 0 {
+        pos.ins().adjust_sp_up_imm(Imm64::new(stack_size));
+        sp_adjust_inst = pos.prev_inst();
+        assert!(sp_adjust_inst.is_some());
+    }
+
+    let mut first_fpr_load = None;
+    if let Some(index) = sp_arg_index {
+        let sp = pos
+            .func
+            .dfg
+            .block_params(pos.func.layout.entry_block().unwrap())[index];
+
+        // Insert the FPR loads (unlike the GPRs, which are stack pops, these are in-order loads)
+        for (i, reg) in csrs.iter(FPR).enumerate() {
+            // Offset to where the register is saved relative to RSP, accounting for FPR save alignment
+            let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
+                + (stack_size & isa.pointer_bytes() as i64);
+
+            let value = pos.ins().load(
+                types::F64X2,
+                ir::MemFlags::trusted(),
+                sp,
+                (stack_size - offset) as i32,
+            );
+
+            first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
+
+            pos.func.locations[value] = ir::ValueLoc::Reg(reg);
+            pos.func.dfg.append_inst_arg(inst, value);
+        }
+    } else {
+        assert!(csrs.iter(FPR).len() == 0);
     }
 
     pos.func.epilogues_start.push(
diff --git a/cranelift/codegen/src/isa/x86/unwind/winx64.rs b/cranelift/codegen/src/isa/x86/unwind/winx64.rs
index 60aff23f19..4a9af95be9 100644
--- a/cranelift/codegen/src/isa/x86/unwind/winx64.rs
+++ b/cranelift/codegen/src/isa/x86/unwind/winx64.rs
@@ -28,22 +28,7 @@ pub(crate) fn create_unwind_info(
     let mut prologue_size = 0;
     let mut unwind_codes = Vec::new();
     let mut found_end = false;
-
-    // Have we saved at least one FPR? if so, we might have to check additional constraints.
-    let mut saved_fpr = false;
-
-    // In addition to the min offset for a callee-save, we need to know the offset from the
-    // frame base to the stack pointer, so that we can record an unwind offset that spans only
-    // to the end of callee-save space.
-    let mut static_frame_allocation_size = 0u32;
-
-    // For the time being, FPR preservation is split into a stack_addr and later store/load.
-    // Store the register used for stack store and ensure it is the same register with no
-    // intervening changes to the frame size.
-    let mut callee_save_region_reg = None;
-    // Also record the callee-save region's offset from RSP, because it must be added to FPR
-    // save offsets to compute an offset from the frame base.
-    let mut callee_save_offset = None;
+    let mut xmm_save_count: u8 = 0;
 
     for (offset, inst, size) in func.inst_offsets(entry_block, &isa.encoding_info()) {
         // x64 ABI prologues cannot exceed 255 bytes in length
@@ -60,8 +45,6 @@ pub(crate) fn create_unwind_info(
             InstructionData::Unary { opcode, arg } => {
                 match opcode {
                     Opcode::X86Push => {
-                        static_frame_allocation_size += 8;
-
                         unwind_codes.push(UnwindCode::PushRegister {
                             offset: unwind_offset,
                             reg: GPR.index_of(func.locations[arg].unwrap_reg()) as u8,
@@ -70,7 +53,6 @@ pub(crate) fn create_unwind_info(
                     Opcode::AdjustSpDown => {
                         let stack_size =
                             stack_size.expect("expected a previous stack size instruction");
-                        static_frame_allocation_size += stack_size;
 
                         // This is used when calling a stack check function
                         // We need to track the assignment to RAX which has the size of the stack
@@ -85,10 +67,6 @@ pub(crate) fn create_unwind_info(
             InstructionData::CopySpecial { src, dst, .. } => {
                 if let Some(frame_register) = frame_register {
                     if src == (RU::rsp as RegUnit) && dst == frame_register {
-                        // Constructing an rbp-based stack frame, so the static frame
-                        // allocation restarts at 0 from here.
-                        static_frame_allocation_size = 0;
-
                         unwind_codes.push(UnwindCode::SetFramePointer {
                             offset: unwind_offset,
                             sp_offset: 0,
@@ -113,7 +91,7 @@ pub(crate) fn create_unwind_info(
                         let imm: i64 = imm.into();
                         assert!(imm <= core::u32::MAX as i64);
 
-                        static_frame_allocation_size += imm as u32;
+                        stack_size = Some(imm as u32);
 
                         unwind_codes.push(UnwindCode::StackAlloc {
                             offset: unwind_offset,
@@ -123,52 +101,27 @@ pub(crate) fn create_unwind_info(
                     _ => {}
                 }
             }
-            InstructionData::StackLoad {
-                opcode: Opcode::StackAddr,
-                stack_slot,
-                offset: _,
-            } => {
-                let result = func.dfg.inst_results(inst).get(0).unwrap();
-                if let ValueLoc::Reg(frame_reg) = func.locations[*result] {
-                    callee_save_region_reg = Some(frame_reg);
-
-                    // Figure out the offset in the call frame that `frame_reg` will have.
-                    let frame_size = func
-                        .stack_slots
-                        .layout_info
-                        .expect("func's stack slots have layout info if stack operations exist")
-                        .frame_size;
-                    // Because we're well after the prologue has been constructed, stack slots
-                    // must have been laid out...
-                    let slot_offset = func.stack_slots[stack_slot]
-                        .offset
-                        .expect("callee-save slot has an offset computed");
-                    let frame_offset = frame_size as i32 + slot_offset;
-
-                    callee_save_offset = Some(frame_offset as u32);
-                }
-            }
             InstructionData::Store {
                 opcode: Opcode::Store,
                 args: [arg1, arg2],
-                flags: _flags,
                 offset,
+                ..
             } => {
-                if let (ValueLoc::Reg(ru), ValueLoc::Reg(base_ru)) =
+                if let (ValueLoc::Reg(src), ValueLoc::Reg(dst)) =
                     (func.locations[arg1], func.locations[arg2])
                 {
-                    if Some(base_ru) == callee_save_region_reg {
-                        let offset_int: i32 = offset.into();
-                        assert!(offset_int >= 0, "negative fpr offset would store outside the stack frame, and is almost certainly an error");
-                        let offset_int: u32 = offset_int as u32 + callee_save_offset.expect("FPR presevation requires an FPR save region, which has some stack offset");
-                        if FPR.contains(ru) {
-                            saved_fpr = true;
-                            unwind_codes.push(UnwindCode::SaveXmm {
-                                offset: unwind_offset,
-                                reg: ru as u8,
-                                stack_offset: offset_int,
-                            });
-                        }
+                    // If this is a save of an FPR, record an unwind operation
+                    // Note: the stack_offset here is relative to an adjusted SP
+                    // This will be fixed up later to be based on the frame pointer offset
+                    if dst == (RU::rsp as RegUnit) && FPR.contains(src) {
+                        let offset: i32 = offset.into();
+                        unwind_codes.push(UnwindCode::SaveXmm {
+                            offset: unwind_offset,
+                            reg: src as u8,
+                            stack_offset: offset as u32,
+                        });
+
+                        xmm_save_count += 1;
                     }
                 }
             }
@@ -183,41 +136,41 @@ pub(crate) fn create_unwind_info(
 
     assert!(found_end);
 
-    if saved_fpr {
-        if static_frame_allocation_size > 240 && saved_fpr {
-            warn!("stack frame is too large ({} bytes) to use with Windows x64 SEH when preserving FPRs. \
-                This is a Cranelift implementation limit, see \
-                https://github.com/bytecodealliance/wasmtime/issues/1475",
-                static_frame_allocation_size);
-            return Err(CodegenError::ImplLimitExceeded);
+    let mut frame_register_offset = 0;
+    if xmm_save_count > 0 {
+        // If there are XMM saves, determine the number of 16-byte slots used for all CSRs (including GPRs)
+        // The "frame register offset" will point at the last slot used (i.e. the last saved FPR)
+        // Assumption: each FPR is stored at a lower address than the previous one
+        let mut last_stack_offset = None;
+        let mut fpr_save_count: u8 = 0;
+        let mut gpr_push_count: u8 = 0;
+        for code in unwind_codes.iter_mut() {
+            match code {
+                UnwindCode::SaveXmm { stack_offset, .. } => {
+                    if let Some(last) = last_stack_offset {
+                        assert!(last > *stack_offset);
+                    }
+                    last_stack_offset = Some(*stack_offset);
+                    fpr_save_count += 1;
+                    *stack_offset = (xmm_save_count - fpr_save_count) as u32 * 16;
+                }
+                UnwindCode::PushRegister { .. } => {
+                    gpr_push_count += 1;
+                }
+                _ => {}
+            }
         }
-        // Only test static frame size is 16-byte aligned when an FPR is saved to avoid
-        // panicking when alignment is elided because no FPRs are saved and no child calls are
-        // made.
-        assert!(
-            static_frame_allocation_size % 16 == 0,
-            "static frame allocation must be a multiple of 16"
-        );
-    }
+        assert_eq!(fpr_save_count, xmm_save_count);
 
-    // Hack to avoid panicking unnecessarily. Because Cranelift generates prologues with RBP at
-    // one end of the call frame, and RSP at the other, required offsets are arbitrarily large.
-    // Windows x64 SEH only allows this offset be up to 240 bytes, however, meaning large
-    // frames are inexpressible, and we cannot actually compile the function. In case there are
-    // no preserved FPRs, we can lie without error and claim the offset to RBP is 0 - nothing
-    // will actually check it. This, then, avoids panics when compiling functions with large
-    // call frames.
-    let reported_frame_offset = if saved_fpr {
-        (static_frame_allocation_size / 16) as u8
-    } else {
-        0
-    };
+        // Account for alignment space when there's an odd number of GPR pushes
+        frame_register_offset = fpr_save_count + ((gpr_push_count + 1) / 2);
+    }
 
     Ok(Some(UnwindInfo {
         flags: 0, // this assumes cranelift functions have no SEH handlers
         prologue_size: prologue_size as u8,
         frame_register: frame_register.map(|r| GPR.index_of(r) as u8),
-        frame_register_offset: reported_frame_offset,
+        frame_register_offset,
         unwind_codes,
     }))
 }
@@ -284,7 +237,7 @@ mod tests {
                     },
                     UnwindCode::StackAlloc {
                         offset: 9,
-                        size: 64 + 32
+                        size: 64
                     }
                 ]
             }
@@ -303,7 +256,7 @@ mod tests {
             0x03, // Unwind code count (1 for stack alloc, 1 for save frame reg, 1 for push reg)
             0x05, // Frame register + offset (RBP with 0 offset)
             0x09, // Prolog offset
-            0xB2, // Operation 2 (small stack alloc), size = 0xB slots (e.g. (0xB * 8) + 8 = 96 (64 + 32) bytes)
+            0x72, // Operation 2 (small stack alloc), size = 0x7 slots (e.g. (0x7 * 8) + 8 = 64 bytes)
             0x05, // Prolog offset
             0x03, // Operation 3 (save frame register), stack pointer offset = 0
             0x02, // Prolog offset
@@ -349,7 +302,7 @@ mod tests {
                     },
                     UnwindCode::StackAlloc {
                         offset: 27,
-                        size: 10000 + 32
+                        size: 10000
                     }
                 ]
             }
@@ -369,8 +322,8 @@ mod tests {
             0x05, // Frame register + offset (RBP with 0 offset)
             0x1B, // Prolog offset
             0x01, // Operation 1 (large stack alloc), size is scaled 16-bits (info = 0)
-            0xE6, // Low size byte
-            0x04, // High size byte (e.g. 0x04E6 * 8 = 100032 (10000 + 32) bytes)
+            0xE2, // Low size byte
+            0x04, // High size byte (e.g. 0x04E2 * 8 = 10000 bytes)
             0x05, // Prolog offset
             0x03, // Operation 3 (save frame register), stack pointer offset = 0
             0x02, // Prolog offset
@@ -414,7 +367,7 @@ mod tests {
                     },
                     UnwindCode::StackAlloc {
                         offset: 27,
-                        size: 1000000 + 32
+                        size: 1000000
                     }
                 ]
             }
@@ -434,10 +387,10 @@ mod tests {
             0x05, // Frame register + offset (RBP with 0 offset)
             0x1B, // Prolog offset
             0x11, // Operation 1 (large stack alloc), size is unscaled 32-bits (info = 1)
-            0x60, // Byte 1 of size
+            0x40, // Byte 1 of size
             0x42, // Byte 2 of size
             0x0F, // Byte 3 of size
-            0x00, // Byte 4 of size (size is 0xF4260 = 1000032 (1000000 + 32) bytes)
+            0x00, // Byte 4 of size (size is 0xF4240 = 1000000 bytes)
             0x05, // Prolog offset
             0x03, // Operation 3 (save frame register), stack pointer offset = 0
             0x02, // Prolog offset
diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif
index bf77c0baef..8e8d356479 100644
--- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif
+++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif
@@ -8,29 +8,57 @@ function %one_arg(i64) windows_fastcall {
 block0(v0: i64):
     return
 }
-; check: function %one_arg(i64 [%rcx], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
-; nextln: ss0 = incoming_arg 16, offset -48
+; check: function %one_arg(i64 [%rcx], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: i64 [%rbp]):
+; nextln: x86_push v1
+; nextln: copy_special %rsp -> %rbp
+; nextln: v2 = x86_pop.i64
+; nextln: return v2
+; nextln: }
 
 ; check if we still use registers for 4 arguments
 function %four_args(i64, i64, i64, i64) windows_fastcall {
 block0(v0: i64, v1: i64, v2: i64, v3: i64):
     return
 }
-; check: function %four_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; check: function %four_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: i64 [%rdx], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: v5 = x86_pop.i64
+; nextln: return v5
+; nextln: }
 
 ; check if float arguments are passed through XMM registers
 function %four_float_args(f64, f64, f64, f64) windows_fastcall {
 block0(v0: f64, v1: f64, v2: f64, v3: f64):
     return
 }
-; check: function %four_float_args(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; check: function %four_float_args(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: v5 = x86_pop.i64
+; nextln: return v5
+; nextln: }
 
 ; check if we use stack space for > 4 arguments
 function %five_args(i64, i64, i64, i64, i64) windows_fastcall {
 block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64):
     return
 }
-; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 8, offset 32
+; nextln: ss1 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: i64 [%rdx], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [ss0], v5: i64 [%rbp]):
+; nextln: x86_push v5
+; nextln: copy_special %rsp -> %rbp
+; nextln: v6 = x86_pop.i64
+; nextln: return v6
+; nextln: }
 
 ; check that we preserve xmm6 and above if we're using them locally
 function %float_callee_saves(f64, f64, f64, f64) windows_fastcall {
@@ -40,38 +68,51 @@ block0(v0: f64, v1: f64, v2: f64, v3: f64):
     [-, %xmm7] v5 = fadd v0, v1
     return
 }
-; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall {
-; nextln: ss0 = explicit_slot 32, offset -80
-; nextln: ss1 = incoming_arg 16, offset -48
-; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]):
-; nextln: x86_push v6
-; nextln: copy_special %rsp -> %rbp
-; nextln: adjust_sp_down_imm 64
-; nextln: v7 = stack_addr.i64 ss0
-; nextln: store notrap aligned v8, v7
-; nextln: store notrap aligned v9, v7+16
-; check: v10 = stack_addr.i64 ss0
-; nextln: v11 = load.f64x2 notrap aligned v10
-; nextln: v12 = load.f64x2 notrap aligned v10+16
-; nextln: adjust_sp_up_imm 64
-; nextln: v13 = x86_pop.i64
-; nextln: v13, v11, v12
+; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 csr [%rsp], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall {
+; nextln: ss0 = incoming_arg 48, offset -48
+; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rsp], v7: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]):
+; nextln: x86_push v7
+; nextln: copy_special %rsp -> %rbp
+; nextln: adjust_sp_down_imm 32
+; nextln: store notrap aligned v8, v6+16
+; nextln: store notrap aligned v9, v6
+; nextln: v11 = load.f64x2 notrap aligned v6+16
+; nextln: v12 = load.f64x2 notrap aligned v6
+; nextln: adjust_sp_up_imm 32
+; nextln: v10 = x86_pop.i64
+; nextln: return v10, v11, v12
+; nextln: }
 
 function %mixed_int_float(i64, f64, i64, f32) windows_fastcall {
 block0(v0: i64, v1: f64, v2: i64, v3: f32):
     return
 }
-; check: function %mixed_int_float(i64 [%rcx], f64 [%xmm1], i64 [%r8], f32 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; check: function %mixed_int_float(i64 [%rcx], f64 [%xmm1], i64 [%r8], f32 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: f64 [%xmm1], v2: i64 [%r8], v3: f32 [%xmm3], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: v5 = x86_pop.i64
+; nextln: return v5
+; nextln: }
 
 function %ret_val_float(f32, f64, i64, i64) -> f64 windows_fastcall {
 block0(v0: f32, v1: f64, v2: i64, v3: i64):
     return v1
 }
-; check: function %ret_val_float(f32 [%xmm0], f64 [%xmm1], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> f64 [%xmm0], i64 fp [%rbp] windows_fastcall {
+; check: function %ret_val_float(f32 [%xmm0], f64 [%xmm1], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> f64 [%xmm0], i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: f32 [%xmm0], v1: f64 [%xmm1], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: regmove v1, %xmm1 -> %xmm0
+; nextln: v5 = x86_pop.i64
+; nextln: return v1, v5
+; nextln: }
 
 function %internal_stack_arg_function_call(i64) -> i64 windows_fastcall {
-    fn0 = %foo(i64, i64, i64, i64) -> i64
-    fn1 = %foo2(i64, i64, i64, i64) -> i64
+    fn0 = %foo(i64, i64, i64, i64) -> i64 windows_fastcall
+    fn1 = %foo2(i64, i64, i64, i64) -> i64 windows_fastcall
 block0(v0: i64):
     v1 = load.i64 v0+0
     v2 = load.i64 v0+8
@@ -94,3 +135,100 @@ block0(v0: i64):
     store.i64 v9, v0+72
     return v10
 }
+; check: function %internal_stack_a(i64 [%rcx], i64 fp [%rbp], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15]) -> i64 [%rax], i64 fp [%rbp], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15] windows_fastcall {
+; nextln: ss0 = spill_slot 8, offset -56
+; nextln: ss1 = spill_slot 8, offset -64
+; nextln: ss2 = spill_slot 8, offset -72
+; nextln: ss3 = spill_slot 8, offset -80
+; nextln: ss4 = spill_slot 8, offset -88
+; nextln: ss5 = spill_slot 8, offset -96
+; nextln: ss6 = spill_slot 8, offset -104
+; nextln: ss7 = spill_slot 8, offset -112
+; nextln: ss8 = spill_slot 8, offset -120
+; nextln: ss9 = spill_slot 8, offset -128
+; nextln: ss10 = incoming_arg 48, offset -48
+; nextln: ss11 = explicit_slot 32, offset -160
+; nextln: sig0 = (i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9]) -> i64 [%rax] windows_fastcall
+; nextln: sig1 = (i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9]) -> i64 [%rax] windows_fastcall
+; nextln: fn0 = %foo sig0
+; nextln: fn1 = %foo2 sig1
+; check: block0(v11: i64 [%rcx], v52: i64 [%rbp], v53: i64 [%r12], v54: i64 [%r13], v55: i64 [%r14], v56: i64 [%r15]):
+; nextln: x86_push v52
+; nextln: copy_special %rsp -> %rbp
+; nextln: x86_push v53
+; nextln: x86_push v54
+; nextln: x86_push v55
+; nextln: x86_push v56
+; nextln: adjust_sp_down_imm 112
+; nextln: v0 = spill v11
+; nextln: v12 = copy_to_ssa.i64 %rcx
+; nextln: v13 = load.i64 v12
+; nextln: v1 = spill v13
+; nextln: v14 = fill_nop v0
+; nextln: v15 = load.i64 v14+8
+; nextln: v2 = spill v15
+; nextln: v16 = fill_nop v0
+; nextln: v17 = load.i64 v16+16
+; nextln: v3 = spill v17
+; nextln: v18 = fill_nop v0
+; nextln: v19 = load.i64 v18+24
+; nextln: v4 = spill v19
+; nextln: v20 = fill_nop v0
+; nextln: v21 = load.i64 v20+32
+; nextln: v5 = spill v21
+; nextln: v22 = fill_nop v0
+; nextln: v23 = load.i64 v22+40
+; nextln: v6 = spill v23
+; nextln: v24 = fill_nop v0
+; nextln: v25 = load.i64 v24+48
+; nextln: v7 = spill v25
+; nextln: v26 = fill_nop v0
+; nextln: v27 = load.i64 v26+56
+; nextln: v8 = spill v27
+; nextln: v28 = fill_nop v0
+; nextln: v29 = load.i64 v28+64
+; nextln: v9 = spill v29
+; nextln: v30 = fill v1
+; nextln: v31 = fill v2
+; nextln: v32 = fill v3
+; nextln: v33 = fill v4
+; nextln: regmove v30, %r15 -> %rcx
+; nextln: regmove v31, %r14 -> %rdx
+; nextln: regmove v32, %r13 -> %r8
+; nextln: regmove v33, %r12 -> %r9
+; nextln: v10 = call fn0(v30, v31, v32, v33)
+; nextln: v34 = fill v1
+; nextln: v35 = fill v0
+; nextln: store v34, v35+8
+; nextln: v36 = fill v2
+; nextln: v37 = fill_nop v0
+; nextln: store v36, v37+16
+; nextln: v38 = fill v3
+; nextln: v39 = fill_nop v0
+; nextln: store v38, v39+24
+; nextln: v40 = fill v4
+; nextln: v41 = fill_nop v0
+; nextln: store v40, v41+32
+; nextln: v42 = fill v5
+; nextln: v43 = fill_nop v0
+; nextln: store v42, v43+40
+; nextln: v44 = fill v6
+; nextln: v45 = fill_nop v0
+; nextln: store v44, v45+48
+; nextln: v46 = fill v7
+; nextln: v47 = fill_nop v0
+; nextln: store v46, v47+56
+; nextln: v48 = fill v8
+; nextln: v49 = fill_nop v0
+; nextln: store v48, v49+64
+; nextln: v50 = fill v9
+; nextln: v51 = fill_nop v0
+; nextln: store v50, v51+72
+; nextln: adjust_sp_up_imm 112
+; nextln: v61 = x86_pop.i64
+; nextln: v60 = x86_pop.i64
+; nextln: v59 = x86_pop.i64
+; nextln: v58 = x86_pop.i64
+; nextln: v57 = x86_pop.i64
+; nextln: return v10, v57, v58, v59, v60, v61
+; nextln: }
\ No newline at end of file
diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif
index 6997238bfb..4e5d4f18f3 100644
--- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif
+++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif
@@ -3,13 +3,35 @@ set opt_level=speed_and_size
 set is_pic
 target x86_64 haswell
 
-; check the unwind information with a function with no args
-function %no_args() windows_fastcall {
+; check the unwind information with a leaf function with no args
+function %no_args_leaf() windows_fastcall {
 block0:
     return
 }
 ; sameln: version: 1
 ; nextln: flags: 0
+; nextln: prologue size: 4
+; nextln: frame register: 5
+; nextln: frame register offset: 0
+; nextln: unwind codes: 2
+; nextln:
+; nextln: offset: 1
+; nextln: op: PushNonvolatileRegister
+; nextln: info: 5
+; nextln:
+; nextln: offset: 4
+; nextln: op: SetFramePointer
+; nextln: info: 0
+
+; check the unwind information with a non-leaf function with no args
+function %no_args() windows_fastcall {
+    fn0 = %foo()
+block0:
+    call fn0()
+    return
+}
+; sameln: version: 1
+; nextln: flags: 0
 ; nextln: prologue size: 8
 ; nextln: frame register: 5
 ; nextln: frame register offset: 0
@@ -51,7 +73,7 @@ block0:
 ; nextln: offset: 17
 ; nextln: op: LargeStackAlloc
 ; nextln: info: 0
-; nextln: value: 12504 (u16)
+; nextln: value: 12500 (u16)
 
 ; check a function with large-sized stack alloc
 function %large_stack() windows_fastcall {
@@ -77,7 +99,7 @@ block0:
 ; nextln: offset: 17
 ; nextln: op: LargeStackAlloc
 ; nextln: info: 1
-; nextln: value: 524320 (u32)
+; nextln: value: 524288 (u32)
 
 function %fpr_with_function_call(i64, i64) windows_fastcall {
     fn0 = %foo(f64, f64, i64, i64, i64) windows_fastcall;
@@ -113,9 +135,9 @@ block0(v0: i64, v1: i64):
 ;
 ; sameln: version: 1
 ; nextln: flags: 0
-; nextln: prologue size: 25
+; nextln: prologue size: 22
 ; nextln: frame register: 5
-; nextln: frame register offset: 12
+; nextln: frame register offset: 2
 ; nextln: unwind codes: 5
 ; nextln:
 ; nextln: offset: 1
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 5
 ; nextln:
 ; nextln: offset: 4
 ; nextln: op: SetFramePointer
 ; nextln: info: 0
 ; nextln:
 ; nextln: offset: 17
 ; nextln: op: LargeStackAlloc
 ; nextln: info: 0
 ; nextln: value: 23 (u16)
 ; nextln:
-; nextln: offset: 25
+; nextln: offset: 22
 ; nextln: op: SaveXmm128
 ; nextln: info: 15
-; nextln: value: 3 (u16)
+; nextln: value: 0 (u16)
 
 ; check a function that has CSRs
 function %lots_of_registers(i64, i64) windows_fastcall {
@@ -191,9 +213,9 @@ block0(v0: i64, v1: i64):
 }
 ; sameln: version: 1
 ; nextln: flags: 0
-; nextln: prologue size: 41
+; nextln: prologue size: 35
 ; nextln: frame register: 5
-; nextln: frame register offset: 10
+; nextln: frame register offset: 7
 ; nextln: unwind codes: 13
 ; nextln:
 ; nextln: offset: 1
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 5
 ; nextln:
 ; nextln: offset: 4
 ; nextln: op: SetFramePointer
 ; nextln: info: 0
 ; nextln:
 ; nextln: offset: 5
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 3
 ; nextln:
 ; nextln: offset: 6
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 6
 ; nextln:
 ; nextln: offset: 7
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 7
 ; nextln:
 ; nextln: offset: 9
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 12
 ; nextln:
 ; nextln: offset: 11
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 13
 ; nextln:
 ; nextln: offset: 13
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 14
 ; nextln:
 ; nextln: offset: 15
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 15
 ; nextln:
 ; nextln: offset: 19
 ; nextln: op: SmallStackAlloc
-; nextln: info: 12
+; nextln: info: 8
 ; nextln:
-; nextln: offset: 31
+; nextln: offset: 24
 ; nextln: op: SaveXmm128
 ; nextln: info: 6
-; nextln: value: 0 (u16)
+; nextln: value: 2 (u16)
 ; nextln:
-; nextln: offset: 36
+; nextln: offset: 29
 ; nextln: op: SaveXmm128
 ; nextln: info: 7
 ; nextln: value: 1 (u16)
 ; nextln:
-; nextln: offset: 41
+; nextln: offset: 35
 ; nextln: op: SaveXmm128
 ; nextln: info: 8
-; nextln: value: 2 (u16)
+; nextln: value: 0 (u16)
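
For readers following the offset arithmetic in this patch, here is a minimal standalone sketch of the two computations it introduces: the SP-relative offset at which each callee-saved FPR is stored, and the SEH "frame register offset" reported in the unwind info. The helper names and the driver below are illustrative only, not part of the Cranelift API; the constants mirror types::F64X2.bytes() (16) and isa.pointer_bytes() (8) on x86-64.

// SP-relative offset at which the `index`-th callee-saved FPR is stored.
// Restates the arithmetic in `insert_common_prologue` above; not actual Cranelift API.
fn fpr_save_offset(stack_size: i64, index: usize) -> i64 {
    const XMM_BYTES: i64 = 16; // types::F64X2.bytes()
    const WORD_BYTES: i64 = 8; // isa.pointer_bytes() on x86-64
    // Saves occupy the highest addresses of the local allocation; the
    // `stack_size & 8` term skips the alignment word inserted when an odd
    // number of 8-byte GPR pushes precedes the 16-byte-aligned FPR area.
    let from_top = (index as i64 + 1) * XMM_BYTES + (stack_size & WORD_BYTES);
    stack_size - from_top
}

// SEH "frame register offset", measured in 16-byte slots, as computed in
// `create_unwind_info` above.
fn frame_register_offset(fpr_save_count: u8, gpr_push_count: u8) -> u8 {
    // One slot per saved XMM, plus the 8-byte GPR pushes (including RBP)
    // rounded up to whole 16-byte slots.
    fpr_save_count + (gpr_push_count + 1) / 2
}

fn main() {
    // Matches the %float_callee_saves expectation: a 32-byte local frame
    // holding xmm6/xmm7 stores them at [rsp+16] and [rsp+0].
    assert_eq!(fpr_save_offset(32, 0), 16);
    assert_eq!(fpr_save_offset(32, 1), 0);
    // Matches %lots_of_registers: 3 XMM saves and 8 GPR pushes give offset 7.
    assert_eq!(frame_register_offset(3, 8), 7);
    // Matches %fpr_with_function_call: 1 XMM save and the RBP push give offset 2.
    assert_eq!(frame_register_offset(1, 1), 2);
}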