Fix FPR saving and shadow space allocation for Windows x64.
This commit fixes both how FPR callee-saved registers are saved and how the shadow space allocation occurs when laying out the stack for the Windows x64 calling convention. Importantly, this commit removes the compiler limitation on stack size for Windows x64 that was imposed because FPR saves previously couldn't always be represented in the unwind information. The FPR saves are now performed without using stack slots, much like how the callee-saved GPRs are saved. The total CSR space is given to `layout_stack` so that it is included in the frame size and to offset the layout of spills and explicit slots. The FPR saves are now done via an RSP offset (post adjustment) and they always follow the GPR saves on the stack. A simpler calculation can now be made to determine the proper offsets of the FPR saves for representing the unwind information. Additionally, the shadow space is no longer treated as an incoming argument, but as an explicit stack slot that gets laid out at the lowest address possible in the local frame. This prevents `layout_stack` from putting a spill or explicit slot in this reserved space. In the future, `layout_stack` should take advantage of the *caller-provided* shadow space for spills, but this commit does not attempt to address that. The shadow space is now omitted from the local frame for leaf functions. Fixes #1728. Fixes #1587. Fixes #1475.
This commit is contained in:
@@ -28,22 +28,7 @@ pub(crate) fn create_unwind_info(
|
||||
let mut prologue_size = 0;
|
||||
let mut unwind_codes = Vec::new();
|
||||
let mut found_end = false;
|
||||
|
||||
// Have we saved at least one FPR? if so, we might have to check additional constraints.
|
||||
let mut saved_fpr = false;
|
||||
|
||||
// In addition to the min offset for a callee-save, we need to know the offset from the
|
||||
// frame base to the stack pointer, so that we can record an unwind offset that spans only
|
||||
// to the end of callee-save space.
|
||||
let mut static_frame_allocation_size = 0u32;
|
||||
|
||||
// For the time being, FPR preservation is split into a stack_addr and later store/load.
|
||||
// Store the register used for stack store and ensure it is the same register with no
|
||||
// intervening changes to the frame size.
|
||||
let mut callee_save_region_reg = None;
|
||||
// Also record the callee-save region's offset from RSP, because it must be added to FPR
|
||||
// save offsets to compute an offset from the frame base.
|
||||
let mut callee_save_offset = None;
|
||||
let mut xmm_save_count: u8 = 0;
|
||||
|
||||
for (offset, inst, size) in func.inst_offsets(entry_block, &isa.encoding_info()) {
|
||||
// x64 ABI prologues cannot exceed 255 bytes in length
|
||||
@@ -60,8 +45,6 @@ pub(crate) fn create_unwind_info(
|
||||
InstructionData::Unary { opcode, arg } => {
|
||||
match opcode {
|
||||
Opcode::X86Push => {
|
||||
static_frame_allocation_size += 8;
|
||||
|
||||
unwind_codes.push(UnwindCode::PushRegister {
|
||||
offset: unwind_offset,
|
||||
reg: GPR.index_of(func.locations[arg].unwrap_reg()) as u8,
|
||||
@@ -70,7 +53,6 @@ pub(crate) fn create_unwind_info(
|
||||
Opcode::AdjustSpDown => {
|
||||
let stack_size =
|
||||
stack_size.expect("expected a previous stack size instruction");
|
||||
static_frame_allocation_size += stack_size;
|
||||
|
||||
// This is used when calling a stack check function
|
||||
// We need to track the assignment to RAX which has the size of the stack
|
||||
@@ -85,10 +67,6 @@ pub(crate) fn create_unwind_info(
|
||||
InstructionData::CopySpecial { src, dst, .. } => {
|
||||
if let Some(frame_register) = frame_register {
|
||||
if src == (RU::rsp as RegUnit) && dst == frame_register {
|
||||
// Constructing an rbp-based stack frame, so the static frame
|
||||
// allocation restarts at 0 from here.
|
||||
static_frame_allocation_size = 0;
|
||||
|
||||
unwind_codes.push(UnwindCode::SetFramePointer {
|
||||
offset: unwind_offset,
|
||||
sp_offset: 0,
|
||||
@@ -113,7 +91,7 @@ pub(crate) fn create_unwind_info(
|
||||
let imm: i64 = imm.into();
|
||||
assert!(imm <= core::u32::MAX as i64);
|
||||
|
||||
static_frame_allocation_size += imm as u32;
|
||||
stack_size = Some(imm as u32);
|
||||
|
||||
unwind_codes.push(UnwindCode::StackAlloc {
|
||||
offset: unwind_offset,
|
||||
@@ -123,52 +101,27 @@ pub(crate) fn create_unwind_info(
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
InstructionData::StackLoad {
|
||||
opcode: Opcode::StackAddr,
|
||||
stack_slot,
|
||||
offset: _,
|
||||
} => {
|
||||
let result = func.dfg.inst_results(inst).get(0).unwrap();
|
||||
if let ValueLoc::Reg(frame_reg) = func.locations[*result] {
|
||||
callee_save_region_reg = Some(frame_reg);
|
||||
|
||||
// Figure out the offset in the call frame that `frame_reg` will have.
|
||||
let frame_size = func
|
||||
.stack_slots
|
||||
.layout_info
|
||||
.expect("func's stack slots have layout info if stack operations exist")
|
||||
.frame_size;
|
||||
// Because we're well after the prologue has been constructed, stack slots
|
||||
// must have been laid out...
|
||||
let slot_offset = func.stack_slots[stack_slot]
|
||||
.offset
|
||||
.expect("callee-save slot has an offset computed");
|
||||
let frame_offset = frame_size as i32 + slot_offset;
|
||||
|
||||
callee_save_offset = Some(frame_offset as u32);
|
||||
}
|
||||
}
|
||||
InstructionData::Store {
|
||||
opcode: Opcode::Store,
|
||||
args: [arg1, arg2],
|
||||
flags: _flags,
|
||||
offset,
|
||||
..
|
||||
} => {
|
||||
if let (ValueLoc::Reg(ru), ValueLoc::Reg(base_ru)) =
|
||||
if let (ValueLoc::Reg(src), ValueLoc::Reg(dst)) =
|
||||
(func.locations[arg1], func.locations[arg2])
|
||||
{
|
||||
if Some(base_ru) == callee_save_region_reg {
|
||||
let offset_int: i32 = offset.into();
|
||||
assert!(offset_int >= 0, "negative fpr offset would store outside the stack frame, and is almost certainly an error");
|
||||
let offset_int: u32 = offset_int as u32 + callee_save_offset.expect("FPR presevation requires an FPR save region, which has some stack offset");
|
||||
if FPR.contains(ru) {
|
||||
saved_fpr = true;
|
||||
unwind_codes.push(UnwindCode::SaveXmm {
|
||||
offset: unwind_offset,
|
||||
reg: ru as u8,
|
||||
stack_offset: offset_int,
|
||||
});
|
||||
}
|
||||
// If this is a save of an FPR, record an unwind operation
|
||||
// Note: the stack_offset here is relative to an adjusted SP
|
||||
// This will be fixed up later to be based on the frame pointer offset
|
||||
if dst == (RU::rsp as RegUnit) && FPR.contains(src) {
|
||||
let offset: i32 = offset.into();
|
||||
unwind_codes.push(UnwindCode::SaveXmm {
|
||||
offset: unwind_offset,
|
||||
reg: src as u8,
|
||||
stack_offset: offset as u32,
|
||||
});
|
||||
|
||||
xmm_save_count += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -183,41 +136,41 @@ pub(crate) fn create_unwind_info(
|
||||
|
||||
assert!(found_end);
|
||||
|
||||
if saved_fpr {
|
||||
if static_frame_allocation_size > 240 && saved_fpr {
|
||||
warn!("stack frame is too large ({} bytes) to use with Windows x64 SEH when preserving FPRs. \
|
||||
This is a Cranelift implementation limit, see \
|
||||
https://github.com/bytecodealliance/wasmtime/issues/1475",
|
||||
static_frame_allocation_size);
|
||||
return Err(CodegenError::ImplLimitExceeded);
|
||||
let mut frame_register_offset = 0;
|
||||
if xmm_save_count > 0 {
|
||||
// If there are XMM saves, determine the number of 16-byte slots used for all CSRs (including GPRs)
|
||||
// The "frame register offset" will point at the last slot used (i.e. the last saved FPR)
|
||||
// Assumption: each FPR is stored at a lower address than the previous one
|
||||
let mut last_stack_offset = None;
|
||||
let mut fpr_save_count: u8 = 0;
|
||||
let mut gpr_push_count: u8 = 0;
|
||||
for code in unwind_codes.iter_mut() {
|
||||
match code {
|
||||
UnwindCode::SaveXmm { stack_offset, .. } => {
|
||||
if let Some(last) = last_stack_offset {
|
||||
assert!(last > *stack_offset);
|
||||
}
|
||||
last_stack_offset = Some(*stack_offset);
|
||||
fpr_save_count += 1;
|
||||
*stack_offset = (xmm_save_count - fpr_save_count) as u32 * 16;
|
||||
}
|
||||
UnwindCode::PushRegister { .. } => {
|
||||
gpr_push_count += 1;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
// Only test static frame size is 16-byte aligned when an FPR is saved to avoid
|
||||
// panicking when alignment is elided because no FPRs are saved and no child calls are
|
||||
// made.
|
||||
assert!(
|
||||
static_frame_allocation_size % 16 == 0,
|
||||
"static frame allocation must be a multiple of 16"
|
||||
);
|
||||
}
|
||||
assert_eq!(fpr_save_count, xmm_save_count);
|
||||
|
||||
// Hack to avoid panicking unnecessarily. Because Cranelift generates prologues with RBP at
|
||||
// one end of the call frame, and RSP at the other, required offsets are arbitrarily large.
|
||||
// Windows x64 SEH only allows this offset be up to 240 bytes, however, meaning large
|
||||
// frames are inexpressible, and we cannot actually compile the function. In case there are
|
||||
// no preserved FPRs, we can lie without error and claim the offset to RBP is 0 - nothing
|
||||
// will actually check it. This, then, avoids panics when compiling functions with large
|
||||
// call frames.
|
||||
let reported_frame_offset = if saved_fpr {
|
||||
(static_frame_allocation_size / 16) as u8
|
||||
} else {
|
||||
0
|
||||
};
|
||||
// Account for alignment space when there's an odd number of GPR pushes
|
||||
frame_register_offset = fpr_save_count + ((gpr_push_count + 1) / 2);
|
||||
}
|
||||
|
||||
Ok(Some(UnwindInfo {
|
||||
flags: 0, // this assumes cranelift functions have no SEH handlers
|
||||
prologue_size: prologue_size as u8,
|
||||
frame_register: frame_register.map(|r| GPR.index_of(r) as u8),
|
||||
frame_register_offset: reported_frame_offset,
|
||||
frame_register_offset,
|
||||
unwind_codes,
|
||||
}))
|
||||
}
|
||||
@@ -284,7 +237,7 @@ mod tests {
|
||||
},
|
||||
UnwindCode::StackAlloc {
|
||||
offset: 9,
|
||||
size: 64 + 32
|
||||
size: 64
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -303,7 +256,7 @@ mod tests {
|
||||
0x03, // Unwind code count (1 for stack alloc, 1 for save frame reg, 1 for push reg)
|
||||
0x05, // Frame register + offset (RBP with 0 offset)
|
||||
0x09, // Prolog offset
|
||||
0xB2, // Operation 2 (small stack alloc), size = 0xB slots (e.g. (0xB * 8) + 8 = 96 (64 + 32) bytes)
|
||||
0x72, // Operation 2 (small stack alloc), size = 0x7 slots (e.g. (0x7 * 8) + 8 = 64 bytes)
|
||||
0x05, // Prolog offset
|
||||
0x03, // Operation 3 (save frame register), stack pointer offset = 0
|
||||
0x02, // Prolog offset
|
||||
@@ -349,7 +302,7 @@ mod tests {
|
||||
},
|
||||
UnwindCode::StackAlloc {
|
||||
offset: 27,
|
||||
size: 10000 + 32
|
||||
size: 10000
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -369,8 +322,8 @@ mod tests {
|
||||
0x05, // Frame register + offset (RBP with 0 offset)
|
||||
0x1B, // Prolog offset
|
||||
0x01, // Operation 1 (large stack alloc), size is scaled 16-bits (info = 0)
|
||||
0xE6, // Low size byte
|
||||
0x04, // High size byte (e.g. 0x04E6 * 8 = 10032 (10000 + 32) bytes)
|
||||
0xE2, // Low size byte
|
||||
0x04, // High size byte (e.g. 0x04E2 * 8 = 10000 bytes)
|
||||
0x05, // Prolog offset
|
||||
0x03, // Operation 3 (save frame register), stack pointer offset = 0
|
||||
0x02, // Prolog offset
|
||||
@@ -414,7 +367,7 @@ mod tests {
|
||||
},
|
||||
UnwindCode::StackAlloc {
|
||||
offset: 27,
|
||||
size: 1000000 + 32
|
||||
size: 1000000
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -434,10 +387,10 @@ mod tests {
|
||||
0x05, // Frame register + offset (RBP with 0 offset)
|
||||
0x1B, // Prolog offset
|
||||
0x11, // Operation 1 (large stack alloc), size is unscaled 32-bits (info = 1)
|
||||
0x60, // Byte 1 of size
|
||||
0x40, // Byte 1 of size
|
||||
0x42, // Byte 2 of size
|
||||
0x0F, // Byte 3 of size
|
||||
0x00, // Byte 4 of size (size is 0xF4260 = 1000032 (1000000 + 32) bytes)
|
||||
0x00, // Byte 4 of size (size is 0xF4240 = 1000000 bytes)
|
||||
0x05, // Prolog offset
|
||||
0x03, // Operation 3 (save frame register), stack pointer offset = 0
|
||||
0x02, // Prolog offset
|
||||
|
||||
Reference in New Issue
Block a user