Fix FPR saving and shadow space allocation for Windows x64.

This commit fixes both how FPR callee-saved registers are saved and how the
shadow space is allocated when laying out the stack for the Windows x64
calling convention.

Importantly, this commit removes the compiler's stack size limit for
Windows x64, which was imposed because FPR saves previously couldn't always be
represented in the unwind information.

The FPR saves are now performed without using stack slots, much like how the
callee-saved GPRs are saved. The total CSR space is given to `layout_stack` so
that it is included in the frame size and to offset the layout of spills and
explicit slots.

The FPR saves are now done via an offset from RSP (after the stack pointer has
been adjusted), and they always follow the GPR saves on the stack. A simpler
calculation can now be made to determine the proper offsets of the FPR saves
for representing the unwind information.

Additionally, the shadow space is no longer treated as an incoming argument,
but as an explicit stack slot that gets laid out at the lowest possible address
in the local frame. This prevents `layout_stack` from putting a spill or explicit
slot in this reserved space. In the future, `layout_stack` should take
advantage of the *caller-provided* shadow space for spills, but this commit does
not attempt to address that.

The shadow space is now omitted from the local frame for leaf functions.

Fixes #1728.
Fixes #1587.
Fixes #1475.
This commit is contained in:
Peter Huene
2020-05-20 00:42:44 -07:00
parent c9e3b71c39
commit 78c3091e84
4 changed files with 380 additions and 263 deletions

View File

@@ -28,22 +28,7 @@ pub(crate) fn create_unwind_info(
let mut prologue_size = 0;
let mut unwind_codes = Vec::new();
let mut found_end = false;
// Have we saved at least one FPR? if so, we might have to check additional constraints.
let mut saved_fpr = false;
// In addition to the min offset for a callee-save, we need to know the offset from the
// frame base to the stack pointer, so that we can record an unwind offset that spans only
// to the end of callee-save space.
let mut static_frame_allocation_size = 0u32;
// For the time being, FPR preservation is split into a stack_addr and later store/load.
// Store the register used for stack store and ensure it is the same register with no
// intervening changes to the frame size.
let mut callee_save_region_reg = None;
// Also record the callee-save region's offset from RSP, because it must be added to FPR
// save offsets to compute an offset from the frame base.
let mut callee_save_offset = None;
let mut xmm_save_count: u8 = 0;
for (offset, inst, size) in func.inst_offsets(entry_block, &isa.encoding_info()) {
// x64 ABI prologues cannot exceed 255 bytes in length
@@ -60,8 +45,6 @@ pub(crate) fn create_unwind_info(
InstructionData::Unary { opcode, arg } => {
match opcode {
Opcode::X86Push => {
static_frame_allocation_size += 8;
unwind_codes.push(UnwindCode::PushRegister {
offset: unwind_offset,
reg: GPR.index_of(func.locations[arg].unwrap_reg()) as u8,
@@ -70,7 +53,6 @@ pub(crate) fn create_unwind_info(
Opcode::AdjustSpDown => {
let stack_size =
stack_size.expect("expected a previous stack size instruction");
static_frame_allocation_size += stack_size;
// This is used when calling a stack check function
// We need to track the assignment to RAX which has the size of the stack
@@ -85,10 +67,6 @@ pub(crate) fn create_unwind_info(
InstructionData::CopySpecial { src, dst, .. } => {
if let Some(frame_register) = frame_register {
if src == (RU::rsp as RegUnit) && dst == frame_register {
// Constructing an rbp-based stack frame, so the static frame
// allocation restarts at 0 from here.
static_frame_allocation_size = 0;
unwind_codes.push(UnwindCode::SetFramePointer {
offset: unwind_offset,
sp_offset: 0,
@@ -113,7 +91,7 @@ pub(crate) fn create_unwind_info(
let imm: i64 = imm.into();
assert!(imm <= core::u32::MAX as i64);
static_frame_allocation_size += imm as u32;
stack_size = Some(imm as u32);
unwind_codes.push(UnwindCode::StackAlloc {
offset: unwind_offset,
@@ -123,52 +101,27 @@ pub(crate) fn create_unwind_info(
_ => {}
}
}
InstructionData::StackLoad {
opcode: Opcode::StackAddr,
stack_slot,
offset: _,
} => {
let result = func.dfg.inst_results(inst).get(0).unwrap();
if let ValueLoc::Reg(frame_reg) = func.locations[*result] {
callee_save_region_reg = Some(frame_reg);
// Figure out the offset in the call frame that `frame_reg` will have.
let frame_size = func
.stack_slots
.layout_info
.expect("func's stack slots have layout info if stack operations exist")
.frame_size;
// Because we're well after the prologue has been constructed, stack slots
// must have been laid out...
let slot_offset = func.stack_slots[stack_slot]
.offset
.expect("callee-save slot has an offset computed");
let frame_offset = frame_size as i32 + slot_offset;
callee_save_offset = Some(frame_offset as u32);
}
}
InstructionData::Store {
opcode: Opcode::Store,
args: [arg1, arg2],
flags: _flags,
offset,
..
} => {
if let (ValueLoc::Reg(ru), ValueLoc::Reg(base_ru)) =
if let (ValueLoc::Reg(src), ValueLoc::Reg(dst)) =
(func.locations[arg1], func.locations[arg2])
{
if Some(base_ru) == callee_save_region_reg {
let offset_int: i32 = offset.into();
assert!(offset_int >= 0, "negative fpr offset would store outside the stack frame, and is almost certainly an error");
let offset_int: u32 = offset_int as u32 + callee_save_offset.expect("FPR presevation requires an FPR save region, which has some stack offset");
if FPR.contains(ru) {
saved_fpr = true;
unwind_codes.push(UnwindCode::SaveXmm {
offset: unwind_offset,
reg: ru as u8,
stack_offset: offset_int,
});
}
// If this is a save of an FPR, record an unwind operation
// Note: the stack_offset here is relative to an adjusted SP
// This will be fixed up later to be based on the frame pointer offset
if dst == (RU::rsp as RegUnit) && FPR.contains(src) {
let offset: i32 = offset.into();
unwind_codes.push(UnwindCode::SaveXmm {
offset: unwind_offset,
reg: src as u8,
stack_offset: offset as u32,
});
xmm_save_count += 1;
}
}
}
@@ -183,41 +136,41 @@ pub(crate) fn create_unwind_info(
assert!(found_end);
if saved_fpr {
if static_frame_allocation_size > 240 && saved_fpr {
warn!("stack frame is too large ({} bytes) to use with Windows x64 SEH when preserving FPRs. \
This is a Cranelift implementation limit, see \
https://github.com/bytecodealliance/wasmtime/issues/1475",
static_frame_allocation_size);
return Err(CodegenError::ImplLimitExceeded);
let mut frame_register_offset = 0;
if xmm_save_count > 0 {
// If there are XMM saves, determine the number of 16-byte slots used for all CSRs (including GPRs)
// The "frame register offset" will point at the last slot used (i.e. the last saved FPR)
// Assumption: each FPR is stored at a lower address than the previous one
let mut last_stack_offset = None;
let mut fpr_save_count: u8 = 0;
let mut gpr_push_count: u8 = 0;
for code in unwind_codes.iter_mut() {
match code {
UnwindCode::SaveXmm { stack_offset, .. } => {
if let Some(last) = last_stack_offset {
assert!(last > *stack_offset);
}
last_stack_offset = Some(*stack_offset);
fpr_save_count += 1;
*stack_offset = (xmm_save_count - fpr_save_count) as u32 * 16;
}
UnwindCode::PushRegister { .. } => {
gpr_push_count += 1;
}
_ => {}
}
}
// Only test static frame size is 16-byte aligned when an FPR is saved to avoid
// panicking when alignment is elided because no FPRs are saved and no child calls are
// made.
assert!(
static_frame_allocation_size % 16 == 0,
"static frame allocation must be a multiple of 16"
);
}
assert_eq!(fpr_save_count, xmm_save_count);
// Hack to avoid panicking unnecessarily. Because Cranelift generates prologues with RBP at
// one end of the call frame, and RSP at the other, required offsets are arbitrarily large.
// Windows x64 SEH only allows this offset be up to 240 bytes, however, meaning large
// frames are inexpressible, and we cannot actually compile the function. In case there are
// no preserved FPRs, we can lie without error and claim the offset to RBP is 0 - nothing
// will actually check it. This, then, avoids panics when compiling functions with large
// call frames.
let reported_frame_offset = if saved_fpr {
(static_frame_allocation_size / 16) as u8
} else {
0
};
// Account for alignment space when there's an odd number of GPR pushes
frame_register_offset = fpr_save_count + ((gpr_push_count + 1) / 2);
}
Ok(Some(UnwindInfo {
flags: 0, // this assumes cranelift functions have no SEH handlers
prologue_size: prologue_size as u8,
frame_register: frame_register.map(|r| GPR.index_of(r) as u8),
frame_register_offset: reported_frame_offset,
frame_register_offset,
unwind_codes,
}))
}
@@ -284,7 +237,7 @@ mod tests {
},
UnwindCode::StackAlloc {
offset: 9,
size: 64 + 32
size: 64
}
]
}
@@ -303,7 +256,7 @@ mod tests {
0x03, // Unwind code count (1 for stack alloc, 1 for save frame reg, 1 for push reg)
0x05, // Frame register + offset (RBP with 0 offset)
0x09, // Prolog offset
0xB2, // Operation 2 (small stack alloc), size = 0xB slots (e.g. (0xB * 8) + 8 = 96 (64 + 32) bytes)
0x72, // Operation 2 (small stack alloc), size = 0x7 slots (e.g. (0x7 * 8) + 8 = 64 bytes)
0x05, // Prolog offset
0x03, // Operation 3 (save frame register), stack pointer offset = 0
0x02, // Prolog offset
@@ -349,7 +302,7 @@ mod tests {
},
UnwindCode::StackAlloc {
offset: 27,
size: 10000 + 32
size: 10000
}
]
}
@@ -369,8 +322,8 @@ mod tests {
0x05, // Frame register + offset (RBP with 0 offset)
0x1B, // Prolog offset
0x01, // Operation 1 (large stack alloc), size is scaled 16-bits (info = 0)
0xE6, // Low size byte
0x04, // High size byte (e.g. 0x04E6 * 8 = 10032 (10000 + 32) bytes)
0xE2, // Low size byte
0x04, // High size byte (e.g. 0x04E2 * 8 = 10000 bytes)
0x05, // Prolog offset
0x03, // Operation 3 (save frame register), stack pointer offset = 0
0x02, // Prolog offset
@@ -414,7 +367,7 @@ mod tests {
},
UnwindCode::StackAlloc {
offset: 27,
size: 1000000 + 32
size: 1000000
}
]
}
@@ -434,10 +387,10 @@ mod tests {
0x05, // Frame register + offset (RBP with 0 offset)
0x1B, // Prolog offset
0x11, // Operation 1 (large stack alloc), size is unscaled 32-bits (info = 1)
0x60, // Byte 1 of size
0x40, // Byte 1 of size
0x42, // Byte 2 of size
0x0F, // Byte 3 of size
0x00, // Byte 4 of size (size is 0xF4260 = 1000032 (1000000 + 32) bytes)
0x00, // Byte 4 of size (size is 0xF4240 = 1000000 bytes)
0x05, // Prolog offset
0x03, // Operation 3 (save frame register), stack pointer offset = 0
0x02, // Prolog offset