Fix FPR saving and shadow space allocation for Windows x64.
This commit fixes both how FPR callee-saved registers are saved and how the shadow space is allocated when laying out the stack for the Windows x64 calling convention. Importantly, this commit removes the compiler's stack-size limitation for Windows x64, which was imposed because FPR saves previously couldn't always be represented in the unwind information.

The FPR saves are now performed without using stack slots, much like how the callee-saved GPRs are saved. The total CSR space is given to `layout_stack` so that it is included in the frame size and offsets the layout of spills and explicit slots. The FPR saves are now done via an RSP offset (post adjustment), and they always follow the GPR saves on the stack. As a result, a simpler calculation determines the proper offsets of the FPR saves when representing them in the unwind information.

Additionally, the shadow space is no longer treated as an incoming argument but as an explicit stack slot laid out at the lowest possible address in the local frame. This prevents `layout_stack` from putting a spill or explicit slot in this reserved space. In the future, `layout_stack` should take advantage of the *caller-provided* shadow space for spills, but this commit does not attempt to address that. The shadow space is now omitted from the local frame for leaf functions.

Fixes #1728.
Fixes #1587.
Fixes #1475.
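The new save/restore addressing can be illustrated with a minimal sketch (not code from this commit; the 16-byte XMM save size and 8-byte word size are assumptions read off the diff below, and `fpr_save_offsets` is a hypothetical helper):

```rust
// Sketch of the new FPR-save addressing, assuming:
// - `stack_size` is the local frame allocation subtracted from RSP,
// - each saved FPR occupies 16 bytes (one XMM register, `types::F64X2`),
// - the word size is 8 bytes (x86-64).
// FPR saves sit at the highest addresses of the local frame, immediately
// below the GPR pushes, with 8 bytes of padding when `stack_size` is an odd
// multiple of 8 so that every save stays 16-byte aligned.
fn fpr_save_offsets(stack_size: i64, num_fprs: i64) -> Vec<i64> {
    const FPR_SAVE_BYTES: i64 = 16; // size of one XMM (F64X2) save
    const WORD_SIZE: i64 = 8; // isa.pointer_bytes() on x86-64
    let align_pad = stack_size & WORD_SIZE; // mirrors `stack_size & isa.pointer_bytes()`
    (0..num_fprs)
        .map(|i| {
            // The i-th saved FPR is stored at RSP + (stack_size - offset).
            let offset = (i + 1) * FPR_SAVE_BYTES + align_pad;
            stack_size - offset
        })
        .collect()
}
```

For example, `fpr_save_offsets(40, 2)` yields RSP offsets 16 and 0, both 16-byte aligned once RSP itself is.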
@@ -6,7 +6,6 @@ use super::settings as isa_settings;
 use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion};
 use crate::cursor::{Cursor, CursorPosition, EncCursor};
 use crate::ir;
-use crate::ir::entities::StackSlot;
 use crate::ir::immediates::Imm64;
 use crate::ir::stackslot::{StackOffset, StackSize};
 use crate::ir::types;
@@ -19,7 +18,6 @@ use crate::regalloc::RegisterSet;
 use crate::result::CodegenResult;
 use crate::stack_layout::layout_stack;
 use alloc::borrow::Cow;
 use alloc::vec::Vec;
 use core::i32;
 use target_lexicon::{PointerWidth, Triple};
 
@@ -44,7 +42,7 @@ static RET_GPRS_WIN_FASTCALL_X64: [RU; 1] = [RU::rax];
 ///
 /// [2] https://blogs.msdn.microsoft.com/oldnewthing/20110302-00/?p=11333 "Although the x64 calling
 /// convention reserves spill space for parameters, you don’t have to use them as such"
-const WIN_SHADOW_STACK_SPACE: i32 = 32;
+const WIN_SHADOW_STACK_SPACE: StackSize = 32;
 
 /// Stack alignment requirement for functions.
 ///
@@ -87,7 +85,7 @@ impl Args {
                 WIN_SHADOW_STACK_SPACE
             } else {
                 0
-            } as u32;
+            };
 
         Self {
             pointer_bytes: bits / 8,
@@ -501,7 +499,7 @@ fn baldrdash_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) ->
 
     let word_size = StackSize::from(isa.pointer_bytes());
     let shadow_store_size = if func.signature.call_conv.extends_windows_fastcall() {
-        WIN_SHADOW_STACK_SPACE as u32
+        WIN_SHADOW_STACK_SPACE
     } else {
         0
     };
@@ -525,50 +523,60 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
         panic!("TODO: windows-fastcall: x86-32 not implemented yet");
     }
 
-    let csrs = callee_saved_regs_used(isa, func);
-
     // The reserved stack area is composed of:
-    //   return address + frame pointer + all callee-saved registers + shadow space
+    //   return address + frame pointer + all callee-saved registers
     //
     // Pushing the return address is an implicit function of the `call`
     // instruction. Each of the others we will then push explicitly. Then we
     // will adjust the stack pointer to make room for the rest of the required
     // space for this frame.
-    let word_size = isa.pointer_bytes() as usize;
-    let num_fprs = csrs.iter(FPR).len();
-    let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32;
+    let csrs = callee_saved_regs_used(isa, func);
+    let gpsr_stack_size = ((csrs.iter(GPR).len() + 2) * isa.pointer_bytes() as usize) as u32;
+    let fpsr_stack_size = (csrs.iter(FPR).len() * types::F64X2.bytes() as usize) as u32;
+    let mut csr_stack_size = gpsr_stack_size + fpsr_stack_size;
 
-    // Only create an FPR stack slot if we're going to save FPRs.
-    let fpr_slot = if num_fprs > 0 {
-        // Create a stack slot for FPRs to be preserved in. This is an `ExplicitSlot` because it
-        // seems to most closely map to it as a `StackSlotKind`: FPR preserve/restore should be
-        // through `stack_load` and `stack_store` (see later comment about issue #1198). Even
-        // though in a certain light FPR preserve/restore is "spilling" an argument, regalloc
-        // implies that `SpillSlot` may be eligible for certain optimizations, and we know with
-        // certainty that this space may not be reused in the function, nor moved around.
-        Some(func.create_stack_slot(ir::StackSlotData {
-            kind: ir::StackSlotKind::ExplicitSlot,
-            size: (num_fprs * types::F64X2.bytes() as usize) as u32,
-            offset: None,
-        }))
-    } else {
-        None
-    };
+    // FPRs must be saved with 16-byte alignment; because they follow the GPRs on the stack, align if needed
+    if fpsr_stack_size > 0 {
+        csr_stack_size += gpsr_stack_size & isa.pointer_bytes() as u32;
+    }
 
-    // TODO: eventually use the 32 bytes (shadow store) as spill slot. This currently doesn't work
-    // since cranelift does not support spill slots before incoming args
     func.create_stack_slot(ir::StackSlotData {
         kind: ir::StackSlotKind::IncomingArg,
-        size: csr_stack_size as u32,
-        offset: Some(-(WIN_SHADOW_STACK_SPACE + csr_stack_size)),
+        size: csr_stack_size,
+        offset: Some(-(csr_stack_size as StackOffset)),
     });
 
     let is_leaf = func.is_leaf();
 
+    // If not a leaf function, allocate an explicit stack slot at the end of the space for the callee's shadow space
+    if !is_leaf {
+        // TODO: eventually use the caller-provided shadow store as spill slot space when laying out the stack
+        func.create_stack_slot(ir::StackSlotData {
+            kind: ir::StackSlotKind::ExplicitSlot,
+            size: WIN_SHADOW_STACK_SPACE,
+            offset: None,
+        });
+    }
 
     let total_stack_size = layout_stack(&mut func.stack_slots, is_leaf, STACK_ALIGNMENT)? as i32;
-    let local_stack_size = i64::from(total_stack_size - csr_stack_size);
+
+    // Subtract the GPR saved register size from the local size because pushes are used for the saves
+    let local_stack_size = i64::from(total_stack_size - gpsr_stack_size as i32);
 
     // Add CSRs to function signature
     let reg_type = isa.pointer_type();
+    let sp_arg_index = if fpsr_stack_size > 0 {
+        let sp_arg = ir::AbiParam::special_reg(
+            reg_type,
+            ir::ArgumentPurpose::CalleeSaved,
+            RU::rsp as RegUnit,
+        );
+        let index = func.signature.params.len();
+        func.signature.params.push(sp_arg);
+        Some(index)
+    } else {
+        None
+    };
     let fp_arg = ir::AbiParam::special_reg(
         reg_type,
         ir::ArgumentPurpose::FramePointer,
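The CSR sizing and alignment logic in the hunk above can be restated as a standalone sketch (assumptions, not commit code: 8-byte GPR pushes plus return address and frame pointer, 16-byte XMM saves; `csr_stack_size` here is a hypothetical free function):

```rust
// Sketch of the CSR space computed by the new fastcall prologue layout.
fn csr_stack_size(num_gprs: u32, num_fprs: u32) -> u32 {
    let gpsr = (num_gprs + 2) * 8; // GPR pushes + return address + frame pointer
    let fpsr = num_fprs * 16; // one 16-byte slot per saved FPR
    let mut size = gpsr + fpsr;
    if fpsr > 0 {
        // `gpsr & 8` is 8 exactly when the pushes end 8 bytes shy of 16-byte
        // alignment, keeping the FPR save region 16-byte aligned.
        size += gpsr & 8;
    }
    size
}
```

For instance, one saved GPR (`gpsr = 24`) with any saved FPRs adds 8 bytes of padding before the FPR region.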
@@ -601,7 +609,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
         local_stack_size,
         reg_type,
         &csrs,
-        fpr_slot.as_ref(),
+        sp_arg_index.is_some(),
         isa,
     );
 
@@ -612,7 +620,8 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
         local_stack_size,
         reg_type,
         &csrs,
-        fpr_slot.as_ref(),
+        sp_arg_index,
+        isa,
     );
 
     Ok(())
@@ -649,14 +658,20 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
 
     // Add CSRs to function signature
     let reg_type = ir::Type::int(u16::from(pointer_width.bits())).unwrap();
-    if isa.pointer_bits() == 32 {
+    // On X86-32 all parameters, including vmctx, are passed on stack, and we need
+    // to extract vmctx from the stack before we can save the frame pointer.
+    let sp_arg_index = if isa.pointer_bits() == 32 {
         let sp_arg = ir::AbiParam::special_reg(
             reg_type,
             ir::ArgumentPurpose::CalleeSaved,
             RU::rsp as RegUnit,
         );
         let index = func.signature.params.len();
         func.signature.params.push(sp_arg);
-    }
+        Some(index)
+    } else {
+        None
+    };
     let fp_arg = ir::AbiParam::special_reg(
         reg_type,
         ir::ArgumentPurpose::FramePointer,
@@ -674,11 +689,25 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
     // Set up the cursor and insert the prologue
     let entry_block = func.layout.entry_block().expect("missing entry block");
     let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block);
-    insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, None, isa);
+    insert_common_prologue(
+        &mut pos,
+        local_stack_size,
+        reg_type,
+        &csrs,
+        sp_arg_index.is_some(),
+        isa,
+    );
 
     // Reset the cursor and insert the epilogue
     let mut pos = pos.at_position(CursorPosition::Nowhere);
-    insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, None);
+    insert_common_epilogues(
+        &mut pos,
+        local_stack_size,
+        reg_type,
+        &csrs,
+        sp_arg_index,
+        isa,
+    );
 
     Ok(())
 }
@@ -690,12 +719,10 @@ fn insert_common_prologue(
     stack_size: i64,
     reg_type: ir::types::Type,
     csrs: &RegisterSet,
-    fpr_slot: Option<&StackSlot>,
+    has_sp_param: bool,
     isa: &dyn TargetIsa,
 ) {
-    // On X86-32 all parameters, including vmctx, are passed on stack, and we need
-    // to extract vmctx from the stack before we can save the frame pointer.
-    let sp = if isa.pointer_bits() == 32 {
+    let sp = if has_sp_param {
         let block = pos.current_block().expect("missing block under cursor");
         let sp = pos.func.dfg.append_block_param(block, reg_type);
         pos.func.locations[sp] = ir::ValueLoc::Reg(RU::rsp as RegUnit);
@@ -799,38 +826,27 @@ fn insert_common_prologue(
         }
     }
 
-    // Now that RSP is prepared for the function, we can use stack slots:
+    // With the stack pointer adjusted, save any callee-saved floating point registers via offset
+    // FPR saves are at the highest addresses of the local frame allocation, immediately following the GPR pushes
     let mut last_fpr_save = None;
-    if let Some(fpr_slot) = fpr_slot {
-        debug_assert!(csrs.iter(FPR).len() != 0);
-
-        // `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base
-        // address. We are well after postopt could run, so load the CSR region base once here,
-        // instead of hoping that the addr/store will be combined later.
-        // See also: https://github.com/bytecodealliance/wasmtime/pull/1198
-        let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0);
+    for (i, reg) in csrs.iter(FPR).enumerate() {
+        // Append param to entry block
+        let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
 
-        // Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at
-        // function entry.
-        pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16);
+        // Since regalloc has already run, we must assign a location.
+        pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
 
-        let mut fpr_offset = 0;
+        // Offset to where the register is saved relative to RSP, accounting for FPR save alignment
+        let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
+            + (stack_size & isa.pointer_bytes() as i64);
 
-        for reg in csrs.iter(FPR) {
-            // Append param to entry Block
-            let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
-
-            // Since regalloc has already run, we must assign a location.
-            pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
-
-            last_fpr_save =
-                Some(
-                    pos.ins()
-                        .store(ir::MemFlags::trusted(), csr_arg, stack_addr, fpr_offset),
-                );
-
-            fpr_offset += types::F64X2.bytes() as i32;
-        }
+        last_fpr_save = Some(pos.ins().store(
+            ir::MemFlags::trusted(),
+            csr_arg,
+            sp.expect("FPR save requires SP param"),
+            (stack_size - offset) as i32,
+        ));
     }
 
     pos.func.prologue_end = Some(
@@ -966,13 +982,14 @@ fn insert_common_epilogues(
     stack_size: i64,
     reg_type: ir::types::Type,
     csrs: &RegisterSet,
-    fpr_slot: Option<&StackSlot>,
+    sp_arg_index: Option<usize>,
+    isa: &dyn TargetIsa,
 ) {
     while let Some(block) = pos.next_block() {
         pos.goto_last_inst(block);
         if let Some(inst) = pos.current_inst() {
             if pos.func.dfg[inst].opcode().is_return() {
-                insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, fpr_slot);
+                insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, sp_arg_index, isa);
             }
         }
     }
@@ -986,56 +1003,9 @@ fn insert_common_epilogue(
     pos: &mut EncCursor,
     reg_type: ir::types::Type,
     csrs: &RegisterSet,
-    fpr_slot: Option<&StackSlot>,
+    sp_arg_index: Option<usize>,
+    isa: &dyn TargetIsa,
 ) {
-    // Even though instructions to restore FPRs are inserted first, we have to append them after
-    // restored GPRs to satisfy parameter order in the return.
-    let mut restored_fpr_values = Vec::new();
-
-    // Restore FPRs before we move RSP and invalidate stack slots.
-    let mut first_fpr_load = None;
-    if let Some(fpr_slot) = fpr_slot {
-        debug_assert!(csrs.iter(FPR).len() != 0);
-
-        // `stack_load` is not directly encodable in x86_64 at the moment, so we'll need a base
-        // address. We are well after postopt could run, so load the CSR region base once here,
-        // instead of hoping that the addr/store will be combined later.
-        //
-        // See also: https://github.com/bytecodealliance/wasmtime/pull/1198
-        let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0);
-
-        first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
-
-        // Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at
-        // function exit.
-        pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16);
-
-        let mut fpr_offset = 0;
-
-        for reg in csrs.iter(FPR) {
-            let value = pos.ins().load(
-                types::F64X2,
-                ir::MemFlags::trusted(),
-                stack_addr,
-                fpr_offset,
-            );
-            fpr_offset += types::F64X2.bytes() as i32;
-
-            // Unlike GPRs before, we don't need to step back after each restoration because FPR
-            // restoration is order-insensitive. Furthermore: we want GPR restoration to begin
-            // after FPR restoration, so that stack adjustments occur after we're done relying on
-            // StackSlot validity.
-
-            pos.func.locations[value] = ir::ValueLoc::Reg(reg);
-            restored_fpr_values.push(value);
-        }
-    }
-
-    let mut sp_adjust_inst = None;
-    if stack_size > 0 {
-        sp_adjust_inst = Some(pos.ins().adjust_sp_up_imm(Imm64::new(stack_size)));
-    }
-
     // Insert the pop of the frame pointer
     let fp_pop = pos.ins().x86_pop(reg_type);
     let fp_pop_inst = pos.prev_inst().unwrap();
@@ -1046,13 +1016,47 @@
     let mut first_csr_pop_inst = None;
     for reg in csrs.iter(GPR) {
         let csr_pop = pos.ins().x86_pop(reg_type);
-        first_csr_pop_inst = Some(pos.prev_inst().unwrap());
+        first_csr_pop_inst = pos.prev_inst();
+        assert!(first_csr_pop_inst.is_some());
         pos.func.locations[csr_pop] = ir::ValueLoc::Reg(reg);
         pos.func.dfg.append_inst_arg(inst, csr_pop);
     }
 
-    for value in restored_fpr_values.into_iter() {
-        pos.func.dfg.append_inst_arg(inst, value);
+    // Insert the adjustment of SP
+    let mut sp_adjust_inst = None;
+    if stack_size > 0 {
+        pos.ins().adjust_sp_up_imm(Imm64::new(stack_size));
+        sp_adjust_inst = pos.prev_inst();
+        assert!(sp_adjust_inst.is_some());
+    }
+
+    let mut first_fpr_load = None;
+    if let Some(index) = sp_arg_index {
+        let sp = pos
+            .func
+            .dfg
+            .block_params(pos.func.layout.entry_block().unwrap())[index];
+
+        // Insert the FPR loads (unlike the GPRs, which are stack pops, these are in-order loads)
+        for (i, reg) in csrs.iter(FPR).enumerate() {
+            // Offset to where the register is saved relative to RSP, accounting for FPR save alignment
+            let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
+                + (stack_size & isa.pointer_bytes() as i64);
+
+            let value = pos.ins().load(
+                types::F64X2,
+                ir::MemFlags::trusted(),
+                sp,
+                (stack_size - offset) as i32,
+            );
+
+            first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
+
+            pos.func.locations[value] = ir::ValueLoc::Reg(reg);
+            pos.func.dfg.append_inst_arg(inst, value);
+        }
+    } else {
+        assert!(csrs.iter(FPR).len() == 0);
     }
 
     pos.func.epilogues_start.push(
 
@@ -28,22 +28,7 @@ pub(crate) fn create_unwind_info(
     let mut prologue_size = 0;
     let mut unwind_codes = Vec::new();
     let mut found_end = false;
-
-    // Have we saved at least one FPR? if so, we might have to check additional constraints.
-    let mut saved_fpr = false;
-
-    // In addition to the min offset for a callee-save, we need to know the offset from the
-    // frame base to the stack pointer, so that we can record an unwind offset that spans only
-    // to the end of callee-save space.
-    let mut static_frame_allocation_size = 0u32;
-
-    // For the time being, FPR preservation is split into a stack_addr and later store/load.
-    // Store the register used for stack store and ensure it is the same register with no
-    // intervening changes to the frame size.
-    let mut callee_save_region_reg = None;
-    // Also record the callee-save region's offset from RSP, because it must be added to FPR
-    // save offsets to compute an offset from the frame base.
-    let mut callee_save_offset = None;
+    let mut xmm_save_count: u8 = 0;
 
     for (offset, inst, size) in func.inst_offsets(entry_block, &isa.encoding_info()) {
         // x64 ABI prologues cannot exceed 255 bytes in length
@@ -60,8 +45,6 @@ pub(crate) fn create_unwind_info(
             InstructionData::Unary { opcode, arg } => {
                 match opcode {
                     Opcode::X86Push => {
-                        static_frame_allocation_size += 8;
-
                         unwind_codes.push(UnwindCode::PushRegister {
                             offset: unwind_offset,
                             reg: GPR.index_of(func.locations[arg].unwrap_reg()) as u8,
@@ -70,7 +53,6 @@ pub(crate) fn create_unwind_info(
                     Opcode::AdjustSpDown => {
                         let stack_size =
                             stack_size.expect("expected a previous stack size instruction");
-                        static_frame_allocation_size += stack_size;
 
                         // This is used when calling a stack check function
                         // We need to track the assignment to RAX which has the size of the stack
@@ -85,10 +67,6 @@ pub(crate) fn create_unwind_info(
                 InstructionData::CopySpecial { src, dst, .. } => {
                     if let Some(frame_register) = frame_register {
                         if src == (RU::rsp as RegUnit) && dst == frame_register {
-                            // Constructing an rbp-based stack frame, so the static frame
-                            // allocation restarts at 0 from here.
-                            static_frame_allocation_size = 0;
-
                             unwind_codes.push(UnwindCode::SetFramePointer {
                                 offset: unwind_offset,
                                 sp_offset: 0,
@@ -113,7 +91,7 @@ pub(crate) fn create_unwind_info(
                         let imm: i64 = imm.into();
                         assert!(imm <= core::u32::MAX as i64);
 
-                        static_frame_allocation_size += imm as u32;
+                        stack_size = Some(imm as u32);
 
                         unwind_codes.push(UnwindCode::StackAlloc {
                             offset: unwind_offset,
@@ -123,52 +101,27 @@ pub(crate) fn create_unwind_info(
                     _ => {}
                 }
             }
-            InstructionData::StackLoad {
-                opcode: Opcode::StackAddr,
-                stack_slot,
-                offset: _,
-            } => {
-                let result = func.dfg.inst_results(inst).get(0).unwrap();
-                if let ValueLoc::Reg(frame_reg) = func.locations[*result] {
-                    callee_save_region_reg = Some(frame_reg);
-
-                    // Figure out the offset in the call frame that `frame_reg` will have.
-                    let frame_size = func
-                        .stack_slots
-                        .layout_info
-                        .expect("func's stack slots have layout info if stack operations exist")
-                        .frame_size;
-                    // Because we're well after the prologue has been constructed, stack slots
-                    // must have been laid out...
-                    let slot_offset = func.stack_slots[stack_slot]
-                        .offset
-                        .expect("callee-save slot has an offset computed");
-                    let frame_offset = frame_size as i32 + slot_offset;
-
-                    callee_save_offset = Some(frame_offset as u32);
-                }
-            }
             InstructionData::Store {
                 opcode: Opcode::Store,
                 args: [arg1, arg2],
                 flags: _flags,
                 offset,
                 ..
             } => {
-                if let (ValueLoc::Reg(ru), ValueLoc::Reg(base_ru)) =
+                if let (ValueLoc::Reg(src), ValueLoc::Reg(dst)) =
                     (func.locations[arg1], func.locations[arg2])
                 {
-                    if Some(base_ru) == callee_save_region_reg {
-                        let offset_int: i32 = offset.into();
-                        assert!(offset_int >= 0, "negative fpr offset would store outside the stack frame, and is almost certainly an error");
-                        let offset_int: u32 = offset_int as u32 + callee_save_offset.expect("FPR preservation requires an FPR save region, which has some stack offset");
-                        if FPR.contains(ru) {
-                            saved_fpr = true;
-                            unwind_codes.push(UnwindCode::SaveXmm {
-                                offset: unwind_offset,
-                                reg: ru as u8,
-                                stack_offset: offset_int,
-                            });
-                        }
+                    // If this is a save of an FPR, record an unwind operation
+                    // Note: the stack_offset here is relative to an adjusted SP
+                    // This will be fixed up later to be based on the frame pointer offset
+                    if dst == (RU::rsp as RegUnit) && FPR.contains(src) {
+                        let offset: i32 = offset.into();
+                        unwind_codes.push(UnwindCode::SaveXmm {
+                            offset: unwind_offset,
+                            reg: src as u8,
+                            stack_offset: offset as u32,
+                        });
+
+                        xmm_save_count += 1;
                     }
                 }
             }
@@ -183,41 +136,41 @@ pub(crate) fn create_unwind_info(
 
     assert!(found_end);
 
-    if saved_fpr {
-        if static_frame_allocation_size > 240 && saved_fpr {
-            warn!("stack frame is too large ({} bytes) to use with Windows x64 SEH when preserving FPRs. \
-                This is a Cranelift implementation limit, see \
-                https://github.com/bytecodealliance/wasmtime/issues/1475",
-                static_frame_allocation_size);
-            return Err(CodegenError::ImplLimitExceeded);
-        }
-        // Only test static frame size is 16-byte aligned when an FPR is saved to avoid
-        // panicking when alignment is elided because no FPRs are saved and no child calls are
-        // made.
-        assert!(
-            static_frame_allocation_size % 16 == 0,
-            "static frame allocation must be a multiple of 16"
-        );
-    }
-
-    // Hack to avoid panicking unnecessarily. Because Cranelift generates prologues with RBP at
-    // one end of the call frame, and RSP at the other, required offsets are arbitrarily large.
-    // Windows x64 SEH only allows this offset be up to 240 bytes, however, meaning large
-    // frames are inexpressible, and we cannot actually compile the function. In case there are
-    // no preserved FPRs, we can lie without error and claim the offset to RBP is 0 - nothing
-    // will actually check it. This, then, avoids panics when compiling functions with large
-    // call frames.
-    let reported_frame_offset = if saved_fpr {
-        (static_frame_allocation_size / 16) as u8
-    } else {
-        0
-    };
+    let mut frame_register_offset = 0;
+    if xmm_save_count > 0 {
+        // If there are XMM saves, determine the number of 16-byte slots used for all CSRs (including GPRs)
+        // The "frame register offset" will point at the last slot used (i.e. the last saved FPR)
+        // Assumption: each FPR is stored at a lower address than the previous one
+        let mut last_stack_offset = None;
+        let mut fpr_save_count: u8 = 0;
+        let mut gpr_push_count: u8 = 0;
+        for code in unwind_codes.iter_mut() {
+            match code {
+                UnwindCode::SaveXmm { stack_offset, .. } => {
+                    if let Some(last) = last_stack_offset {
+                        assert!(last > *stack_offset);
+                    }
+                    last_stack_offset = Some(*stack_offset);
+                    fpr_save_count += 1;
+                    *stack_offset = (xmm_save_count - fpr_save_count) as u32 * 16;
+                }
+                UnwindCode::PushRegister { .. } => {
+                    gpr_push_count += 1;
+                }
+                _ => {}
+            }
+        }
+        assert_eq!(fpr_save_count, xmm_save_count);
+
+        // Account for alignment space when there's an odd number of GPR pushes
+        frame_register_offset = fpr_save_count + ((gpr_push_count + 1) / 2);
+    }
 
     Ok(Some(UnwindInfo {
         flags: 0, // this assumes cranelift functions have no SEH handlers
         prologue_size: prologue_size as u8,
         frame_register: frame_register.map(|r| GPR.index_of(r) as u8),
-        frame_register_offset: reported_frame_offset,
+        frame_register_offset,
         unwind_codes,
     }))
 }
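The `frame_register_offset` computation in the hunk above is compact; a hedged restatement (not commit code) of the slot arithmetic:

```rust
// `frame_register_offset` is measured in 16-byte slots spanning all CSRs.
fn frame_register_offset(fpr_save_count: u8, gpr_push_count: u8) -> u8 {
    // Each XMM save takes one 16-byte slot; 8-byte GPR pushes round up to
    // (gpr_push_count + 1) / 2 slots, accounting for alignment padding when
    // the push count is odd.
    fpr_save_count + ((gpr_push_count + 1) / 2)
}

fn main() {
    // e.g. 2 XMM saves + ceil(3 GPR pushes / 2) slots = 4 slots
    assert_eq!(frame_register_offset(2, 3), 4);
}
```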
@@ -284,7 +237,7 @@ mod tests {
             },
             UnwindCode::StackAlloc {
                 offset: 9,
-                size: 64 + 32
+                size: 64
             }
         ]
     }
@@ -303,7 +256,7 @@ mod tests {
             0x03, // Unwind code count (1 for stack alloc, 1 for save frame reg, 1 for push reg)
             0x05, // Frame register + offset (RBP with 0 offset)
             0x09, // Prolog offset
-            0xB2, // Operation 2 (small stack alloc), size = 0xB slots (e.g. (0xB * 8) + 8 = 96 (64 + 32) bytes)
+            0x72, // Operation 2 (small stack alloc), size = 0x7 slots (e.g. (0x7 * 8) + 8 = 64 bytes)
             0x05, // Prolog offset
             0x03, // Operation 3 (save frame register), stack pointer offset = 0
             0x02, // Prolog offset
@@ -349,7 +302,7 @@ mod tests {
             },
             UnwindCode::StackAlloc {
                 offset: 27,
-                size: 10000 + 32
+                size: 10000
            }
         ]
     }
@@ -369,8 +322,8 @@ mod tests {
             0x05, // Frame register + offset (RBP with 0 offset)
             0x1B, // Prolog offset
             0x01, // Operation 1 (large stack alloc), size is scaled 16-bits (info = 0)
-            0xE6, // Low size byte
-            0x04, // High size byte (e.g. 0x04E6 * 8 = 10032 (10000 + 32) bytes)
+            0xE2, // Low size byte
+            0x04, // High size byte (e.g. 0x04E2 * 8 = 10000 bytes)
             0x05, // Prolog offset
             0x03, // Operation 3 (save frame register), stack pointer offset = 0
             0x02, // Prolog offset
@@ -414,7 +367,7 @@ mod tests {
             },
             UnwindCode::StackAlloc {
                 offset: 27,
-                size: 1000000 + 32
+                size: 1000000
             }
         ]
     }
@@ -434,10 +387,10 @@ mod tests {
             0x05, // Frame register + offset (RBP with 0 offset)
             0x1B, // Prolog offset
             0x11, // Operation 1 (large stack alloc), size is unscaled 32-bits (info = 1)
-            0x60, // Byte 1 of size
+            0x40, // Byte 1 of size
             0x42, // Byte 2 of size
             0x0F, // Byte 3 of size
-            0x00, // Byte 4 of size (size is 0xF4260 = 1000032 (1000000 + 32) bytes)
+            0x00, // Byte 4 of size (size is 0xF4240 = 1000000 bytes)
             0x05, // Prolog offset
             0x03, // Operation 3 (save frame register), stack pointer offset = 0
             0x02, // Prolog offset