Fix FPR saving and shadow space allocation for Windows x64.
This commit fixes both how FPR callee-saved registers are saved and how the shadow space allocation occurs when laying out the stack for the Windows x64 calling convention. Importantly, this commit removes the compiler's stack size limitation for Windows x64 that was imposed because FPR saves previously couldn't always be represented in the unwind information. The FPR saves are now performed without using stack slots, much like how the callee-saved GPRs are saved. The total CSR space is given to `layout_stack` so that it is included in the frame size and offsets the layout of spills and explicit slots. The FPR saves are now done via an RSP offset (post-adjustment), and they always follow the GPR saves on the stack. A simpler calculation can now be made to determine the proper offsets of the FPR saves for representing the unwind information. Additionally, the shadow space is no longer treated as an incoming argument, but as an explicit stack slot that gets laid out at the lowest address possible in the local frame. This prevents `layout_stack` from putting a spill or explicit slot in this reserved space. In the future, `layout_stack` should take advantage of the *caller-provided* shadow space for spills, but this commit does not attempt to address that. The shadow space is now omitted from the local frame for leaf functions. Fixes #1728. Fixes #1587. Fixes #1475.
This commit is contained in:
@@ -6,7 +6,6 @@ use super::settings as isa_settings;
|
||||
use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion};
|
||||
use crate::cursor::{Cursor, CursorPosition, EncCursor};
|
||||
use crate::ir;
|
||||
use crate::ir::entities::StackSlot;
|
||||
use crate::ir::immediates::Imm64;
|
||||
use crate::ir::stackslot::{StackOffset, StackSize};
|
||||
use crate::ir::types;
|
||||
@@ -19,7 +18,6 @@ use crate::regalloc::RegisterSet;
|
||||
use crate::result::CodegenResult;
|
||||
use crate::stack_layout::layout_stack;
|
||||
use alloc::borrow::Cow;
|
||||
use alloc::vec::Vec;
|
||||
use core::i32;
|
||||
use target_lexicon::{PointerWidth, Triple};
|
||||
|
||||
@@ -44,7 +42,7 @@ static RET_GPRS_WIN_FASTCALL_X64: [RU; 1] = [RU::rax];
|
||||
///
|
||||
/// [2] https://blogs.msdn.microsoft.com/oldnewthing/20110302-00/?p=11333 "Although the x64 calling
|
||||
/// convention reserves spill space for parameters, you don’t have to use them as such"
|
||||
const WIN_SHADOW_STACK_SPACE: i32 = 32;
|
||||
const WIN_SHADOW_STACK_SPACE: StackSize = 32;
|
||||
|
||||
/// Stack alignment requirement for functions.
|
||||
///
|
||||
@@ -87,7 +85,7 @@ impl Args {
|
||||
WIN_SHADOW_STACK_SPACE
|
||||
} else {
|
||||
0
|
||||
} as u32;
|
||||
};
|
||||
|
||||
Self {
|
||||
pointer_bytes: bits / 8,
|
||||
@@ -501,7 +499,7 @@ fn baldrdash_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) ->
|
||||
|
||||
let word_size = StackSize::from(isa.pointer_bytes());
|
||||
let shadow_store_size = if func.signature.call_conv.extends_windows_fastcall() {
|
||||
WIN_SHADOW_STACK_SPACE as u32
|
||||
WIN_SHADOW_STACK_SPACE
|
||||
} else {
|
||||
0
|
||||
};
|
||||
@@ -525,50 +523,60 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
|
||||
panic!("TODO: windows-fastcall: x86-32 not implemented yet");
|
||||
}
|
||||
|
||||
let csrs = callee_saved_regs_used(isa, func);
|
||||
|
||||
// The reserved stack area is composed of:
|
||||
// return address + frame pointer + all callee-saved registers + shadow space
|
||||
// return address + frame pointer + all callee-saved registers
|
||||
//
|
||||
// Pushing the return address is an implicit function of the `call`
|
||||
// instruction. Each of the others we will then push explicitly. Then we
|
||||
// will adjust the stack pointer to make room for the rest of the required
|
||||
// space for this frame.
|
||||
let word_size = isa.pointer_bytes() as usize;
|
||||
let num_fprs = csrs.iter(FPR).len();
|
||||
let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32;
|
||||
let csrs = callee_saved_regs_used(isa, func);
|
||||
let gpsr_stack_size = ((csrs.iter(GPR).len() + 2) * isa.pointer_bytes() as usize) as u32;
|
||||
let fpsr_stack_size = (csrs.iter(FPR).len() * types::F64X2.bytes() as usize) as u32;
|
||||
let mut csr_stack_size = gpsr_stack_size + fpsr_stack_size;
|
||||
|
||||
// Only create an FPR stack slot if we're going to save FPRs.
|
||||
let fpr_slot = if num_fprs > 0 {
|
||||
// Create a stack slot for FPRs to be preserved in. This is an `ExplicitSlot` because it
|
||||
// seems to most closely map to it as a `StackSlotKind`: FPR preserve/restore should be
|
||||
// through `stack_load` and `stack_store` (see later comment about issue #1198). Even
|
||||
// though in a certain light FPR preserve/restore is "spilling" an argument, regalloc
|
||||
// implies that `SpillSlot` may be eligible for certain optimizations, and we know with
|
||||
// certainty that this space may not be reused in the function, nor moved around.
|
||||
Some(func.create_stack_slot(ir::StackSlotData {
|
||||
kind: ir::StackSlotKind::ExplicitSlot,
|
||||
size: (num_fprs * types::F64X2.bytes() as usize) as u32,
|
||||
offset: None,
|
||||
}))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
// FPRs must be saved with 16-byte alignment; because they follow the GPRs on the stack, align if needed
|
||||
if fpsr_stack_size > 0 {
|
||||
csr_stack_size += gpsr_stack_size & isa.pointer_bytes() as u32;
|
||||
}
|
||||
|
||||
// TODO: eventually use the 32 bytes (shadow store) as spill slot. This currently doesn't work
|
||||
// since cranelift does not support spill slots before incoming args
|
||||
func.create_stack_slot(ir::StackSlotData {
|
||||
kind: ir::StackSlotKind::IncomingArg,
|
||||
size: csr_stack_size as u32,
|
||||
offset: Some(-(WIN_SHADOW_STACK_SPACE + csr_stack_size)),
|
||||
size: csr_stack_size,
|
||||
offset: Some(-(csr_stack_size as StackOffset)),
|
||||
});
|
||||
|
||||
let is_leaf = func.is_leaf();
|
||||
|
||||
// If not a leaf function, allocate an explicit stack slot at the end of the space for the callee's shadow space
|
||||
if !is_leaf {
|
||||
// TODO: eventually use the caller-provided shadow store as spill slot space when laying out the stack
|
||||
func.create_stack_slot(ir::StackSlotData {
|
||||
kind: ir::StackSlotKind::ExplicitSlot,
|
||||
size: WIN_SHADOW_STACK_SPACE,
|
||||
offset: None,
|
||||
});
|
||||
}
|
||||
|
||||
let total_stack_size = layout_stack(&mut func.stack_slots, is_leaf, STACK_ALIGNMENT)? as i32;
|
||||
let local_stack_size = i64::from(total_stack_size - csr_stack_size);
|
||||
|
||||
// Subtract the GPR saved register size from the local size because pushes are used for the saves
|
||||
let local_stack_size = i64::from(total_stack_size - gpsr_stack_size as i32);
|
||||
|
||||
// Add CSRs to function signature
|
||||
let reg_type = isa.pointer_type();
|
||||
let sp_arg_index = if fpsr_stack_size > 0 {
|
||||
let sp_arg = ir::AbiParam::special_reg(
|
||||
reg_type,
|
||||
ir::ArgumentPurpose::CalleeSaved,
|
||||
RU::rsp as RegUnit,
|
||||
);
|
||||
let index = func.signature.params.len();
|
||||
func.signature.params.push(sp_arg);
|
||||
Some(index)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let fp_arg = ir::AbiParam::special_reg(
|
||||
reg_type,
|
||||
ir::ArgumentPurpose::FramePointer,
|
||||
@@ -601,7 +609,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
|
||||
local_stack_size,
|
||||
reg_type,
|
||||
&csrs,
|
||||
fpr_slot.as_ref(),
|
||||
sp_arg_index.is_some(),
|
||||
isa,
|
||||
);
|
||||
|
||||
@@ -612,7 +620,8 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
|
||||
local_stack_size,
|
||||
reg_type,
|
||||
&csrs,
|
||||
fpr_slot.as_ref(),
|
||||
sp_arg_index,
|
||||
isa,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -649,14 +658,20 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
|
||||
|
||||
// Add CSRs to function signature
|
||||
let reg_type = ir::Type::int(u16::from(pointer_width.bits())).unwrap();
|
||||
if isa.pointer_bits() == 32 {
|
||||
// On X86-32 all parameters, including vmctx, are passed on stack, and we need
|
||||
// to extract vmctx from the stack before we can save the frame pointer.
|
||||
let sp_arg_index = if isa.pointer_bits() == 32 {
|
||||
let sp_arg = ir::AbiParam::special_reg(
|
||||
reg_type,
|
||||
ir::ArgumentPurpose::CalleeSaved,
|
||||
RU::rsp as RegUnit,
|
||||
);
|
||||
let index = func.signature.params.len();
|
||||
func.signature.params.push(sp_arg);
|
||||
}
|
||||
Some(index)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let fp_arg = ir::AbiParam::special_reg(
|
||||
reg_type,
|
||||
ir::ArgumentPurpose::FramePointer,
|
||||
@@ -674,11 +689,25 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
|
||||
// Set up the cursor and insert the prologue
|
||||
let entry_block = func.layout.entry_block().expect("missing entry block");
|
||||
let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block);
|
||||
insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, None, isa);
|
||||
insert_common_prologue(
|
||||
&mut pos,
|
||||
local_stack_size,
|
||||
reg_type,
|
||||
&csrs,
|
||||
sp_arg_index.is_some(),
|
||||
isa,
|
||||
);
|
||||
|
||||
// Reset the cursor and insert the epilogue
|
||||
let mut pos = pos.at_position(CursorPosition::Nowhere);
|
||||
insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, None);
|
||||
insert_common_epilogues(
|
||||
&mut pos,
|
||||
local_stack_size,
|
||||
reg_type,
|
||||
&csrs,
|
||||
sp_arg_index,
|
||||
isa,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -690,12 +719,10 @@ fn insert_common_prologue(
|
||||
stack_size: i64,
|
||||
reg_type: ir::types::Type,
|
||||
csrs: &RegisterSet,
|
||||
fpr_slot: Option<&StackSlot>,
|
||||
has_sp_param: bool,
|
||||
isa: &dyn TargetIsa,
|
||||
) {
|
||||
// On X86-32 all parameters, including vmctx, are passed on stack, and we need
|
||||
// to extract vmctx from the stack before we can save the frame pointer.
|
||||
let sp = if isa.pointer_bits() == 32 {
|
||||
let sp = if has_sp_param {
|
||||
let block = pos.current_block().expect("missing block under cursor");
|
||||
let sp = pos.func.dfg.append_block_param(block, reg_type);
|
||||
pos.func.locations[sp] = ir::ValueLoc::Reg(RU::rsp as RegUnit);
|
||||
@@ -799,38 +826,27 @@ fn insert_common_prologue(
|
||||
}
|
||||
}
|
||||
|
||||
// Now that RSP is prepared for the function, we can use stack slots:
|
||||
// With the stack pointer adjusted, save any callee-saved floating point registers via offset
|
||||
// FPR saves are at the highest addresses of the local frame allocation, immediately following the GPR pushes
|
||||
let mut last_fpr_save = None;
|
||||
if let Some(fpr_slot) = fpr_slot {
|
||||
debug_assert!(csrs.iter(FPR).len() != 0);
|
||||
|
||||
// `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base
|
||||
// address. We are well after postopt could run, so load the CSR region base once here,
|
||||
// instead of hoping that the addr/store will be combined later.
|
||||
// See also: https://github.com/bytecodealliance/wasmtime/pull/1198
|
||||
let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0);
|
||||
for (i, reg) in csrs.iter(FPR).enumerate() {
|
||||
// Append param to entry block
|
||||
let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
|
||||
|
||||
// Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at
|
||||
// function entry.
|
||||
pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16);
|
||||
// Since regalloc has already run, we must assign a location.
|
||||
pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
|
||||
|
||||
let mut fpr_offset = 0;
|
||||
// Offset to where the register is saved relative to RSP, accounting for FPR save alignment
|
||||
let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
|
||||
+ (stack_size & isa.pointer_bytes() as i64);
|
||||
|
||||
for reg in csrs.iter(FPR) {
|
||||
// Append param to entry Block
|
||||
let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
|
||||
|
||||
// Since regalloc has already run, we must assign a location.
|
||||
pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
|
||||
|
||||
last_fpr_save =
|
||||
Some(
|
||||
pos.ins()
|
||||
.store(ir::MemFlags::trusted(), csr_arg, stack_addr, fpr_offset),
|
||||
);
|
||||
|
||||
fpr_offset += types::F64X2.bytes() as i32;
|
||||
}
|
||||
last_fpr_save = Some(pos.ins().store(
|
||||
ir::MemFlags::trusted(),
|
||||
csr_arg,
|
||||
sp.expect("FPR save requires SP param"),
|
||||
(stack_size - offset) as i32,
|
||||
));
|
||||
}
|
||||
|
||||
pos.func.prologue_end = Some(
|
||||
@@ -966,13 +982,14 @@ fn insert_common_epilogues(
|
||||
stack_size: i64,
|
||||
reg_type: ir::types::Type,
|
||||
csrs: &RegisterSet,
|
||||
fpr_slot: Option<&StackSlot>,
|
||||
sp_arg_index: Option<usize>,
|
||||
isa: &dyn TargetIsa,
|
||||
) {
|
||||
while let Some(block) = pos.next_block() {
|
||||
pos.goto_last_inst(block);
|
||||
if let Some(inst) = pos.current_inst() {
|
||||
if pos.func.dfg[inst].opcode().is_return() {
|
||||
insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, fpr_slot);
|
||||
insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, sp_arg_index, isa);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -986,56 +1003,9 @@ fn insert_common_epilogue(
|
||||
pos: &mut EncCursor,
|
||||
reg_type: ir::types::Type,
|
||||
csrs: &RegisterSet,
|
||||
fpr_slot: Option<&StackSlot>,
|
||||
sp_arg_index: Option<usize>,
|
||||
isa: &dyn TargetIsa,
|
||||
) {
|
||||
// Even though instructions to restore FPRs are inserted first, we have to append them after
|
||||
// restored GPRs to satisfy parameter order in the return.
|
||||
let mut restored_fpr_values = Vec::new();
|
||||
|
||||
// Restore FPRs before we move RSP and invalidate stack slots.
|
||||
let mut first_fpr_load = None;
|
||||
if let Some(fpr_slot) = fpr_slot {
|
||||
debug_assert!(csrs.iter(FPR).len() != 0);
|
||||
|
||||
// `stack_load` is not directly encodable in x86_64 at the moment, so we'll need a base
|
||||
// address. We are well after postopt could run, so load the CSR region base once here,
|
||||
// instead of hoping that the addr/store will be combined later.
|
||||
//
|
||||
// See also: https://github.com/bytecodealliance/wasmtime/pull/1198
|
||||
let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0);
|
||||
|
||||
first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
|
||||
|
||||
// Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at
|
||||
// function exit.
|
||||
pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16);
|
||||
|
||||
let mut fpr_offset = 0;
|
||||
|
||||
for reg in csrs.iter(FPR) {
|
||||
let value = pos.ins().load(
|
||||
types::F64X2,
|
||||
ir::MemFlags::trusted(),
|
||||
stack_addr,
|
||||
fpr_offset,
|
||||
);
|
||||
fpr_offset += types::F64X2.bytes() as i32;
|
||||
|
||||
// Unlike GPRs before, we don't need to step back after reach restoration because FPR
|
||||
// restoration is order-insensitive. Furthermore: we want GPR restoration to begin
|
||||
// after FPR restoration, so that stack adjustments occur after we're done relying on
|
||||
// StackSlot validity.
|
||||
|
||||
pos.func.locations[value] = ir::ValueLoc::Reg(reg);
|
||||
restored_fpr_values.push(value);
|
||||
}
|
||||
}
|
||||
|
||||
let mut sp_adjust_inst = None;
|
||||
if stack_size > 0 {
|
||||
sp_adjust_inst = Some(pos.ins().adjust_sp_up_imm(Imm64::new(stack_size)));
|
||||
}
|
||||
|
||||
// Insert the pop of the frame pointer
|
||||
let fp_pop = pos.ins().x86_pop(reg_type);
|
||||
let fp_pop_inst = pos.prev_inst().unwrap();
|
||||
@@ -1046,13 +1016,47 @@ fn insert_common_epilogue(
|
||||
let mut first_csr_pop_inst = None;
|
||||
for reg in csrs.iter(GPR) {
|
||||
let csr_pop = pos.ins().x86_pop(reg_type);
|
||||
first_csr_pop_inst = Some(pos.prev_inst().unwrap());
|
||||
first_csr_pop_inst = pos.prev_inst();
|
||||
assert!(first_csr_pop_inst.is_some());
|
||||
pos.func.locations[csr_pop] = ir::ValueLoc::Reg(reg);
|
||||
pos.func.dfg.append_inst_arg(inst, csr_pop);
|
||||
}
|
||||
|
||||
for value in restored_fpr_values.into_iter() {
|
||||
pos.func.dfg.append_inst_arg(inst, value);
|
||||
// Insert the adjustment of SP
|
||||
let mut sp_adjust_inst = None;
|
||||
if stack_size > 0 {
|
||||
pos.ins().adjust_sp_up_imm(Imm64::new(stack_size));
|
||||
sp_adjust_inst = pos.prev_inst();
|
||||
assert!(sp_adjust_inst.is_some());
|
||||
}
|
||||
|
||||
let mut first_fpr_load = None;
|
||||
if let Some(index) = sp_arg_index {
|
||||
let sp = pos
|
||||
.func
|
||||
.dfg
|
||||
.block_params(pos.func.layout.entry_block().unwrap())[index];
|
||||
|
||||
// Insert the FPR loads (unlike the GPRs, which are stack pops, these are in-order loads)
|
||||
for (i, reg) in csrs.iter(FPR).enumerate() {
|
||||
// Offset to where the register is saved relative to RSP, accounting for FPR save alignment
|
||||
let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
|
||||
+ (stack_size & isa.pointer_bytes() as i64);
|
||||
|
||||
let value = pos.ins().load(
|
||||
types::F64X2,
|
||||
ir::MemFlags::trusted(),
|
||||
sp,
|
||||
(stack_size - offset) as i32,
|
||||
);
|
||||
|
||||
first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
|
||||
|
||||
pos.func.locations[value] = ir::ValueLoc::Reg(reg);
|
||||
pos.func.dfg.append_inst_arg(inst, value);
|
||||
}
|
||||
} else {
|
||||
assert!(csrs.iter(FPR).len() == 0);
|
||||
}
|
||||
|
||||
pos.func.epilogues_start.push(
|
||||
|
||||
Reference in New Issue
Block a user