Merge pull request #1734 from peterhuene/fix-saved-fprs

Cranelift: Fix FPR saving and shadow space allocation for Windows x64.
Peter Huene
2020-05-22 12:06:37 -07:00
committed by GitHub
5 changed files with 374 additions and 281 deletions

View File

@@ -206,18 +206,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
("reference_types", "table_copy_on_imported_tables") => return false,
("reference_types", _) => return true,
("misc_testsuite", "export_large_signature")
| ("spec_testsuite", "call")
| ("spec_testsuite", "func")
| ("multi_value", "call")
| ("multi_value", "func") => {
// FIXME These involve functions with very large stack frames that Cranelift currently
// cannot compile using the fastcall (Windows) calling convention.
// See https://github.com/bytecodealliance/wasmtime/pull/1216.
#[cfg(windows)]
return true;
}
_ => {}
},
_ => panic!("unrecognized strategy"),

View File

@@ -6,7 +6,6 @@ use super::settings as isa_settings;
use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion};
use crate::cursor::{Cursor, CursorPosition, EncCursor};
use crate::ir;
use crate::ir::entities::StackSlot;
use crate::ir::immediates::Imm64;
use crate::ir::stackslot::{StackOffset, StackSize};
use crate::ir::types;
@@ -19,7 +18,6 @@ use crate::regalloc::RegisterSet;
use crate::result::CodegenResult;
use crate::stack_layout::layout_stack;
use alloc::borrow::Cow;
use alloc::vec::Vec;
use core::i32;
use target_lexicon::{PointerWidth, Triple};
@@ -44,7 +42,7 @@ static RET_GPRS_WIN_FASTCALL_X64: [RU; 1] = [RU::rax];
///
/// [2] https://blogs.msdn.microsoft.com/oldnewthing/20110302-00/?p=11333 "Although the x64 calling
/// convention reserves spill space for parameters, you don't have to use them as such"
const WIN_SHADOW_STACK_SPACE: i32 = 32;
const WIN_SHADOW_STACK_SPACE: StackSize = 32;
/// Stack alignment requirement for functions.
///
@@ -87,7 +85,7 @@ impl Args {
WIN_SHADOW_STACK_SPACE
} else {
0
} as u32;
};
Self {
pointer_bytes: bits / 8,
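
As context for the change above: under fastcall the first four parameters travel in registers, and the 32-byte shadow space is where the caller may spill them, so the first true stack argument begins just past it. A standalone sketch (function name and constants are mine, not from the patch):

const WIN_SHADOW_STACK_SPACE: u32 = 32;

fn stack_arg_offset(arg_index: usize) -> Option<u32> {
    if arg_index < 4 {
        None // passed in rcx/rdx/r8/r9 (or xmm0-xmm3 for floats)
    } else {
        // each stack argument occupies an 8-byte slot past the shadow space
        Some(WIN_SHADOW_STACK_SPACE + (arg_index as u32 - 4) * 8)
    }
}

fn main() {
    // Matches the `%five_args` filetest later in this PR: the fifth
    // argument is assigned `i64 [32]`.
    assert_eq!(stack_arg_offset(4), Some(32));
    assert_eq!(stack_arg_offset(0), None);
}
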
@@ -501,7 +499,7 @@ fn baldrdash_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) ->
let word_size = StackSize::from(isa.pointer_bytes());
let shadow_store_size = if func.signature.call_conv.extends_windows_fastcall() {
WIN_SHADOW_STACK_SPACE as u32
WIN_SHADOW_STACK_SPACE
} else {
0
};
@@ -525,50 +523,60 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
panic!("TODO: windows-fastcall: x86-32 not implemented yet");
}
let csrs = callee_saved_regs_used(isa, func);
// The reserved stack area is composed of:
// return address + frame pointer + all callee-saved registers + shadow space
// return address + frame pointer + all callee-saved registers
//
// Pushing the return address is an implicit function of the `call`
// instruction. Each of the others we will then push explicitly. Then we
// will adjust the stack pointer to make room for the rest of the required
// space for this frame.
let word_size = isa.pointer_bytes() as usize;
let num_fprs = csrs.iter(FPR).len();
let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32;
let csrs = callee_saved_regs_used(isa, func);
let gpsr_stack_size = ((csrs.iter(GPR).len() + 2) * isa.pointer_bytes() as usize) as u32;
let fpsr_stack_size = (csrs.iter(FPR).len() * types::F64X2.bytes() as usize) as u32;
let mut csr_stack_size = gpsr_stack_size + fpsr_stack_size;
// Only create an FPR stack slot if we're going to save FPRs.
let fpr_slot = if num_fprs > 0 {
// Create a stack slot for FPRs to be preserved in. This is an `ExplicitSlot` because it
// seems to most closely map to it as a `StackSlotKind`: FPR preserve/restore should be
// through `stack_load` and `stack_store` (see later comment about issue #1198). Even
// though in a certain light FPR preserve/restore is "spilling" an argument, regalloc
// implies that `SpillSlot` may be eligible for certain optimizations, and we know with
// certainty that this space may not be reused in the function, nor moved around.
Some(func.create_stack_slot(ir::StackSlotData {
kind: ir::StackSlotKind::ExplicitSlot,
size: (num_fprs * types::F64X2.bytes() as usize) as u32,
offset: None,
}))
} else {
None
};
// FPRs must be saved with 16-byte alignment; because they follow the GPRs on the stack, align if needed
if fpsr_stack_size > 0 {
csr_stack_size = (csr_stack_size + 15) & !15;
}
// TODO: eventually use the 32 bytes (shadow store) as spill slot. This currently doesn't work
// since cranelift does not support spill slots before incoming args
func.create_stack_slot(ir::StackSlotData {
kind: ir::StackSlotKind::IncomingArg,
size: csr_stack_size as u32,
offset: Some(-(WIN_SHADOW_STACK_SPACE + csr_stack_size)),
size: csr_stack_size,
offset: Some(-(csr_stack_size as StackOffset)),
});
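
For reference, a standalone sketch (names and constants mine, not code from the patch) of the CSR sizing above, checked against the `%float_callee_saves` filetest later in this PR:

const PTR_BYTES: u32 = 8; // isa.pointer_bytes() on x86-64
const F64X2_BYTES: u32 = 16; // types::F64X2.bytes()

fn csr_stack_size(gpr_csrs: u32, fpr_csrs: u32) -> u32 {
    let gpsr = (gpr_csrs + 2) * PTR_BYTES; // +2 covers the return address and RBP
    let fpsr = fpr_csrs * F64X2_BYTES;
    let mut size = gpsr + fpsr;
    if fpsr > 0 {
        size = (size + 15) & !15; // round up so the FPR area stays 16-byte aligned
    }
    size
}

fn main() {
    // `%float_callee_saves`: no extra GPR CSRs, xmm6 and xmm7 saved:
    // 16 + 32 = 48, already aligned -> `ss0 = incoming_arg 48, offset -48`.
    assert_eq!(csr_stack_size(0, 2), 48);
    // One GPR CSR plus one FPR: 24 + 16 = 40 rounds up to 48.
    assert_eq!(csr_stack_size(1, 1), 48);
}
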
let is_leaf = func.is_leaf();
// If not a leaf function, allocate an explicit stack slot at the end of the space for the callee's shadow space
if !is_leaf {
// TODO: eventually use the caller-provided shadow store as spill slot space when laying out the stack
func.create_stack_slot(ir::StackSlotData {
kind: ir::StackSlotKind::ExplicitSlot,
size: WIN_SHADOW_STACK_SPACE,
offset: None,
});
}
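
Putting the pieces together, the frame this function now builds looks roughly like the following (my reading of the patch, not compiler output):

  higher addresses
  | caller's 32-byte shadow space  (allocated by our caller)
  | return address                 (pushed by `call`)
  | saved RBP                      (x86_push)
  | saved GPR CSRs                 (x86_push, 8 bytes each)
  | alignment padding              (0 or 8 bytes, from the rounding above)
  | saved FPR CSRs                 (16 bytes each, stored relative to RSP)
  | spill slots and locals
  | callee's 32-byte shadow space  (the explicit slot above, non-leaf only)
  v lower addresses                <- RSP after adjust_sp_down_imm
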
let total_stack_size = layout_stack(&mut func.stack_slots, is_leaf, STACK_ALIGNMENT)? as i32;
let local_stack_size = i64::from(total_stack_size - csr_stack_size);
// Subtract the GPR saved register size from the local size because pushes are used for the saves
let local_stack_size = i64::from(total_stack_size - gpsr_stack_size as i32);
// Add CSRs to function signature
let reg_type = isa.pointer_type();
let sp_arg_index = if fpsr_stack_size > 0 {
let sp_arg = ir::AbiParam::special_reg(
reg_type,
ir::ArgumentPurpose::CalleeSaved,
RU::rsp as RegUnit,
);
let index = func.signature.params.len();
func.signature.params.push(sp_arg);
Some(index)
} else {
None
};
let fp_arg = ir::AbiParam::special_reg(
reg_type,
ir::ArgumentPurpose::FramePointer,
@@ -601,19 +609,13 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
local_stack_size,
reg_type,
&csrs,
fpr_slot.as_ref(),
sp_arg_index.is_some(),
isa,
);
// Reset the cursor and insert the epilogue
let mut pos = pos.at_position(CursorPosition::Nowhere);
insert_common_epilogues(
&mut pos,
local_stack_size,
reg_type,
&csrs,
fpr_slot.as_ref(),
);
insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, sp_arg_index);
Ok(())
}
@@ -649,14 +651,20 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
// Add CSRs to function signature
let reg_type = ir::Type::int(u16::from(pointer_width.bits())).unwrap();
if isa.pointer_bits() == 32 {
// On X86-32 all parameters, including vmctx, are passed on stack, and we need
// to extract vmctx from the stack before we can save the frame pointer.
let sp_arg_index = if isa.pointer_bits() == 32 {
let sp_arg = ir::AbiParam::special_reg(
reg_type,
ir::ArgumentPurpose::CalleeSaved,
RU::rsp as RegUnit,
);
let index = func.signature.params.len();
func.signature.params.push(sp_arg);
}
Some(index)
} else {
None
};
let fp_arg = ir::AbiParam::special_reg(
reg_type,
ir::ArgumentPurpose::FramePointer,
@@ -674,11 +682,18 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
// Set up the cursor and insert the prologue
let entry_block = func.layout.entry_block().expect("missing entry block");
let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block);
insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, None, isa);
insert_common_prologue(
&mut pos,
local_stack_size,
reg_type,
&csrs,
sp_arg_index.is_some(),
isa,
);
// Reset the cursor and insert the epilogue
let mut pos = pos.at_position(CursorPosition::Nowhere);
insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, None);
insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, sp_arg_index);
Ok(())
}
@@ -690,12 +705,10 @@ fn insert_common_prologue(
stack_size: i64,
reg_type: ir::types::Type,
csrs: &RegisterSet,
fpr_slot: Option<&StackSlot>,
has_sp_param: bool,
isa: &dyn TargetIsa,
) {
// On X86-32 all parameters, including vmctx, are passed on stack, and we need
// to extract vmctx from the stack before we can save the frame pointer.
let sp = if isa.pointer_bits() == 32 {
let sp = if has_sp_param {
let block = pos.current_block().expect("missing block under cursor");
let sp = pos.func.dfg.append_block_param(block, reg_type);
pos.func.locations[sp] = ir::ValueLoc::Reg(RU::rsp as RegUnit);
@@ -799,38 +812,27 @@ fn insert_common_prologue(
}
}
// Now that RSP is prepared for the function, we can use stack slots:
// With the stack pointer adjusted, save any callee-saved floating point registers via offset
// FPR saves are at the highest addresses of the local frame allocation, immediately following the GPR pushes
let mut last_fpr_save = None;
if let Some(fpr_slot) = fpr_slot {
debug_assert!(csrs.iter(FPR).len() != 0);
// `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base
// address. We are well after postopt could run, so load the CSR region base once here,
// instead of hoping that the addr/store will be combined later.
// See also: https://github.com/bytecodealliance/wasmtime/pull/1198
let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0);
for (i, reg) in csrs.iter(FPR).enumerate() {
// Append param to entry block
let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
// Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at
// function entry.
pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16);
// Since regalloc has already run, we must assign a location.
pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
let mut fpr_offset = 0;
// Offset to where the register is saved relative to RSP, accounting for FPR save alignment
let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
+ (stack_size % types::F64X2.bytes() as i64);
for reg in csrs.iter(FPR) {
// Append param to entry Block
let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
// Since regalloc has already run, we must assign a location.
pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
last_fpr_save =
Some(
pos.ins()
.store(ir::MemFlags::trusted(), csr_arg, stack_addr, fpr_offset),
);
fpr_offset += types::F64X2.bytes() as i32;
}
last_fpr_save = Some(pos.ins().store(
ir::MemFlags::trusted(),
csr_arg,
sp.expect("FPR save requires SP param"),
(stack_size - offset) as i32,
));
}
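
A standalone sketch (function name mine) of the offset arithmetic above, verified against the `%float_callee_saves` filetest later in this PR:

const F64X2_BYTES: i64 = 16; // types::F64X2.bytes()

// The i-th saved FPR is stored `stack_size - offset` bytes above the
// adjusted RSP; the `stack_size % 16` term skips any padding between the
// 8-byte GPR pushes and the 16-byte-aligned FPR save area.
fn fpr_save_rsp_offset(i: i64, stack_size: i64) -> i64 {
    let offset = (i + 1) * F64X2_BYTES + stack_size % F64X2_BYTES;
    stack_size - offset
}

fn main() {
    // With a local stack size of 32, xmm6 is stored at RSP+16 and xmm7 at
    // RSP+0, matching `store ... v8, v6+16` / `store ... v9, v6` below.
    assert_eq!(fpr_save_rsp_offset(0, 32), 16);
    assert_eq!(fpr_save_rsp_offset(1, 32), 0);
}
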
pos.func.prologue_end = Some(
@@ -966,13 +968,13 @@ fn insert_common_epilogues(
stack_size: i64,
reg_type: ir::types::Type,
csrs: &RegisterSet,
fpr_slot: Option<&StackSlot>,
sp_arg_index: Option<usize>,
) {
while let Some(block) = pos.next_block() {
pos.goto_last_inst(block);
if let Some(inst) = pos.current_inst() {
if pos.func.dfg[inst].opcode().is_return() {
insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, fpr_slot);
insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, sp_arg_index);
}
}
}
@@ -986,56 +988,8 @@ fn insert_common_epilogue(
pos: &mut EncCursor,
reg_type: ir::types::Type,
csrs: &RegisterSet,
fpr_slot: Option<&StackSlot>,
sp_arg_index: Option<usize>,
) {
// Even though instructions to restore FPRs are inserted first, we have to append them after
// restored GPRs to satisfy parameter order in the return.
let mut restored_fpr_values = Vec::new();
// Restore FPRs before we move RSP and invalidate stack slots.
let mut first_fpr_load = None;
if let Some(fpr_slot) = fpr_slot {
debug_assert!(csrs.iter(FPR).len() != 0);
// `stack_load` is not directly encodable in x86_64 at the moment, so we'll need a base
// address. We are well after postopt could run, so load the CSR region base once here,
// instead of hoping that the addr/store will be combined later.
//
// See also: https://github.com/bytecodealliance/wasmtime/pull/1198
let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0);
first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
// Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at
// function exit.
pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16);
let mut fpr_offset = 0;
for reg in csrs.iter(FPR) {
let value = pos.ins().load(
types::F64X2,
ir::MemFlags::trusted(),
stack_addr,
fpr_offset,
);
fpr_offset += types::F64X2.bytes() as i32;
// Unlike GPRs before, we don't need to step back after each restoration because FPR
// restoration is order-insensitive. Furthermore: we want GPR restoration to begin
// after FPR restoration, so that stack adjustments occur after we're done relying on
// StackSlot validity.
pos.func.locations[value] = ir::ValueLoc::Reg(reg);
restored_fpr_values.push(value);
}
}
let mut sp_adjust_inst = None;
if stack_size > 0 {
sp_adjust_inst = Some(pos.ins().adjust_sp_up_imm(Imm64::new(stack_size)));
}
// Insert the pop of the frame pointer
let fp_pop = pos.ins().x86_pop(reg_type);
let fp_pop_inst = pos.prev_inst().unwrap();
@@ -1046,13 +1000,47 @@ fn insert_common_epilogue(
let mut first_csr_pop_inst = None;
for reg in csrs.iter(GPR) {
let csr_pop = pos.ins().x86_pop(reg_type);
first_csr_pop_inst = Some(pos.prev_inst().unwrap());
first_csr_pop_inst = pos.prev_inst();
assert!(first_csr_pop_inst.is_some());
pos.func.locations[csr_pop] = ir::ValueLoc::Reg(reg);
pos.func.dfg.append_inst_arg(inst, csr_pop);
}
for value in restored_fpr_values.into_iter() {
pos.func.dfg.append_inst_arg(inst, value);
// Insert the adjustment of SP
let mut sp_adjust_inst = None;
if stack_size > 0 {
pos.ins().adjust_sp_up_imm(Imm64::new(stack_size));
sp_adjust_inst = pos.prev_inst();
assert!(sp_adjust_inst.is_some());
}
let mut first_fpr_load = None;
if let Some(index) = sp_arg_index {
let sp = pos
.func
.dfg
.block_params(pos.func.layout.entry_block().unwrap())[index];
// Insert the FPR loads (unlike the GPRs, which are stack pops, these are in-order loads)
for (i, reg) in csrs.iter(FPR).enumerate() {
// Offset to where the register is saved relative to RSP, accounting for FPR save alignment
let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
+ (stack_size % types::F64X2.bytes() as i64);
let value = pos.ins().load(
types::F64X2,
ir::MemFlags::trusted(),
sp,
(stack_size - offset) as i32,
);
first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
pos.func.locations[value] = ir::ValueLoc::Reg(reg);
pos.func.dfg.append_inst_arg(inst, value);
}
} else {
assert!(csrs.iter(FPR).len() == 0);
}
pos.func.epilogues_start.push(

View File

@@ -28,22 +28,7 @@ pub(crate) fn create_unwind_info(
let mut prologue_size = 0;
let mut unwind_codes = Vec::new();
let mut found_end = false;
// Have we saved at least one FPR? if so, we might have to check additional constraints.
let mut saved_fpr = false;
// In addition to the min offset for a callee-save, we need to know the offset from the
// frame base to the stack pointer, so that we can record an unwind offset that spans only
// to the end of callee-save space.
let mut static_frame_allocation_size = 0u32;
// For the time being, FPR preservation is split into a stack_addr and later store/load.
// Store the register used for stack store and ensure it is the same register with no
// intervening changes to the frame size.
let mut callee_save_region_reg = None;
// Also record the callee-save region's offset from RSP, because it must be added to FPR
// save offsets to compute an offset from the frame base.
let mut callee_save_offset = None;
let mut xmm_save_count: u8 = 0;
for (offset, inst, size) in func.inst_offsets(entry_block, &isa.encoding_info()) {
// x64 ABI prologues cannot exceed 255 bytes in length
@@ -60,8 +45,6 @@ pub(crate) fn create_unwind_info(
InstructionData::Unary { opcode, arg } => {
match opcode {
Opcode::X86Push => {
static_frame_allocation_size += 8;
unwind_codes.push(UnwindCode::PushRegister {
offset: unwind_offset,
reg: GPR.index_of(func.locations[arg].unwrap_reg()) as u8,
@@ -70,7 +53,6 @@ pub(crate) fn create_unwind_info(
Opcode::AdjustSpDown => {
let stack_size =
stack_size.expect("expected a previous stack size instruction");
static_frame_allocation_size += stack_size;
// This is used when calling a stack check function
// We need to track the assignment to RAX which has the size of the stack
@@ -85,10 +67,6 @@ pub(crate) fn create_unwind_info(
InstructionData::CopySpecial { src, dst, .. } => {
if let Some(frame_register) = frame_register {
if src == (RU::rsp as RegUnit) && dst == frame_register {
// Constructing an rbp-based stack frame, so the static frame
// allocation restarts at 0 from here.
static_frame_allocation_size = 0;
unwind_codes.push(UnwindCode::SetFramePointer {
offset: unwind_offset,
sp_offset: 0,
@@ -113,7 +91,7 @@ pub(crate) fn create_unwind_info(
let imm: i64 = imm.into();
assert!(imm <= core::u32::MAX as i64);
static_frame_allocation_size += imm as u32;
stack_size = Some(imm as u32);
unwind_codes.push(UnwindCode::StackAlloc {
offset: unwind_offset,
@@ -123,52 +101,27 @@ pub(crate) fn create_unwind_info(
_ => {}
}
}
InstructionData::StackLoad {
opcode: Opcode::StackAddr,
stack_slot,
offset: _,
} => {
let result = func.dfg.inst_results(inst).get(0).unwrap();
if let ValueLoc::Reg(frame_reg) = func.locations[*result] {
callee_save_region_reg = Some(frame_reg);
// Figure out the offset in the call frame that `frame_reg` will have.
let frame_size = func
.stack_slots
.layout_info
.expect("func's stack slots have layout info if stack operations exist")
.frame_size;
// Because we're well after the prologue has been constructed, stack slots
// must have been laid out...
let slot_offset = func.stack_slots[stack_slot]
.offset
.expect("callee-save slot has an offset computed");
let frame_offset = frame_size as i32 + slot_offset;
callee_save_offset = Some(frame_offset as u32);
}
}
InstructionData::Store {
opcode: Opcode::Store,
args: [arg1, arg2],
flags: _flags,
offset,
..
} => {
if let (ValueLoc::Reg(ru), ValueLoc::Reg(base_ru)) =
if let (ValueLoc::Reg(src), ValueLoc::Reg(dst)) =
(func.locations[arg1], func.locations[arg2])
{
if Some(base_ru) == callee_save_region_reg {
let offset_int: i32 = offset.into();
assert!(offset_int >= 0, "negative fpr offset would store outside the stack frame, and is almost certainly an error");
let offset_int: u32 = offset_int as u32 + callee_save_offset.expect("FPR preservation requires an FPR save region, which has some stack offset");
if FPR.contains(ru) {
saved_fpr = true;
unwind_codes.push(UnwindCode::SaveXmm {
offset: unwind_offset,
reg: ru as u8,
stack_offset: offset_int,
});
}
// If this is a save of an FPR, record an unwind operation
// Note: the stack_offset here is relative to an adjusted SP
// This will be fixed up later to be based on the frame pointer offset
if dst == (RU::rsp as RegUnit) && FPR.contains(src) {
let offset: i32 = offset.into();
unwind_codes.push(UnwindCode::SaveXmm {
offset: unwind_offset,
reg: src as u8,
stack_offset: offset as u32,
});
xmm_save_count += 1;
}
}
}
@@ -183,41 +136,45 @@ pub(crate) fn create_unwind_info(
assert!(found_end);
if saved_fpr {
if static_frame_allocation_size > 240 && saved_fpr {
warn!("stack frame is too large ({} bytes) to use with Windows x64 SEH when preserving FPRs. \
This is a Cranelift implementation limit, see \
https://github.com/bytecodealliance/wasmtime/issues/1475",
static_frame_allocation_size);
return Err(CodegenError::ImplLimitExceeded);
// When using a frame register, certain unwind operations, such as XMM saves, are relative to the frame
// register minus some offset, forming a "base address". This attempts to calculate the frame register offset
// while updating the XMM save offsets to be relative from this "base address" rather than RSP.
let mut frame_register_offset = 0;
if frame_register.is_some() && xmm_save_count > 0 {
// Determine the number of 16-byte slots used for all CSRs (including GPRs)
// The "frame register offset" will point at the last slot used (i.e. the last saved FPR)
// Assumption: each FPR is stored at a lower address than the previous one
let mut last_stack_offset = None;
let mut fpr_save_count: u8 = 0;
let mut gpr_push_count: u8 = 0;
for code in unwind_codes.iter_mut() {
match code {
UnwindCode::SaveXmm { stack_offset, .. } => {
if let Some(last) = last_stack_offset {
assert!(last > *stack_offset);
}
last_stack_offset = Some(*stack_offset);
fpr_save_count += 1;
*stack_offset = (xmm_save_count - fpr_save_count) as u32 * 16;
}
UnwindCode::PushRegister { .. } => {
gpr_push_count += 1;
}
_ => {}
}
}
// Only test that the static frame size is 16-byte aligned when an FPR is saved, to avoid
// panicking when alignment is elided because no FPRs are saved and no child calls are
// made.
assert!(
static_frame_allocation_size % 16 == 0,
"static frame allocation must be a multiple of 16"
);
}
assert_eq!(fpr_save_count, xmm_save_count);
// Hack to avoid panicking unnecessarily. Because Cranelift generates prologues with RBP at
// one end of the call frame, and RSP at the other, required offsets are arbitrarily large.
// Windows x64 SEH only allows this offset be up to 240 bytes, however, meaning large
// frames are inexpressible, and we cannot actually compile the function. In case there are
// no preserved FPRs, we can lie without error and claim the offset to RBP is 0 - nothing
// will actually check it. This, then, avoids panics when compiling functions with large
// call frames.
let reported_frame_offset = if saved_fpr {
(static_frame_allocation_size / 16) as u8
} else {
0
};
// Account for alignment space when there's an odd number of GPR pushes
// Assumption: an FPR (16 bytes) is twice the size of a GPR (8 bytes), hence the (rounded-up) integer division
frame_register_offset = fpr_save_count + ((gpr_push_count + 1) / 2);
}
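
A standalone sketch (function name and the sample RSP offsets are mine) of this fix-up pass:

// Saves were emitted at strictly decreasing RSP offsets, so after the
// rewrite the i-th save sits (count - 1 - i) 16-byte slots above the base.
fn fix_up_xmm_saves(rsp_offsets: &mut [u32], gpr_push_count: u8) -> u8 {
    let xmm_save_count = rsp_offsets.len() as u8;
    for (i, off) in rsp_offsets.iter_mut().enumerate() {
        *off = (xmm_save_count as u32 - 1 - i as u32) * 16;
    }
    // One slot per FPR save, plus one per two 8-byte GPR pushes
    // (rounded up to absorb possible alignment padding).
    xmm_save_count + (gpr_push_count + 1) / 2
}

fn main() {
    // Mirrors the updated `lots_of_registers` filetest: three XMM saves
    // become slot offsets 32/16/0, and 8 pushes (RBP plus seven CSRs)
    // contribute 4 slots, giving a frame register offset of 7.
    let mut offs = [200, 184, 168];
    assert_eq!(fix_up_xmm_saves(&mut offs, 8), 7);
    assert_eq!(offs, [32, 16, 0]);
}
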
Ok(Some(UnwindInfo {
flags: 0, // this assumes cranelift functions have no SEH handlers
prologue_size: prologue_size as u8,
frame_register: frame_register.map(|r| GPR.index_of(r) as u8),
frame_register_offset: reported_frame_offset,
frame_register_offset,
unwind_codes,
}))
}
@@ -284,7 +241,7 @@ mod tests {
},
UnwindCode::StackAlloc {
offset: 9,
size: 64 + 32
size: 64
}
]
}
@@ -303,7 +260,7 @@ mod tests {
0x03, // Unwind code count (1 for stack alloc, 1 for save frame reg, 1 for push reg)
0x05, // Frame register + offset (RBP with 0 offset)
0x09, // Prolog offset
0xB2, // Operation 2 (small stack alloc), size = 0xB slots (e.g. (0xB * 8) + 8 = 96 (64 + 32) bytes)
0x72, // Operation 2 (small stack alloc), size = 0x7 slots (e.g. (0x7 * 8) + 8 = 64 bytes)
0x05, // Prolog offset
0x03, // Operation 3 (save frame register), stack pointer offset = 0
0x02, // Prolog offset
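
To see why `0x72` is right, here is a hypothetical decoder for this byte pair (not part of the patch; layout per the Windows x64 UNWIND_CODE format, opcode in the low nibble, info in the high nibble):

fn decode_alloc_small(prolog_offset: u8, code: u8) -> Option<(u8, u32)> {
    let op = code & 0x0f; // UWOP_ALLOC_SMALL is opcode 2
    let info = (code >> 4) as u32; // allocation size is info * 8 + 8 bytes
    (op == 2).then(|| (prolog_offset, info * 8 + 8))
}

fn main() {
    // The new expectation `0x09 0x72`: prolog offset 9, 7 * 8 + 8 = 64 bytes.
    assert_eq!(decode_alloc_small(0x09, 0x72), Some((9, 64)));
    // The old `0x09 0xB2` encoded 0xB * 8 + 8 = 96 bytes, i.e. it included
    // the extra 32-byte shadow space this PR stops over-allocating.
    assert_eq!(decode_alloc_small(0x09, 0xB2), Some((9, 96)));
}
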
@@ -349,7 +306,7 @@ mod tests {
},
UnwindCode::StackAlloc {
offset: 27,
size: 10000 + 32
size: 10000
}
]
}
@@ -369,8 +326,8 @@ mod tests {
0x05, // Frame register + offset (RBP with 0 offset)
0x1B, // Prolog offset
0x01, // Operation 1 (large stack alloc), size is scaled 16-bits (info = 0)
0xE6, // Low size byte
0x04, // High size byte (e.g. 0x04E6 * 8 = 10032 (10000 + 32) bytes)
0xE2, // Low size byte
0x04, // High size byte (e.g. 0x04E2 * 8 = 10000 bytes)
0x05, // Prolog offset
0x03, // Operation 3 (save frame register), stack pointer offset = 0
0x02, // Prolog offset
@@ -414,7 +371,7 @@ mod tests {
},
UnwindCode::StackAlloc {
offset: 27,
size: 1000000 + 32
size: 1000000
}
]
}
@@ -434,10 +391,10 @@ mod tests {
0x05, // Frame register + offset (RBP with 0 offset)
0x1B, // Prolog offset
0x11, // Operation 1 (large stack alloc), size is unscaled 32-bits (info = 1)
0x60, // Byte 1 of size
0x40, // Byte 1 of size
0x42, // Byte 2 of size
0x0F, // Byte 3 of size
0x00, // Byte 4 of size (size is 0xF4260 = 1000032 (1000000 + 32) bytes)
0x00, // Byte 4 of size (size is 0xF4240 = 1000000 bytes)
0x05, // Prolog offset
0x03, // Operation 3 (save frame register), stack pointer offset = 0
0x02, // Prolog offset

View File

@@ -8,29 +8,57 @@ function %one_arg(i64) windows_fastcall {
block0(v0: i64):
return
}
; check: function %one_arg(i64 [%rcx], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
; nextln: ss0 = incoming_arg 16, offset -48
; check: function %one_arg(i64 [%rcx], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
; nextln: ss0 = incoming_arg 16, offset -16
; check: block0(v0: i64 [%rcx], v1: i64 [%rbp]):
; nextln: x86_push v1
; nextln: copy_special %rsp -> %rbp
; nextln: v2 = x86_pop.i64
; nextln: return v2
; nextln: }
; check if we still use registers for 4 arguments
function %four_args(i64, i64, i64, i64) windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64):
return
}
; check: function %four_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
; check: function %four_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
; nextln: ss0 = incoming_arg 16, offset -16
; check: block0(v0: i64 [%rcx], v1: i64 [%rdx], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [%rbp]):
; nextln: x86_push v4
; nextln: copy_special %rsp -> %rbp
; nextln: v5 = x86_pop.i64
; nextln: return v5
; nextln: }
; check if float arguments are passed through XMM registers
function %four_float_args(f64, f64, f64, f64) windows_fastcall {
block0(v0: f64, v1: f64, v2: f64, v3: f64):
return
}
; check: function %four_float_args(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
; check: function %four_float_args(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
; nextln: ss0 = incoming_arg 16, offset -16
; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v4: i64 [%rbp]):
; nextln: x86_push v4
; nextln: copy_special %rsp -> %rbp
; nextln: v5 = x86_pop.i64
; nextln: return v5
; nextln: }
; check if we use stack space for > 4 arguments
function %five_args(i64, i64, i64, i64, i64) windows_fastcall {
block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64):
return
}
; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
; nextln: ss0 = incoming_arg 8, offset 32
; nextln: ss1 = incoming_arg 16, offset -16
; check: block0(v0: i64 [%rcx], v1: i64 [%rdx], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [ss0], v5: i64 [%rbp]):
; nextln: x86_push v5
; nextln: copy_special %rsp -> %rbp
; nextln: v6 = x86_pop.i64
; nextln: return v6
; nextln: }
; check that we preserve xmm6 and above if we're using them locally
function %float_callee_saves(f64, f64, f64, f64) windows_fastcall {
@@ -40,38 +68,51 @@ block0(v0: f64, v1: f64, v2: f64, v3: f64):
[-, %xmm7] v5 = fadd v0, v1
return
}
; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall {
; nextln: ss0 = explicit_slot 32, offset -80
; nextln: ss1 = incoming_arg 16, offset -48
; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]):
; nextln: x86_push v6
; nextln: copy_special %rsp -> %rbp
; nextln: adjust_sp_down_imm 64
; nextln: v7 = stack_addr.i64 ss0
; nextln: store notrap aligned v8, v7
; nextln: store notrap aligned v9, v7+16
; check: v10 = stack_addr.i64 ss0
; nextln: v11 = load.f64x2 notrap aligned v10
; nextln: v12 = load.f64x2 notrap aligned v10+16
; nextln: adjust_sp_up_imm 64
; nextln: v13 = x86_pop.i64
; nextln: return v13, v11, v12
; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 csr [%rsp], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall {
; nextln: ss0 = incoming_arg 48, offset -48
; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rsp], v7: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]):
; nextln: x86_push v7
; nextln: copy_special %rsp -> %rbp
; nextln: adjust_sp_down_imm 32
; nextln: store notrap aligned v8, v6+16
; nextln: store notrap aligned v9, v6
; nextln: v11 = load.f64x2 notrap aligned v6+16
; nextln: v12 = load.f64x2 notrap aligned v6
; nextln: adjust_sp_up_imm 32
; nextln: v10 = x86_pop.i64
; nextln: return v10, v11, v12
; nextln: }
function %mixed_int_float(i64, f64, i64, f32) windows_fastcall {
block0(v0: i64, v1: f64, v2: i64, v3: f32):
return
}
; check: function %mixed_int_float(i64 [%rcx], f64 [%xmm1], i64 [%r8], f32 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
; check: function %mixed_int_float(i64 [%rcx], f64 [%xmm1], i64 [%r8], f32 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
; nextln: ss0 = incoming_arg 16, offset -16
; check: block0(v0: i64 [%rcx], v1: f64 [%xmm1], v2: i64 [%r8], v3: f32 [%xmm3], v4: i64 [%rbp]):
; nextln: x86_push v4
; nextln: copy_special %rsp -> %rbp
; nextln: v5 = x86_pop.i64
; nextln: return v5
; nextln: }
function %ret_val_float(f32, f64, i64, i64) -> f64 windows_fastcall {
block0(v0: f32, v1: f64, v2: i64, v3: i64):
return v1
}
; check: function %ret_val_float(f32 [%xmm0], f64 [%xmm1], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> f64 [%xmm0], i64 fp [%rbp] windows_fastcall {
; check: function %ret_val_float(f32 [%xmm0], f64 [%xmm1], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> f64 [%xmm0], i64 fp [%rbp] windows_fastcall {
; nextln: ss0 = incoming_arg 16, offset -16
; check: block0(v0: f32 [%xmm0], v1: f64 [%xmm1], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [%rbp]):
; nextln: x86_push v4
; nextln: copy_special %rsp -> %rbp
; nextln: regmove v1, %xmm1 -> %xmm0
; nextln: v5 = x86_pop.i64
; nextln: return v1, v5
; nextln: }
function %internal_stack_arg_function_call(i64) -> i64 windows_fastcall {
fn0 = %foo(i64, i64, i64, i64) -> i64
fn1 = %foo2(i64, i64, i64, i64) -> i64
fn0 = %foo(i64, i64, i64, i64) -> i64 windows_fastcall
fn1 = %foo2(i64, i64, i64, i64) -> i64 windows_fastcall
block0(v0: i64):
v1 = load.i64 v0+0
v2 = load.i64 v0+8
@@ -94,3 +135,100 @@ block0(v0: i64):
store.i64 v9, v0+72
return v10
}
; check: function %internal_stack_a(i64 [%rcx], i64 fp [%rbp], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15]) -> i64 [%rax], i64 fp [%rbp], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15] windows_fastcall {
; nextln: ss0 = spill_slot 8, offset -56
; nextln: ss1 = spill_slot 8, offset -64
; nextln: ss2 = spill_slot 8, offset -72
; nextln: ss3 = spill_slot 8, offset -80
; nextln: ss4 = spill_slot 8, offset -88
; nextln: ss5 = spill_slot 8, offset -96
; nextln: ss6 = spill_slot 8, offset -104
; nextln: ss7 = spill_slot 8, offset -112
; nextln: ss8 = spill_slot 8, offset -120
; nextln: ss9 = spill_slot 8, offset -128
; nextln: ss10 = incoming_arg 48, offset -48
; nextln: ss11 = explicit_slot 32, offset -160
; nextln: sig0 = (i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9]) -> i64 [%rax] windows_fastcall
; nextln: sig1 = (i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9]) -> i64 [%rax] windows_fastcall
; nextln: fn0 = %foo sig0
; nextln: fn1 = %foo2 sig1
; check: block0(v11: i64 [%rcx], v52: i64 [%rbp], v53: i64 [%r12], v54: i64 [%r13], v55: i64 [%r14], v56: i64 [%r15]):
; nextln: x86_push v52
; nextln: copy_special %rsp -> %rbp
; nextln: x86_push v53
; nextln: x86_push v54
; nextln: x86_push v55
; nextln: x86_push v56
; nextln: adjust_sp_down_imm 112
; nextln: v0 = spill v11
; nextln: v12 = copy_to_ssa.i64 %rcx
; nextln: v13 = load.i64 v12
; nextln: v1 = spill v13
; nextln: v14 = fill_nop v0
; nextln: v15 = load.i64 v14+8
; nextln: v2 = spill v15
; nextln: v16 = fill_nop v0
; nextln: v17 = load.i64 v16+16
; nextln: v3 = spill v17
; nextln: v18 = fill_nop v0
; nextln: v19 = load.i64 v18+24
; nextln: v4 = spill v19
; nextln: v20 = fill_nop v0
; nextln: v21 = load.i64 v20+32
; nextln: v5 = spill v21
; nextln: v22 = fill_nop v0
; nextln: v23 = load.i64 v22+40
; nextln: v6 = spill v23
; nextln: v24 = fill_nop v0
; nextln: v25 = load.i64 v24+48
; nextln: v7 = spill v25
; nextln: v26 = fill_nop v0
; nextln: v27 = load.i64 v26+56
; nextln: v8 = spill v27
; nextln: v28 = fill_nop v0
; nextln: v29 = load.i64 v28+64
; nextln: v9 = spill v29
; nextln: v30 = fill v1
; nextln: v31 = fill v2
; nextln: v32 = fill v3
; nextln: v33 = fill v4
; nextln: regmove v30, %r15 -> %rcx
; nextln: regmove v31, %r14 -> %rdx
; nextln: regmove v32, %r13 -> %r8
; nextln: regmove v33, %r12 -> %r9
; nextln: v10 = call fn0(v30, v31, v32, v33)
; nextln: v34 = fill v1
; nextln: v35 = fill v0
; nextln: store v34, v35+8
; nextln: v36 = fill v2
; nextln: v37 = fill_nop v0
; nextln: store v36, v37+16
; nextln: v38 = fill v3
; nextln: v39 = fill_nop v0
; nextln: store v38, v39+24
; nextln: v40 = fill v4
; nextln: v41 = fill_nop v0
; nextln: store v40, v41+32
; nextln: v42 = fill v5
; nextln: v43 = fill_nop v0
; nextln: store v42, v43+40
; nextln: v44 = fill v6
; nextln: v45 = fill_nop v0
; nextln: store v44, v45+48
; nextln: v46 = fill v7
; nextln: v47 = fill_nop v0
; nextln: store v46, v47+56
; nextln: v48 = fill v8
; nextln: v49 = fill_nop v0
; nextln: store v48, v49+64
; nextln: v50 = fill v9
; nextln: v51 = fill_nop v0
; nextln: store v50, v51+72
; nextln: adjust_sp_up_imm 112
; nextln: v61 = x86_pop.i64
; nextln: v60 = x86_pop.i64
; nextln: v59 = x86_pop.i64
; nextln: v58 = x86_pop.i64
; nextln: v57 = x86_pop.i64
; nextln: return v10, v57, v58, v59, v60, v61
; nextln: }

View File

@@ -3,13 +3,35 @@ set opt_level=speed_and_size
set is_pic
target x86_64 haswell
; check the unwind information with a function with no args
function %no_args() windows_fastcall {
; check the unwind information with a leaf function with no args
function %no_args_leaf() windows_fastcall {
block0:
return
}
; sameln: version: 1
; nextln: flags: 0
; nextln: prologue size: 4
; nextln: frame register: 5
; nextln: frame register offset: 0
; nextln: unwind codes: 2
; nextln:
; nextln: offset: 1
; nextln: op: PushNonvolatileRegister
; nextln: info: 5
; nextln:
; nextln: offset: 4
; nextln: op: SetFramePointer
; nextln: info: 0
; check the unwind information with a non-leaf function with no args
function %no_args() windows_fastcall {
fn0 = %foo()
block0:
call fn0()
return
}
; sameln: version: 1
; nextln: flags: 0
; nextln: prologue size: 8
; nextln: frame register: 5
; nextln: frame register offset: 0
@@ -51,7 +73,7 @@ block0:
; nextln: offset: 17
; nextln: op: LargeStackAlloc
; nextln: info: 0
; nextln: value: 12504 (u16)
; nextln: value: 12500 (u16)
; check a function with large-sized stack alloc
function %large_stack() windows_fastcall {
@@ -77,7 +99,7 @@ block0:
; nextln: offset: 17
; nextln: op: LargeStackAlloc
; nextln: info: 1
; nextln: value: 524320 (u32)
; nextln: value: 524288 (u32)
function %fpr_with_function_call(i64, i64) windows_fastcall {
fn0 = %foo(f64, f64, i64, i64, i64) windows_fastcall;
@@ -113,9 +135,9 @@ block0(v0: i64, v1: i64):
;
; sameln: version: 1
; nextln: flags: 0
; nextln: prologue size: 25
; nextln: prologue size: 22
; nextln: frame register: 5
; nextln: frame register offset: 12
; nextln: frame register offset: 2
; nextln: unwind codes: 5
; nextln:
; nextln: offset: 1
@@ -135,10 +157,10 @@ block0(v0: i64, v1: i64):
; nextln: info: 0
; nextln: value: 23 (u16)
; nextln:
; nextln: offset: 25
; nextln: offset: 22
; nextln: op: SaveXmm128
; nextln: info: 15
; nextln: value: 3 (u16)
; nextln: value: 0 (u16)
; check a function that has CSRs
function %lots_of_registers(i64, i64) windows_fastcall {
@@ -191,9 +213,9 @@ block0(v0: i64, v1: i64):
}
; sameln: version: 1
; nextln: flags: 0
; nextln: prologue size: 41
; nextln: prologue size: 35
; nextln: frame register: 5
; nextln: frame register offset: 10
; nextln: frame register offset: 7
; nextln: unwind codes: 13
; nextln:
; nextln: offset: 1
@@ -234,19 +256,19 @@ block0(v0: i64, v1: i64):
; nextln:
; nextln: offset: 19
; nextln: op: SmallStackAlloc
; nextln: info: 12
; nextln: info: 8
; nextln:
; nextln: offset: 31
; nextln: offset: 24
; nextln: op: SaveXmm128
; nextln: info: 6
; nextln: value: 0 (u16)
; nextln: value: 2 (u16)
; nextln:
; nextln: offset: 36
; nextln: offset: 29
; nextln: op: SaveXmm128
; nextln: info: 7
; nextln: value: 1 (u16)
; nextln:
; nextln: offset: 41
; nextln: offset: 35
; nextln: op: SaveXmm128
; nextln: info: 8
; nextln: value: 2 (u16)
; nextln: value: 0 (u16)