Fix FPR saving and shadow space allocation for Windows x64.
This commit fixes both how FPR callee-saved registers are saved and how the shadow space is allocated when laying out the stack for the Windows x64 calling convention. Importantly, it removes the stack size limitation that the compiler imposed for Windows x64 because FPR saves previously couldn't always be represented in the unwind information.

The FPR saves are now performed without using stack slots, much like how the callee-saved GPRs are saved. The total CSR space is given to `layout_stack` so that it is included in the frame size and offsets the layout of spills and explicit slots. The FPR saves are now done via an RSP offset (post adjustment), and they always follow the GPR saves on the stack. A simpler calculation can therefore determine the proper offsets of the FPR saves when representing the unwind information.

Additionally, the shadow space is no longer treated as an incoming argument, but as an explicit stack slot that is laid out at the lowest possible address in the local frame. This prevents `layout_stack` from putting a spill or explicit slot in this reserved space. In the future, `layout_stack` should take advantage of the *caller-provided* shadow space for spills, but this commit does not attempt to address that. The shadow space is now omitted from the local frame for leaf functions.

Fixes #1728.
Fixes #1587.
Fixes #1475.
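To illustrate the "simpler calculation" mentioned above, here is a minimal standalone sketch — not code from this commit, and the helper name `fpr_save_offsets` is hypothetical — of the offset math the new layout implies: each callee-saved FPR is stored 16 bytes below the previous one at the top of the post-adjustment local frame, with one 8-byte pad when the GPR pushes leave RSP only 8-byte aligned.

```rust
// A minimal sketch, assuming `stack_size` is the local frame size after the
// GPR pushes (the amount RSP is adjusted down by) and each f64x2 (XMM) save
// occupies 16 bytes.
fn fpr_save_offsets(stack_size: i64, num_fprs: i64) -> Vec<i64> {
    const XMM_SAVE_BYTES: i64 = 16;
    // One 8-byte pad keeps the first save 16-byte aligned when the GPR pushes
    // leave RSP misaligned; this mirrors the `stack_size & 8` term in the diff.
    let align_pad = stack_size & 8;
    (0..num_fprs)
        .map(|i| stack_size - ((i + 1) * XMM_SAVE_BYTES + align_pad))
        .collect()
}
```

With a 32-byte frame and two saved FPRs this yields offsets 16 and 0 from RSP, matching the `store notrap aligned v8, v6+16` and `store notrap aligned v9, v6` expectations in the `%float_callee_saves` test in the diff below.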
@@ -6,7 +6,6 @@ use super::settings as isa_settings;
 use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion};
 use crate::cursor::{Cursor, CursorPosition, EncCursor};
 use crate::ir;
-use crate::ir::entities::StackSlot;
 use crate::ir::immediates::Imm64;
 use crate::ir::stackslot::{StackOffset, StackSize};
 use crate::ir::types;
@@ -19,7 +18,6 @@ use crate::regalloc::RegisterSet;
 use crate::result::CodegenResult;
 use crate::stack_layout::layout_stack;
 use alloc::borrow::Cow;
-use alloc::vec::Vec;
 use core::i32;
 use target_lexicon::{PointerWidth, Triple};
 
@@ -44,7 +42,7 @@ static RET_GPRS_WIN_FASTCALL_X64: [RU; 1] = [RU::rax];
 ///
 /// [2] https://blogs.msdn.microsoft.com/oldnewthing/20110302-00/?p=11333 "Although the x64 calling
 /// convention reserves spill space for parameters, you don’t have to use them as such"
-const WIN_SHADOW_STACK_SPACE: i32 = 32;
+const WIN_SHADOW_STACK_SPACE: StackSize = 32;
 
 /// Stack alignment requirement for functions.
 ///
@@ -87,7 +85,7 @@ impl Args {
             WIN_SHADOW_STACK_SPACE
         } else {
             0
-        } as u32;
+        };
 
         Self {
            pointer_bytes: bits / 8,
@@ -501,7 +499,7 @@ fn baldrdash_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) ->
 
     let word_size = StackSize::from(isa.pointer_bytes());
     let shadow_store_size = if func.signature.call_conv.extends_windows_fastcall() {
-        WIN_SHADOW_STACK_SPACE as u32
+        WIN_SHADOW_STACK_SPACE
     } else {
         0
     };
@@ -525,50 +523,60 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
         panic!("TODO: windows-fastcall: x86-32 not implemented yet");
     }
 
-    let csrs = callee_saved_regs_used(isa, func);
-
     // The reserved stack area is composed of:
-    //   return address + frame pointer + all callee-saved registers + shadow space
+    //   return address + frame pointer + all callee-saved registers
     //
     // Pushing the return address is an implicit function of the `call`
     // instruction. Each of the others we will then push explicitly. Then we
     // will adjust the stack pointer to make room for the rest of the required
     // space for this frame.
-    let word_size = isa.pointer_bytes() as usize;
-    let num_fprs = csrs.iter(FPR).len();
-    let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32;
+    let csrs = callee_saved_regs_used(isa, func);
+    let gpsr_stack_size = ((csrs.iter(GPR).len() + 2) * isa.pointer_bytes() as usize) as u32;
+    let fpsr_stack_size = (csrs.iter(FPR).len() * types::F64X2.bytes() as usize) as u32;
+    let mut csr_stack_size = gpsr_stack_size + fpsr_stack_size;
 
-    // Only create an FPR stack slot if we're going to save FPRs.
-    let fpr_slot = if num_fprs > 0 {
-        // Create a stack slot for FPRs to be preserved in. This is an `ExplicitSlot` because it
-        // seems to most closely map to it as a `StackSlotKind`: FPR preserve/restore should be
-        // through `stack_load` and `stack_store` (see later comment about issue #1198). Even
-        // though in a certain light FPR preserve/restore is "spilling" an argument, regalloc
-        // implies that `SpillSlot` may be eligible for certain optimizations, and we know with
-        // certainty that this space may not be reused in the function, nor moved around.
-        Some(func.create_stack_slot(ir::StackSlotData {
-            kind: ir::StackSlotKind::ExplicitSlot,
-            size: (num_fprs * types::F64X2.bytes() as usize) as u32,
-            offset: None,
-        }))
-    } else {
-        None
-    };
+    // FPRs must be saved with 16-byte alignment; because they follow the GPRs on the stack, align if needed
+    if fpsr_stack_size > 0 {
+        csr_stack_size += gpsr_stack_size & isa.pointer_bytes() as u32;
+    }
 
-    // TODO: eventually use the 32 bytes (shadow store) as spill slot. This currently doesn't work
-    // since cranelift does not support spill slots before incoming args
     func.create_stack_slot(ir::StackSlotData {
         kind: ir::StackSlotKind::IncomingArg,
-        size: csr_stack_size as u32,
-        offset: Some(-(WIN_SHADOW_STACK_SPACE + csr_stack_size)),
+        size: csr_stack_size,
+        offset: Some(-(csr_stack_size as StackOffset)),
     });
 
     let is_leaf = func.is_leaf();
+
+    // If not a leaf function, allocate an explicit stack slot at the end of the space for the callee's shadow space
+    if !is_leaf {
+        // TODO: eventually use the caller-provided shadow store as spill slot space when laying out the stack
+        func.create_stack_slot(ir::StackSlotData {
+            kind: ir::StackSlotKind::ExplicitSlot,
+            size: WIN_SHADOW_STACK_SPACE,
+            offset: None,
+        });
+    }
+
     let total_stack_size = layout_stack(&mut func.stack_slots, is_leaf, STACK_ALIGNMENT)? as i32;
-    let local_stack_size = i64::from(total_stack_size - csr_stack_size);
+
+    // Subtract the GPR saved register size from the local size because pushes are used for the saves
+    let local_stack_size = i64::from(total_stack_size - gpsr_stack_size as i32);
 
     // Add CSRs to function signature
     let reg_type = isa.pointer_type();
+    let sp_arg_index = if fpsr_stack_size > 0 {
+        let sp_arg = ir::AbiParam::special_reg(
+            reg_type,
+            ir::ArgumentPurpose::CalleeSaved,
+            RU::rsp as RegUnit,
+        );
+        let index = func.signature.params.len();
+        func.signature.params.push(sp_arg);
+        Some(index)
+    } else {
+        None
+    };
     let fp_arg = ir::AbiParam::special_reg(
         reg_type,
         ir::ArgumentPurpose::FramePointer,
@@ -601,7 +609,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
         local_stack_size,
         reg_type,
         &csrs,
-        fpr_slot.as_ref(),
+        sp_arg_index.is_some(),
         isa,
     );
 
@@ -612,7 +620,8 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
         local_stack_size,
         reg_type,
         &csrs,
-        fpr_slot.as_ref(),
+        sp_arg_index,
+        isa,
     );
 
     Ok(())
@@ -649,14 +658,20 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
 
     // Add CSRs to function signature
     let reg_type = ir::Type::int(u16::from(pointer_width.bits())).unwrap();
-    if isa.pointer_bits() == 32 {
+    // On X86-32 all parameters, including vmctx, are passed on stack, and we need
+    // to extract vmctx from the stack before we can save the frame pointer.
+    let sp_arg_index = if isa.pointer_bits() == 32 {
         let sp_arg = ir::AbiParam::special_reg(
             reg_type,
             ir::ArgumentPurpose::CalleeSaved,
             RU::rsp as RegUnit,
         );
+        let index = func.signature.params.len();
         func.signature.params.push(sp_arg);
-    }
+        Some(index)
+    } else {
+        None
+    };
     let fp_arg = ir::AbiParam::special_reg(
         reg_type,
         ir::ArgumentPurpose::FramePointer,
@@ -674,11 +689,25 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
     // Set up the cursor and insert the prologue
     let entry_block = func.layout.entry_block().expect("missing entry block");
     let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block);
-    insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, None, isa);
+    insert_common_prologue(
+        &mut pos,
+        local_stack_size,
+        reg_type,
+        &csrs,
+        sp_arg_index.is_some(),
+        isa,
+    );
 
     // Reset the cursor and insert the epilogue
     let mut pos = pos.at_position(CursorPosition::Nowhere);
-    insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, None);
+    insert_common_epilogues(
+        &mut pos,
+        local_stack_size,
+        reg_type,
+        &csrs,
+        sp_arg_index,
+        isa,
+    );
 
     Ok(())
 }
@@ -690,12 +719,10 @@ fn insert_common_prologue(
     stack_size: i64,
     reg_type: ir::types::Type,
     csrs: &RegisterSet,
-    fpr_slot: Option<&StackSlot>,
+    has_sp_param: bool,
    isa: &dyn TargetIsa,
 ) {
-    // On X86-32 all parameters, including vmctx, are passed on stack, and we need
-    // to extract vmctx from the stack before we can save the frame pointer.
-    let sp = if isa.pointer_bits() == 32 {
+    let sp = if has_sp_param {
         let block = pos.current_block().expect("missing block under cursor");
         let sp = pos.func.dfg.append_block_param(block, reg_type);
         pos.func.locations[sp] = ir::ValueLoc::Reg(RU::rsp as RegUnit);
@@ -799,38 +826,27 @@ fn insert_common_prologue(
         }
     }
 
-    // Now that RSP is prepared for the function, we can use stack slots:
+    // With the stack pointer adjusted, save any callee-saved floating point registers via offset
+    // FPR saves are at the highest addresses of the local frame allocation, immediately following the GPR pushes
     let mut last_fpr_save = None;
-    if let Some(fpr_slot) = fpr_slot {
-        debug_assert!(csrs.iter(FPR).len() != 0);
 
-        // `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base
-        // address. We are well after postopt could run, so load the CSR region base once here,
-        // instead of hoping that the addr/store will be combined later.
-        // See also: https://github.com/bytecodealliance/wasmtime/pull/1198
-        let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0);
+    for (i, reg) in csrs.iter(FPR).enumerate() {
+        // Append param to entry block
+        let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
 
-        // Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at
-        // function entry.
-        pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16);
+        // Since regalloc has already run, we must assign a location.
+        pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
 
-        let mut fpr_offset = 0;
+        // Offset to where the register is saved relative to RSP, accounting for FPR save alignment
+        let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
+            + (stack_size & isa.pointer_bytes() as i64);
 
-        for reg in csrs.iter(FPR) {
-            // Append param to entry Block
-            let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
-
-            // Since regalloc has already run, we must assign a location.
-            pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
-
-            last_fpr_save =
-                Some(
-                    pos.ins()
-                        .store(ir::MemFlags::trusted(), csr_arg, stack_addr, fpr_offset),
-                );
-
-            fpr_offset += types::F64X2.bytes() as i32;
-        }
+        last_fpr_save = Some(pos.ins().store(
+            ir::MemFlags::trusted(),
+            csr_arg,
+            sp.expect("FPR save requires SP param"),
+            (stack_size - offset) as i32,
+        ));
     }
 
     pos.func.prologue_end = Some(
@@ -966,13 +982,14 @@ fn insert_common_epilogues(
     stack_size: i64,
     reg_type: ir::types::Type,
     csrs: &RegisterSet,
-    fpr_slot: Option<&StackSlot>,
+    sp_arg_index: Option<usize>,
+    isa: &dyn TargetIsa,
 ) {
     while let Some(block) = pos.next_block() {
         pos.goto_last_inst(block);
         if let Some(inst) = pos.current_inst() {
             if pos.func.dfg[inst].opcode().is_return() {
-                insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, fpr_slot);
+                insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, sp_arg_index, isa);
             }
         }
     }
@@ -986,56 +1003,9 @@ fn insert_common_epilogue(
     pos: &mut EncCursor,
     reg_type: ir::types::Type,
     csrs: &RegisterSet,
-    fpr_slot: Option<&StackSlot>,
+    sp_arg_index: Option<usize>,
+    isa: &dyn TargetIsa,
 ) {
-    // Even though instructions to restore FPRs are inserted first, we have to append them after
-    // restored GPRs to satisfy parameter order in the return.
-    let mut restored_fpr_values = Vec::new();
-
-    // Restore FPRs before we move RSP and invalidate stack slots.
-    let mut first_fpr_load = None;
-    if let Some(fpr_slot) = fpr_slot {
-        debug_assert!(csrs.iter(FPR).len() != 0);
-
-        // `stack_load` is not directly encodable in x86_64 at the moment, so we'll need a base
-        // address. We are well after postopt could run, so load the CSR region base once here,
-        // instead of hoping that the addr/store will be combined later.
-        //
-        // See also: https://github.com/bytecodealliance/wasmtime/pull/1198
-        let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0);
-
-        first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
-
-        // Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at
-        // function exit.
-        pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16);
-
-        let mut fpr_offset = 0;
-
-        for reg in csrs.iter(FPR) {
-            let value = pos.ins().load(
-                types::F64X2,
-                ir::MemFlags::trusted(),
-                stack_addr,
-                fpr_offset,
-            );
-            fpr_offset += types::F64X2.bytes() as i32;
-
-            // Unlike GPRs before, we don't need to step back after reach restoration because FPR
-            // restoration is order-insensitive. Furthermore: we want GPR restoration to begin
-            // after FPR restoration, so that stack adjustments occur after we're done relying on
-            // StackSlot validity.
-            pos.func.locations[value] = ir::ValueLoc::Reg(reg);
-            restored_fpr_values.push(value);
-        }
-    }
-
-    let mut sp_adjust_inst = None;
-    if stack_size > 0 {
-        sp_adjust_inst = Some(pos.ins().adjust_sp_up_imm(Imm64::new(stack_size)));
-    }
-
     // Insert the pop of the frame pointer
     let fp_pop = pos.ins().x86_pop(reg_type);
     let fp_pop_inst = pos.prev_inst().unwrap();
@@ -1046,13 +1016,47 @@ fn insert_common_epilogue(
     let mut first_csr_pop_inst = None;
     for reg in csrs.iter(GPR) {
         let csr_pop = pos.ins().x86_pop(reg_type);
-        first_csr_pop_inst = Some(pos.prev_inst().unwrap());
+        first_csr_pop_inst = pos.prev_inst();
+        assert!(first_csr_pop_inst.is_some());
         pos.func.locations[csr_pop] = ir::ValueLoc::Reg(reg);
         pos.func.dfg.append_inst_arg(inst, csr_pop);
     }
 
-    for value in restored_fpr_values.into_iter() {
-        pos.func.dfg.append_inst_arg(inst, value);
+    // Insert the adjustment of SP
+    let mut sp_adjust_inst = None;
+    if stack_size > 0 {
+        pos.ins().adjust_sp_up_imm(Imm64::new(stack_size));
+        sp_adjust_inst = pos.prev_inst();
+        assert!(sp_adjust_inst.is_some());
+    }
+
+    let mut first_fpr_load = None;
+    if let Some(index) = sp_arg_index {
+        let sp = pos
+            .func
+            .dfg
+            .block_params(pos.func.layout.entry_block().unwrap())[index];
+
+        // Insert the FPR loads (unlike the GPRs, which are stack pops, these are in-order loads)
+        for (i, reg) in csrs.iter(FPR).enumerate() {
+            // Offset to where the register is saved relative to RSP, accounting for FPR save alignment
+            let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
+                + (stack_size & isa.pointer_bytes() as i64);
+
+            let value = pos.ins().load(
+                types::F64X2,
+                ir::MemFlags::trusted(),
+                sp,
+                (stack_size - offset) as i32,
+            );
+
+            first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
+
+            pos.func.locations[value] = ir::ValueLoc::Reg(reg);
+            pos.func.dfg.append_inst_arg(inst, value);
+        }
+    } else {
+        assert!(csrs.iter(FPR).len() == 0);
     }
 
     pos.func.epilogues_start.push(
@@ -28,22 +28,7 @@ pub(crate) fn create_unwind_info(
     let mut prologue_size = 0;
     let mut unwind_codes = Vec::new();
     let mut found_end = false;
-    // Have we saved at least one FPR? if so, we might have to check additional constraints.
-    let mut saved_fpr = false;
-
-    // In addition to the min offset for a callee-save, we need to know the offset from the
-    // frame base to the stack pointer, so that we can record an unwind offset that spans only
-    // to the end of callee-save space.
-    let mut static_frame_allocation_size = 0u32;
-
-    // For the time being, FPR preservation is split into a stack_addr and later store/load.
-    // Store the register used for stack store and ensure it is the same register with no
-    // intervening changes to the frame size.
-    let mut callee_save_region_reg = None;
-    // Also record the callee-save region's offset from RSP, because it must be added to FPR
-    // save offsets to compute an offset from the frame base.
-    let mut callee_save_offset = None;
+    let mut xmm_save_count: u8 = 0;
 
     for (offset, inst, size) in func.inst_offsets(entry_block, &isa.encoding_info()) {
         // x64 ABI prologues cannot exceed 255 bytes in length
@@ -60,8 +45,6 @@ pub(crate) fn create_unwind_info(
             InstructionData::Unary { opcode, arg } => {
                 match opcode {
                     Opcode::X86Push => {
-                        static_frame_allocation_size += 8;
-
                         unwind_codes.push(UnwindCode::PushRegister {
                             offset: unwind_offset,
                             reg: GPR.index_of(func.locations[arg].unwrap_reg()) as u8,
@@ -70,7 +53,6 @@ pub(crate) fn create_unwind_info(
                    Opcode::AdjustSpDown => {
                        let stack_size =
                            stack_size.expect("expected a previous stack size instruction");
-                        static_frame_allocation_size += stack_size;
 
                        // This is used when calling a stack check function
                        // We need to track the assignment to RAX which has the size of the stack
@@ -85,10 +67,6 @@ pub(crate) fn create_unwind_info(
             InstructionData::CopySpecial { src, dst, .. } => {
                 if let Some(frame_register) = frame_register {
                     if src == (RU::rsp as RegUnit) && dst == frame_register {
-                        // Constructing an rbp-based stack frame, so the static frame
-                        // allocation restarts at 0 from here.
-                        static_frame_allocation_size = 0;
-
                         unwind_codes.push(UnwindCode::SetFramePointer {
                             offset: unwind_offset,
                             sp_offset: 0,
@@ -113,7 +91,7 @@ pub(crate) fn create_unwind_info(
                        let imm: i64 = imm.into();
                        assert!(imm <= core::u32::MAX as i64);
 
-                        static_frame_allocation_size += imm as u32;
+                        stack_size = Some(imm as u32);
 
                        unwind_codes.push(UnwindCode::StackAlloc {
                            offset: unwind_offset,
@@ -123,52 +101,27 @@ pub(crate) fn create_unwind_info(
                    _ => {}
                }
            }
-            InstructionData::StackLoad {
-                opcode: Opcode::StackAddr,
-                stack_slot,
-                offset: _,
-            } => {
-                let result = func.dfg.inst_results(inst).get(0).unwrap();
-                if let ValueLoc::Reg(frame_reg) = func.locations[*result] {
-                    callee_save_region_reg = Some(frame_reg);
-
-                    // Figure out the offset in the call frame that `frame_reg` will have.
-                    let frame_size = func
-                        .stack_slots
-                        .layout_info
-                        .expect("func's stack slots have layout info if stack operations exist")
-                        .frame_size;
-                    // Because we're well after the prologue has been constructed, stack slots
-                    // must have been laid out...
-                    let slot_offset = func.stack_slots[stack_slot]
-                        .offset
-                        .expect("callee-save slot has an offset computed");
-                    let frame_offset = frame_size as i32 + slot_offset;
-
-                    callee_save_offset = Some(frame_offset as u32);
-                }
-            }
            InstructionData::Store {
                opcode: Opcode::Store,
                args: [arg1, arg2],
-                flags: _flags,
                offset,
+                ..
            } => {
-                if let (ValueLoc::Reg(ru), ValueLoc::Reg(base_ru)) =
+                if let (ValueLoc::Reg(src), ValueLoc::Reg(dst)) =
                    (func.locations[arg1], func.locations[arg2])
                {
-                    if Some(base_ru) == callee_save_region_reg {
-                        let offset_int: i32 = offset.into();
-                        assert!(offset_int >= 0, "negative fpr offset would store outside the stack frame, and is almost certainly an error");
-                        let offset_int: u32 = offset_int as u32 + callee_save_offset.expect("FPR presevation requires an FPR save region, which has some stack offset");
-                        if FPR.contains(ru) {
-                            saved_fpr = true;
-                            unwind_codes.push(UnwindCode::SaveXmm {
-                                offset: unwind_offset,
-                                reg: ru as u8,
-                                stack_offset: offset_int,
-                            });
-                        }
-                    }
+                    // If this is a save of an FPR, record an unwind operation
+                    // Note: the stack_offset here is relative to an adjusted SP
+                    // This will be fixed up later to be based on the frame pointer offset
+                    if dst == (RU::rsp as RegUnit) && FPR.contains(src) {
+                        let offset: i32 = offset.into();
+                        unwind_codes.push(UnwindCode::SaveXmm {
+                            offset: unwind_offset,
+                            reg: src as u8,
+                            stack_offset: offset as u32,
+                        });
+
+                        xmm_save_count += 1;
+                    }
                }
            }
@@ -183,41 +136,41 @@ pub(crate) fn create_unwind_info(
 
     assert!(found_end);
 
-    if saved_fpr {
-        if static_frame_allocation_size > 240 && saved_fpr {
-            warn!("stack frame is too large ({} bytes) to use with Windows x64 SEH when preserving FPRs. \
-                This is a Cranelift implementation limit, see \
-                https://github.com/bytecodealliance/wasmtime/issues/1475",
-                static_frame_allocation_size);
-            return Err(CodegenError::ImplLimitExceeded);
+    let mut frame_register_offset = 0;
+    if xmm_save_count > 0 {
+        // If there are XMM saves, determine the number of 16-byte slots used for all CSRs (including GPRs)
+        // The "frame register offset" will point at the last slot used (i.e. the last saved FPR)
+        // Assumption: each FPR is stored at a lower address than the previous one
+        let mut last_stack_offset = None;
+        let mut fpr_save_count: u8 = 0;
+        let mut gpr_push_count: u8 = 0;
+        for code in unwind_codes.iter_mut() {
+            match code {
+                UnwindCode::SaveXmm { stack_offset, .. } => {
+                    if let Some(last) = last_stack_offset {
+                        assert!(last > *stack_offset);
+                    }
+                    last_stack_offset = Some(*stack_offset);
+                    fpr_save_count += 1;
+                    *stack_offset = (xmm_save_count - fpr_save_count) as u32 * 16;
+                }
+                UnwindCode::PushRegister { .. } => {
+                    gpr_push_count += 1;
+                }
+                _ => {}
+            }
         }
-        // Only test static frame size is 16-byte aligned when an FPR is saved to avoid
-        // panicking when alignment is elided because no FPRs are saved and no child calls are
-        // made.
-        assert!(
-            static_frame_allocation_size % 16 == 0,
-            "static frame allocation must be a multiple of 16"
-        );
-    }
+        assert_eq!(fpr_save_count, xmm_save_count);
 
-    // Hack to avoid panicking unnecessarily. Because Cranelift generates prologues with RBP at
-    // one end of the call frame, and RSP at the other, required offsets are arbitrarily large.
-    // Windows x64 SEH only allows this offset be up to 240 bytes, however, meaning large
-    // frames are inexpressible, and we cannot actually compile the function. In case there are
-    // no preserved FPRs, we can lie without error and claim the offset to RBP is 0 - nothing
-    // will actually check it. This, then, avoids panics when compiling functions with large
-    // call frames.
-    let reported_frame_offset = if saved_fpr {
-        (static_frame_allocation_size / 16) as u8
-    } else {
-        0
-    };
+        // Account for alignment space when there's an odd number of GPR pushes
+        frame_register_offset = fpr_save_count + ((gpr_push_count + 1) / 2);
+    }
 
     Ok(Some(UnwindInfo {
         flags: 0, // this assumes cranelift functions have no SEH handlers
         prologue_size: prologue_size as u8,
         frame_register: frame_register.map(|r| GPR.index_of(r) as u8),
-        frame_register_offset: reported_frame_offset,
+        frame_register_offset,
         unwind_codes,
     }))
 }
@@ -284,7 +237,7 @@ mod tests {
                },
                UnwindCode::StackAlloc {
                    offset: 9,
-                    size: 64 + 32
+                    size: 64
                }
            ]
        }
@@ -303,7 +256,7 @@ mod tests {
            0x03, // Unwind code count (1 for stack alloc, 1 for save frame reg, 1 for push reg)
            0x05, // Frame register + offset (RBP with 0 offset)
            0x09, // Prolog offset
-            0xB2, // Operation 2 (small stack alloc), size = 0xB slots (e.g. (0xB * 8) + 8 = 96 (64 + 32) bytes)
+            0x72, // Operation 2 (small stack alloc), size = 0xB slots (e.g. (0x7 * 8) + 8 = 64 bytes)
            0x05, // Prolog offset
            0x03, // Operation 3 (save frame register), stack pointer offset = 0
            0x02, // Prolog offset
@@ -349,7 +302,7 @@ mod tests {
                },
                UnwindCode::StackAlloc {
                    offset: 27,
-                    size: 10000 + 32
+                    size: 10000
                }
            ]
        }
@@ -369,8 +322,8 @@ mod tests {
            0x05, // Frame register + offset (RBP with 0 offset)
            0x1B, // Prolog offset
            0x01, // Operation 1 (large stack alloc), size is scaled 16-bits (info = 0)
-            0xE6, // Low size byte
-            0x04, // High size byte (e.g. 0x04E6 * 8 = 100032 (10000 + 32) bytes)
+            0xE2, // Low size byte
+            0x04, // High size byte (e.g. 0x04E2 * 8 = 10000 bytes)
            0x05, // Prolog offset
            0x03, // Operation 3 (save frame register), stack pointer offset = 0
            0x02, // Prolog offset
@@ -414,7 +367,7 @@ mod tests {
                },
                UnwindCode::StackAlloc {
                    offset: 27,
-                    size: 1000000 + 32
+                    size: 1000000
                }
            ]
        }
@@ -434,10 +387,10 @@ mod tests {
            0x05, // Frame register + offset (RBP with 0 offset)
            0x1B, // Prolog offset
            0x11, // Operation 1 (large stack alloc), size is unscaled 32-bits (info = 1)
-            0x60, // Byte 1 of size
+            0x40, // Byte 1 of size
            0x42, // Byte 2 of size
            0x0F, // Byte 3 of size
-            0x00, // Byte 4 of size (size is 0xF4260 = 1000032 (1000000 + 32) bytes)
+            0x00, // Byte 4 of size (size is 0xF4240 = 1000000 bytes)
            0x05, // Prolog offset
            0x03, // Operation 3 (save frame register), stack pointer offset = 0
            0x02, // Prolog offset
@@ -8,29 +8,57 @@ function %one_arg(i64) windows_fastcall {
 block0(v0: i64):
     return
 }
 ; check: function %one_arg(i64 [%rcx], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
-; nextln: ss0 = incoming_arg 16, offset -48
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: i64 [%rbp]):
+; nextln: x86_push v1
+; nextln: copy_special %rsp -> %rbp
+; nextln: v2 = x86_pop.i64
+; nextln: return v2
+; nextln: }
 
 ; check if we still use registers for 4 arguments
 function %four_args(i64, i64, i64, i64) windows_fastcall {
 block0(v0: i64, v1: i64, v2: i64, v3: i64):
     return
 }
 ; check: function %four_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: i64 [%rdx], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: v5 = x86_pop.i64
+; nextln: return v5
+; nextln: }
 
 ; check if float arguments are passed through XMM registers
 function %four_float_args(f64, f64, f64, f64) windows_fastcall {
 block0(v0: f64, v1: f64, v2: f64, v3: f64):
     return
 }
 ; check: function %four_float_args(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: v5 = x86_pop.i64
+; nextln: return v5
+; nextln: }
 
 ; check if we use stack space for > 4 arguments
 function %five_args(i64, i64, i64, i64, i64) windows_fastcall {
 block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64):
     return
 }
 ; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 8, offset 32
+; nextln: ss1 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: i64 [%rdx], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [ss0], v5: i64 [%rbp]):
+; nextln: x86_push v5
+; nextln: copy_special %rsp -> %rbp
+; nextln: v6 = x86_pop.i64
+; nextln: return v6
+; nextln: }
 
 ; check that we preserve xmm6 and above if we're using them locally
 function %float_callee_saves(f64, f64, f64, f64) windows_fastcall {
@@ -40,38 +68,51 @@ block0(v0: f64, v1: f64, v2: f64, v3: f64):
 [-, %xmm7] v5 = fadd v0, v1
     return
 }
-; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall {
-; nextln: ss0 = explicit_slot 32, offset -80
-; nextln: ss1 = incoming_arg 16, offset -48
-; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]):
-; nextln: x86_push v6
-; nextln: copy_special %rsp -> %rbp
-; nextln: adjust_sp_down_imm 64
-; nextln: v7 = stack_addr.i64 ss0
-; nextln: store notrap aligned v8, v7
-; nextln: store notrap aligned v9, v7+16
-; check: v10 = stack_addr.i64 ss0
-; nextln: v11 = load.f64x2 notrap aligned v10
-; nextln: v12 = load.f64x2 notrap aligned v10+16
-; nextln: adjust_sp_up_imm 64
-; nextln: v13 = x86_pop.i64
-; nextln: v13, v11, v12
+; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 csr [%rsp], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall {
+; nextln: ss0 = incoming_arg 48, offset -48
+; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rsp], v7: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]):
+; nextln: x86_push v7
+; nextln: copy_special %rsp -> %rbp
+; nextln: adjust_sp_down_imm 32
+; nextln: store notrap aligned v8, v6+16
+; nextln: store notrap aligned v9, v6
+; nextln: v11 = load.f64x2 notrap aligned v6+16
+; nextln: v12 = load.f64x2 notrap aligned v6
+; nextln: adjust_sp_up_imm 32
+; nextln: v10 = x86_pop.i64
+; nextln: return v10, v11, v12
+; nextln: }
 
 function %mixed_int_float(i64, f64, i64, f32) windows_fastcall {
 block0(v0: i64, v1: f64, v2: i64, v3: f32):
     return
 }
 ; check: function %mixed_int_float(i64 [%rcx], f64 [%xmm1], i64 [%r8], f32 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: f64 [%xmm1], v2: i64 [%r8], v3: f32 [%xmm3], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: v5 = x86_pop.i64
+; nextln: return v5
+; nextln: }
 
 function %ret_val_float(f32, f64, i64, i64) -> f64 windows_fastcall {
 block0(v0: f32, v1: f64, v2: i64, v3: i64):
     return v1
 }
 ; check: function %ret_val_float(f32 [%xmm0], f64 [%xmm1], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> f64 [%xmm0], i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: f32 [%xmm0], v1: f64 [%xmm1], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: regmove v1, %xmm1 -> %xmm0
+; nextln: v5 = x86_pop.i64
+; nextln: return v1, v5
+; nextln: }
 
 function %internal_stack_arg_function_call(i64) -> i64 windows_fastcall {
-    fn0 = %foo(i64, i64, i64, i64) -> i64
-    fn1 = %foo2(i64, i64, i64, i64) -> i64
+    fn0 = %foo(i64, i64, i64, i64) -> i64 windows_fastcall
+    fn1 = %foo2(i64, i64, i64, i64) -> i64 windows_fastcall
 block0(v0: i64):
     v1 = load.i64 v0+0
     v2 = load.i64 v0+8
@@ -94,3 +135,100 @@ block0(v0: i64):
     store.i64 v9, v0+72
     return v10
 }
+; check: function %internal_stack_a(i64 [%rcx], i64 fp [%rbp], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15]) -> i64 [%rax], i64 fp [%rbp], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15] windows_fastcall {
+; nextln: ss0 = spill_slot 8, offset -56
+; nextln: ss1 = spill_slot 8, offset -64
+; nextln: ss2 = spill_slot 8, offset -72
+; nextln: ss3 = spill_slot 8, offset -80
+; nextln: ss4 = spill_slot 8, offset -88
+; nextln: ss5 = spill_slot 8, offset -96
+; nextln: ss6 = spill_slot 8, offset -104
+; nextln: ss7 = spill_slot 8, offset -112
+; nextln: ss8 = spill_slot 8, offset -120
+; nextln: ss9 = spill_slot 8, offset -128
+; nextln: ss10 = incoming_arg 48, offset -48
+; nextln: ss11 = explicit_slot 32, offset -160
+; nextln: sig0 = (i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9]) -> i64 [%rax] windows_fastcall
+; nextln: sig1 = (i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9]) -> i64 [%rax] windows_fastcall
+; nextln: fn0 = %foo sig0
+; nextln: fn1 = %foo2 sig1
+; check: block0(v11: i64 [%rcx], v52: i64 [%rbp], v53: i64 [%r12], v54: i64 [%r13], v55: i64 [%r14], v56: i64 [%r15]):
+; nextln: x86_push v52
+; nextln: copy_special %rsp -> %rbp
+; nextln: x86_push v53
+; nextln: x86_push v54
+; nextln: x86_push v55
+; nextln: x86_push v56
+; nextln: adjust_sp_down_imm 112
+; nextln: v0 = spill v11
+; nextln: v12 = copy_to_ssa.i64 %rcx
+; nextln: v13 = load.i64 v12
+; nextln: v1 = spill v13
+; nextln: v14 = fill_nop v0
+; nextln: v15 = load.i64 v14+8
+; nextln: v2 = spill v15
+; nextln: v16 = fill_nop v0
+; nextln: v17 = load.i64 v16+16
+; nextln: v3 = spill v17
+; nextln: v18 = fill_nop v0
+; nextln: v19 = load.i64 v18+24
+; nextln: v4 = spill v19
+; nextln: v20 = fill_nop v0
+; nextln: v21 = load.i64 v20+32
+; nextln: v5 = spill v21
+; nextln: v22 = fill_nop v0
+; nextln: v23 = load.i64 v22+40
+; nextln: v6 = spill v23
+; nextln: v24 = fill_nop v0
+; nextln: v25 = load.i64 v24+48
+; nextln: v7 = spill v25
+; nextln: v26 = fill_nop v0
+; nextln: v27 = load.i64 v26+56
+; nextln: v8 = spill v27
+; nextln: v28 = fill_nop v0
+; nextln: v29 = load.i64 v28+64
+; nextln: v9 = spill v29
+; nextln: v30 = fill v1
+; nextln: v31 = fill v2
+; nextln: v32 = fill v3
+; nextln: v33 = fill v4
+; nextln: regmove v30, %r15 -> %rcx
+; nextln: regmove v31, %r14 -> %rdx
+; nextln: regmove v32, %r13 -> %r8
+; nextln: regmove v33, %r12 -> %r9
+; nextln: v10 = call fn0(v30, v31, v32, v33)
+; nextln: v34 = fill v1
+; nextln: v35 = fill v0
+; nextln: store v34, v35+8
+; nextln: v36 = fill v2
+; nextln: v37 = fill_nop v0
+; nextln: store v36, v37+16
+; nextln: v38 = fill v3
+; nextln: v39 = fill_nop v0
+; nextln: store v38, v39+24
+; nextln: v40 = fill v4
+; nextln: v41 = fill_nop v0
+; nextln: store v40, v41+32
+; nextln: v42 = fill v5
+; nextln: v43 = fill_nop v0
+; nextln: store v42, v43+40
+; nextln: v44 = fill v6
+; nextln: v45 = fill_nop v0
+; nextln: store v44, v45+48
+; nextln: v46 = fill v7
+; nextln: v47 = fill_nop v0
+; nextln: store v46, v47+56
+; nextln: v48 = fill v8
+; nextln: v49 = fill_nop v0
+; nextln: store v48, v49+64
+; nextln: v50 = fill v9
+; nextln: v51 = fill_nop v0
+; nextln: store v50, v51+72
+; nextln: adjust_sp_up_imm 112
+; nextln: v61 = x86_pop.i64
+; nextln: v60 = x86_pop.i64
+; nextln: v59 = x86_pop.i64
+; nextln: v58 = x86_pop.i64
+; nextln: v57 = x86_pop.i64
+; nextln: return v10, v57, v58, v59, v60, v61
+; nextln: }
@@ -3,13 +3,35 @@ set opt_level=speed_and_size
 set is_pic
 target x86_64 haswell
 
-; check the unwind information with a function with no args
-function %no_args() windows_fastcall {
+; check the unwind information with a leaf function with no args
+function %no_args_leaf() windows_fastcall {
 block0:
     return
 }
 ; sameln: version: 1
 ; nextln: flags: 0
+; nextln: prologue size: 4
+; nextln: frame register: 5
+; nextln: frame register offset: 0
+; nextln: unwind codes: 2
+; nextln:
+; nextln: offset: 1
+; nextln: op: PushNonvolatileRegister
+; nextln: info: 5
+; nextln:
+; nextln: offset: 4
+; nextln: op: SetFramePointer
+; nextln: info: 0
+
+; check the unwind information with a non-leaf function with no args
+function %no_args() windows_fastcall {
+    fn0 = %foo()
+block0:
+    call fn0()
+    return
+}
+; sameln: version: 1
+; nextln: flags: 0
 ; nextln: prologue size: 8
 ; nextln: frame register: 5
 ; nextln: frame register offset: 0
@@ -51,7 +73,7 @@ block0:
 ; nextln: offset: 17
 ; nextln: op: LargeStackAlloc
 ; nextln: info: 0
-; nextln: value: 12504 (u16)
+; nextln: value: 12500 (u16)
 
 ; check a function with large-sized stack alloc
 function %large_stack() windows_fastcall {
@@ -77,7 +99,7 @@ block0:
 ; nextln: offset: 17
 ; nextln: op: LargeStackAlloc
 ; nextln: info: 1
-; nextln: value: 524320 (u32)
+; nextln: value: 524288 (u32)
 
 function %fpr_with_function_call(i64, i64) windows_fastcall {
     fn0 = %foo(f64, f64, i64, i64, i64) windows_fastcall;
@@ -113,9 +135,9 @@ block0(v0: i64, v1: i64):
 ;
 ; sameln: version: 1
 ; nextln: flags: 0
-; nextln: prologue size: 25
+; nextln: prologue size: 22
 ; nextln: frame register: 5
-; nextln: frame register offset: 12
+; nextln: frame register offset: 2
 ; nextln: unwind codes: 5
 ; nextln:
 ; nextln: offset: 1
@@ -135,10 +157,10 @@ block0(v0: i64, v1: i64):
 ; nextln: info: 0
 ; nextln: value: 23 (u16)
 ; nextln:
-; nextln: offset: 25
+; nextln: offset: 22
 ; nextln: op: SaveXmm128
 ; nextln: info: 15
-; nextln: value: 3 (u16)
+; nextln: value: 0 (u16)
 
 ; check a function that has CSRs
 function %lots_of_registers(i64, i64) windows_fastcall {
@@ -191,9 +213,9 @@ block0(v0: i64, v1: i64):
 }
 ; sameln: version: 1
 ; nextln: flags: 0
-; nextln: prologue size: 41
+; nextln: prologue size: 35
 ; nextln: frame register: 5
-; nextln: frame register offset: 10
+; nextln: frame register offset: 7
 ; nextln: unwind codes: 13
 ; nextln:
 ; nextln: offset: 1
@@ -234,19 +256,19 @@ block0(v0: i64, v1: i64):
 ; nextln:
 ; nextln: offset: 19
 ; nextln: op: SmallStackAlloc
-; nextln: info: 12
+; nextln: info: 8
 ; nextln:
-; nextln: offset: 31
+; nextln: offset: 24
 ; nextln: op: SaveXmm128
 ; nextln: info: 6
-; nextln: value: 0 (u16)
+; nextln: value: 2 (u16)
 ; nextln:
-; nextln: offset: 36
+; nextln: offset: 29
 ; nextln: op: SaveXmm128
 ; nextln: info: 7
 ; nextln: value: 1 (u16)
 ; nextln:
-; nextln: offset: 41
+; nextln: offset: 35
 ; nextln: op: SaveXmm128
 ; nextln: info: 8
-; nextln: value: 2 (u16)
+; nextln: value: 0 (u16)