windows fastcall (x64) call convention (#314)

* initial set of work for windows fastcall (x64) call convention

- call conventions: rename `fastcall` to `windows_fastcall`
- add initial set of filetests
- ensure arguments are written after the shadow space/store (offset-wise)
  The shadow space available before the arguments (range 0..32)
  is not used as spill space yet.

* address review feedback
This commit is contained in:
Steffen Butzer
2018-05-09 20:18:30 +02:00
committed by Dan Gohman
parent 09f883182d
commit 5aa84a744b
9 changed files with 246 additions and 16 deletions

View File

@@ -0,0 +1,34 @@
test compile
set is_64bit
set opt_level=best
set is_pic
isa x86 haswell
; check if for one arg we use the right register
function %one_arg(i64) windows_fastcall {
ebb0(v0: i64):
return
}
; check: function %one_arg(i64 [%rcx], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
; nextln: ss0 = incoming_arg 16, offset -48
; check if we still use registers for 4 arguments
function %four_args(i64, i64, i64, i64) windows_fastcall {
ebb0(v0: i64, v1: i64, v2: i64, v3: i64):
return
}
; check: function %four_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
; check if float arguments are passed through XMM registers
function %four_float_args(f64, f64, f64, f64) windows_fastcall {
ebb0(v0: f64, v1: f64, v2: f64, v3: f64):
return
}
; check: function %four_float_args(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
; check if we use stack space for > 4 arguments
function %five_args(i64, i64, i64, i64, i64) windows_fastcall {
ebb0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64):
return
}
; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {

View File

@@ -36,14 +36,22 @@ call_conv = EnumSetting(
- fast: not-ABI-stable convention for best performance
- cold: not-ABI-stable convention for infrequently executed code
- system_v: System V-style convention used on many platforms
- fastcall: Windows "fastcall" convention, also used for x64 and ARM
- windows_fastcall: Windows "fastcall" convention, also used for
x64 and ARM
- baldrdash: SpiderMonkey WebAssembly convention
- probestack: specialized convention for the probestack function
The default calling convention may be overridden by individual
functions.
""",
'fast', 'cold', 'system_v', 'fastcall', 'baldrdash', 'probestack')
'fast',
'cold',
'system_v',
'windows_fastcall',
'baldrdash',
'probestack'
)
# Note that Cretonne doesn't currently need an is_pie flag, because PIE is just
# PIC where symbols can't be pre-empted, which can be expressed with the

View File

@@ -382,7 +382,7 @@ mod tests {
CallConv::Fast,
CallConv::Cold,
CallConv::SystemV,
CallConv::Fastcall,
CallConv::WindowsFastcall,
CallConv::Baldrdash,
]
{

View File

@@ -22,6 +22,12 @@ static ARG_GPRS: [RU; 6] = [RU::rdi, RU::rsi, RU::rdx, RU::rcx, RU::r8, RU::r9];
/// Return value registers.
static RET_GPRS: [RU; 3] = [RU::rax, RU::rdx, RU::rcx];
/// Argument registers for x86-64, when using windows fastcall
static ARG_GPRS_WIN_FASTCALL_X64: [RU; 4] = [RU::rcx, RU::rdx, RU::r8, RU::r9];
/// Return value registers for x86-64, when using windows fastcall
static RET_GPRS_WIN_FASTCALL_X64: [RU; 1] = [RU::rax];
struct Args {
pointer_bytes: u32,
pointer_bits: u16,
@@ -36,6 +42,14 @@ struct Args {
impl Args {
fn new(bits: u16, gpr: &'static [RU], fpr_limit: usize, call_conv: CallConv) -> Self {
let offset = if let CallConv::WindowsFastcall = call_conv {
// [1] "The caller is responsible for allocating space for parameters to the callee,
// and must always allocate sufficient space to store four register parameters"
32
} else {
0
};
Self {
pointer_bytes: u32::from(bits) / 8,
pointer_bits: bits,
@@ -44,7 +58,7 @@ impl Args {
gpr_used: 0,
fpr_limit,
fpr_used: 0,
offset: 0,
offset,
call_conv,
}
}
@@ -120,7 +134,11 @@ pub fn legalize_signature(sig: &mut ir::Signature, flags: &shared_settings::Flag
if flags.is_64bit() {
bits = 64;
args = Args::new(bits, &ARG_GPRS, 8, sig.call_conv);
args = if sig.call_conv == CallConv::WindowsFastcall {
Args::new(bits, &ARG_GPRS_WIN_FASTCALL_X64[..], 4, sig.call_conv)
} else {
Args::new(bits, &ARG_GPRS[..], 8, sig.call_conv)
};
} else {
bits = 32;
args = Args::new(bits, &[], 0, sig.call_conv);
@@ -128,7 +146,13 @@ pub fn legalize_signature(sig: &mut ir::Signature, flags: &shared_settings::Flag
legalize_args(&mut sig.params, &mut args);
let mut rets = Args::new(bits, &RET_GPRS, 2, sig.call_conv);
let regs = if sig.call_conv == CallConv::WindowsFastcall {
&RET_GPRS_WIN_FASTCALL_X64[..]
} else {
&RET_GPRS[..]
};
let mut rets = Args::new(bits, regs, 2, sig.call_conv);
legalize_args(&mut sig.returns, &mut rets);
}
@@ -161,7 +185,24 @@ pub fn allocatable_registers(_func: &ir::Function, flags: &shared_settings::Flag
/// Get the set of callee-saved registers.
fn callee_saved_gprs(flags: &shared_settings::Flags) -> &'static [RU] {
if flags.is_64bit() {
&[RU::rbx, RU::r12, RU::r13, RU::r14, RU::r15]
if flags.call_conv() == CallConv::WindowsFastcall {
// "registers RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15 are considered nonvolatile
// and must be saved and restored by a function that uses them."
// as per https://msdn.microsoft.com/en-us/library/6t169e9c.aspx
// RSP & RSB are not listed below, since they are restored automatically during
// a function call. If that wasn't the case, function calls (RET) would not work.
&[
RU::rbx,
RU::rdi,
RU::rsi,
RU::r12,
RU::r13,
RU::r14,
RU::r15,
]
} else {
&[RU::rbx, RU::r12, RU::r13, RU::r14, RU::r15]
}
} else {
&[RU::rbx, RU::rsi, RU::rdi]
}
@@ -215,7 +256,7 @@ pub fn prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) -> result::Ct
CallConv::Fast | CallConv::Cold | CallConv::SystemV => {
system_v_prologue_epilogue(func, isa)
}
CallConv::Fastcall => unimplemented!("Windows calling conventions"),
CallConv::WindowsFastcall => fastcall_prologue_epilogue(func, isa),
CallConv::Baldrdash => baldrdash_prologue_epilogue(func, isa),
CallConv::Probestack => unimplemented!("probestack calling convention"),
}
@@ -240,6 +281,83 @@ pub fn baldrdash_prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) ->
Ok(())
}
/// Implementation of the fastcall-based Win64 calling convention described at [1]
/// [1] https://msdn.microsoft.com/en-us/library/ms235286.aspx
pub fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) -> result::CtonResult {
if !isa.flags().is_64bit() {
panic!("TODO: windows-fastcall: x86-32 not implemented yet");
}
// [1] "The primary exceptions are the stack pointer and malloc or alloca memory,
// which are aligned to 16 bytes in order to aid performance"
let stack_align = 16;
let word_size = if isa.flags().is_64bit() { 8 } else { 4 };
let reg_type = if isa.flags().is_64bit() {
ir::types::I64
} else {
ir::types::I32
};
let csrs = callee_saved_gprs_used(isa.flags(), func);
// [1] "Space is allocated on the call stack as a shadow store for callees to save"
// This shadow store contains the parameters which are passed through registers (ARG_GPRS)
// and is eventually used by the callee to save & restore the values of the arguments.
//
// [2] https://blogs.msdn.microsoft.com/oldnewthing/20110302-00/?p=11333
// "Although the x64 calling convention reserves spill space for parameters,
// you dont have to use them as such"
//
// The reserved stack area is composed of:
// return address + frame pointer + all callee-saved registers + shadow space
//
// Pushing the return address is an implicit function of the `call`
// instruction. Each of the others we will then push explicitly. Then we
// will adjust the stack pointer to make room for the rest of the required
// space for this frame.
const SHADOW_STORE_SIZE: i32 = 32;
let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32;
// TODO: eventually use the 32 bytes (shadow store) as spill slot. This currently doesn't work
// since cretonne does not support spill slots before incoming args
func.create_stack_slot(ir::StackSlotData {
kind: ir::StackSlotKind::IncomingArg,
size: csr_stack_size as u32,
offset: Some(-(SHADOW_STORE_SIZE + csr_stack_size)),
});
let total_stack_size = layout_stack(&mut func.stack_slots, stack_align)? as i32;
let local_stack_size = i64::from(total_stack_size - csr_stack_size);
// Add CSRs to function signature
let fp_arg = ir::AbiParam::special_reg(
reg_type,
ir::ArgumentPurpose::FramePointer,
RU::rbp as RegUnit,
);
func.signature.params.push(fp_arg);
func.signature.returns.push(fp_arg);
for csr in csrs.iter(GPR) {
let csr_arg = ir::AbiParam::special_reg(reg_type, ir::ArgumentPurpose::CalleeSaved, csr);
func.signature.params.push(csr_arg);
func.signature.returns.push(csr_arg);
}
// Set up the cursor and insert the prologue
let entry_ebb = func.layout.entry_block().expect("missing entry block");
let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_ebb);
insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, isa);
// Reset the cursor and insert the epilogue
let mut pos = pos.at_position(CursorPosition::Nowhere);
insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs);
Ok(())
}
/// Insert a System V-compatible prologue and epilogue.
pub fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) -> result::CtonResult {
// The original 32-bit x86 ELF ABI had a 4-byte aligned stack pointer, but
@@ -261,7 +379,7 @@ pub fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) -> r
// instruction. Each of the others we will then push explicitly. Then we
// will adjust the stack pointer to make room for the rest of the required
// space for this frame.
let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size as usize) as i32;
let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32;
func.create_stack_slot(ir::StackSlotData {
kind: ir::StackSlotKind::IncomingArg,
size: csr_stack_size as u32,
@@ -289,17 +407,18 @@ pub fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) -> r
// Set up the cursor and insert the prologue
let entry_ebb = func.layout.entry_block().expect("missing entry block");
let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_ebb);
insert_system_v_prologue(&mut pos, local_stack_size, reg_type, &csrs, isa);
insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, isa);
// Reset the cursor and insert the epilogue
let mut pos = pos.at_position(CursorPosition::Nowhere);
insert_system_v_epilogues(&mut pos, local_stack_size, reg_type, &csrs);
insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs);
Ok(())
}
/// Insert the prologue for a given function.
fn insert_system_v_prologue(
/// This is used by common calling conventions such as System V.
fn insert_common_prologue(
pos: &mut EncCursor,
stack_size: i64,
reg_type: ir::types::Type,
@@ -374,7 +493,7 @@ fn insert_system_v_prologue(
}
/// Find all `return` instructions and insert epilogues before them.
fn insert_system_v_epilogues(
fn insert_common_epilogues(
pos: &mut EncCursor,
stack_size: i64,
reg_type: ir::types::Type,
@@ -384,14 +503,15 @@ fn insert_system_v_epilogues(
pos.goto_last_inst(ebb);
if let Some(inst) = pos.current_inst() {
if pos.func.dfg[inst].opcode().is_return() {
insert_system_v_epilogue(inst, stack_size, pos, reg_type, csrs);
insert_common_epilogue(inst, stack_size, pos, reg_type, csrs);
}
}
}
}
/// Insert an epilogue given a specific `return` instruction.
fn insert_system_v_epilogue(
/// This is used by common calling conventions such as System V.
fn insert_common_epilogue(
inst: ir::Inst,
stack_size: i64,
pos: &mut EncCursor,

View File

@@ -37,7 +37,7 @@ pub fn builders() -> Result<(settings::Builder, isa::Builder), &'static str> {
if cfg!(any(unix, target_os = "nebulet")) {
flag_builder.set("call_conv", "system_v").unwrap();
} else if cfg!(windows) {
flag_builder.set("call_conv", "fastcall").unwrap();
flag_builder.set("call_conv", "windows_fastcall").unwrap();
} else {
return Err("unrecognized environment");
}

View File

@@ -16,6 +16,9 @@ region = "0.2.0"
libc = { version = "0.2.40", default-features = false }
errno = "0.2.3"
[target.'cfg(target_os = "windows")'.dependencies]
winapi = { version = "0.3", features = ["winbase", "memoryapi"] }
[features]
default = ["std"]
std = ["libc/use_std", "cretonne-codegen/std", "cretonne-module/std", "cretonne-native/std"]

View File

@@ -9,6 +9,8 @@ use cretonne_native;
use std::ffi::CString;
use std::ptr;
use libc;
#[cfg(windows)]
use winapi;
use memory::Memory;
/// A builder for `SimpleJITBackend`.
@@ -344,6 +346,7 @@ impl<'simple_jit_backend> Backend for SimpleJITBackend {
fn finish(self) -> () {}
}
#[cfg(not(windows))]
fn lookup_with_dlsym(name: &str) -> *const u8 {
let c_str = CString::new(name).unwrap();
let c_str_ptr = c_str.as_ptr();
@@ -354,6 +357,38 @@ fn lookup_with_dlsym(name: &str) -> *const u8 {
sym as *const u8
}
#[cfg(windows)]
fn lookup_with_dlsym(name: &str) -> *const u8 {
const MSVCRT_DLL: &[u8] = b"msvcrt.dll\0";
let c_str = CString::new(name).unwrap();
let c_str_ptr = c_str.as_ptr();
unsafe {
let handles = [
// try to find the searched symbol in the currently running executable
ptr::null_mut(),
// try to find the searched symbol in local c runtime
winapi::um::libloaderapi::GetModuleHandleA(MSVCRT_DLL.as_ptr() as *const i8),
];
for handle in &handles {
let addr = winapi::um::libloaderapi::GetProcAddress(*handle, c_str_ptr);
if addr.is_null() {
continue;
}
return addr as *const u8;
}
let msg = if handles[1].is_null() {
"(msvcrt not loaded)"
} else {
""
};
panic!("cannot resolve address of symbol {} {}", name, msg);
}
}
struct SimpleJITRelocSink {
pub relocs: Vec<RelocRecord>,
}

View File

@@ -23,6 +23,9 @@ extern crate errno;
extern crate region;
extern crate libc;
#[cfg(target_os = "windows")]
extern crate winapi;
mod backend;
mod memory;

View File

@@ -26,6 +26,7 @@ impl PtrLen {
/// Create a new `PtrLen` pointing to at least `size` bytes of memory,
/// suitably sized and aligned for memory protection.
#[cfg(not(target_os = "windows"))]
fn with_size(size: usize) -> Result<Self, String> {
let page_size = region::page::size();
let alloc_size = round_up_to_page_size(size, page_size);
@@ -42,6 +43,32 @@ impl PtrLen {
}
}
}
#[cfg(target_os = "windows")]
fn with_size(size: usize) -> Result<Self, String> {
use winapi::um::memoryapi::VirtualAlloc;
use winapi::um::winnt::{MEM_COMMIT, MEM_RESERVE, PAGE_READWRITE};
let page_size = region::page::size();
// VirtualAlloc always rounds up to the next multiple of the page size
let ptr = unsafe {
VirtualAlloc(
ptr::null_mut(),
size,
MEM_COMMIT | MEM_RESERVE,
PAGE_READWRITE,
)
};
if !ptr.is_null() {
Ok(Self {
ptr: ptr as *mut u8,
len: round_up_to_page_size(size, page_size),
})
} else {
Err(errno::errno().to_string())
}
}
}
/// JIT memory manager. This manages pages of suitably aligned and