Windows fastcall (x64) calling convention (#314)

* initial set of work for the Windows fastcall (x64) calling convention

- call conventions: rename `fastcall` to `windows_fastcall`
- add an initial set of filetests
- ensure arguments are written after the shadow space/store (offset-wise).
  The shadow space available before the arguments (range 0..32)
  is not used as spill space yet.

* address review feedback
Author:       Steffen Butzer
Date:         2018-05-09 20:18:30 +02:00
Committed by: Dan Gohman
Parent:       09f883182d
Commit:       5aa84a744b

9 changed files with 246 additions and 16 deletions

View File

@@ -0,0 +1,34 @@
test compile
set is_64bit
set opt_level=best
set is_pic
isa x86 haswell
; check that one argument is passed in the right register
function %one_arg(i64) windows_fastcall {
ebb0(v0: i64):
return
}
; check: function %one_arg(i64 [%rcx], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
; nextln: ss0 = incoming_arg 16, offset -48
; check if we still use registers for 4 arguments
function %four_args(i64, i64, i64, i64) windows_fastcall {
ebb0(v0: i64, v1: i64, v2: i64, v3: i64):
return
}
; check: function %four_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
; check if float arguments are passed through XMM registers
function %four_float_args(f64, f64, f64, f64) windows_fastcall {
ebb0(v0: f64, v1: f64, v2: f64, v3: f64):
return
}
; check: function %four_float_args(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
; check if we use stack space for > 4 arguments
function %five_args(i64, i64, i64, i64, i64) windows_fastcall {
ebb0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64):
return
}
; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
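The `[32]` in the last check line is the offset of the first stack argument: Win64 passes the first four integer arguments in rcx, rdx, r8, r9 and only then falls back to the stack, after the caller's 32-byte shadow space. A standalone sketch of that assignment rule (a hypothetical helper for illustration, not the crate's `Args` machinery):

```rust
// Minimal sketch of Win64 integer-argument assignment, assuming the documented
// rules: four register parameters, then stack slots starting after the
// caller-allocated 32-byte shadow space. (Hypothetical helper, not part of this PR.)
#[derive(Debug, PartialEq)]
enum ArgLoc {
    Reg(&'static str),
    Stack(u32), // byte offset relative to RSP at the call site
}

fn win64_int_arg(index: usize) -> ArgLoc {
    const REGS: [&str; 4] = ["rcx", "rdx", "r8", "r9"];
    if index < REGS.len() {
        ArgLoc::Reg(REGS[index])
    } else {
        // Bytes 0..32 are the shadow space, so the fifth argument lands at 32.
        ArgLoc::Stack((32 + 8 * (index - REGS.len())) as u32)
    }
}

fn main() {
    assert_eq!(win64_int_arg(0), ArgLoc::Reg("rcx"));
    assert_eq!(win64_int_arg(4), ArgLoc::Stack(32)); // matches the `[32]` in %five_args
}
```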

View File

@@ -36,14 +36,22 @@ call_conv = EnumSetting(
    - fast: not-ABI-stable convention for best performance
    - cold: not-ABI-stable convention for infrequently executed code
    - system_v: System V-style convention used on many platforms
-   - fastcall: Windows "fastcall" convention, also used for x64 and ARM
+   - windows_fastcall: Windows "fastcall" convention, also used for
+     x64 and ARM
    - baldrdash: SpiderMonkey WebAssembly convention
    - probestack: specialized convention for the probestack function

    The default calling convention may be overridden by individual
    functions.
    """,
-   'fast', 'cold', 'system_v', 'fastcall', 'baldrdash', 'probestack')
+   'fast',
+   'cold',
+   'system_v',
+   'windows_fastcall',
+   'baldrdash',
+   'probestack'
+   )

# Note that Cretonne doesn't currently need an is_pie flag, because PIE is just
# PIC where symbols can't be pre-empted, which can be expressed with the

View File

@@ -382,7 +382,7 @@ mod tests {
CallConv::Fast,
CallConv::Cold,
CallConv::SystemV,
- CallConv::Fastcall,
+ CallConv::WindowsFastcall,
CallConv::Baldrdash,
]
{

View File

@@ -22,6 +22,12 @@ static ARG_GPRS: [RU; 6] = [RU::rdi, RU::rsi, RU::rdx, RU::rcx, RU::r8, RU::r9];
/// Return value registers.
static RET_GPRS: [RU; 3] = [RU::rax, RU::rdx, RU::rcx];
/// Argument registers for x86-64, when using windows fastcall
static ARG_GPRS_WIN_FASTCALL_X64: [RU; 4] = [RU::rcx, RU::rdx, RU::r8, RU::r9];
/// Return value registers for x86-64, when using windows fastcall
static RET_GPRS_WIN_FASTCALL_X64: [RU; 1] = [RU::rax];
struct Args {
pointer_bytes: u32,
pointer_bits: u16,
@@ -36,6 +42,14 @@ struct Args {
impl Args {
fn new(bits: u16, gpr: &'static [RU], fpr_limit: usize, call_conv: CallConv) -> Self {
let offset = if let CallConv::WindowsFastcall = call_conv {
// [1] "The caller is responsible for allocating space for parameters to the callee,
// and must always allocate sufficient space to store four register parameters"
32
} else {
0
};
Self {
pointer_bytes: u32::from(bits) / 8,
pointer_bits: bits,
@@ -44,7 +58,7 @@ impl Args {
gpr_used: 0,
fpr_limit,
fpr_used: 0,
- offset: 0,
+ offset,
call_conv,
}
}
@@ -120,7 +134,11 @@ pub fn legalize_signature(sig: &mut ir::Signature, flags: &shared_settings::Flag
if flags.is_64bit() {
bits = 64;
- args = Args::new(bits, &ARG_GPRS, 8, sig.call_conv);
+ args = if sig.call_conv == CallConv::WindowsFastcall {
Args::new(bits, &ARG_GPRS_WIN_FASTCALL_X64[..], 4, sig.call_conv)
} else {
Args::new(bits, &ARG_GPRS[..], 8, sig.call_conv)
};
} else {
bits = 32;
args = Args::new(bits, &[], 0, sig.call_conv);
@@ -128,7 +146,13 @@ pub fn legalize_signature(sig: &mut ir::Signature, flags: &shared_settings::Flag
legalize_args(&mut sig.params, &mut args);

- let mut rets = Args::new(bits, &RET_GPRS, 2, sig.call_conv);
+ let regs = if sig.call_conv == CallConv::WindowsFastcall {
&RET_GPRS_WIN_FASTCALL_X64[..]
} else {
&RET_GPRS[..]
};
let mut rets = Args::new(bits, regs, 2, sig.call_conv);
legalize_args(&mut sig.returns, &mut rets);
}
@@ -161,7 +185,24 @@ pub fn allocatable_registers(_func: &ir::Function, flags: &shared_settings::Flag
/// Get the set of callee-saved registers.
fn callee_saved_gprs(flags: &shared_settings::Flags) -> &'static [RU] {
if flags.is_64bit() {
if flags.call_conv() == CallConv::WindowsFastcall {
// "registers RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15 are considered nonvolatile
// and must be saved and restored by a function that uses them."
// as per https://msdn.microsoft.com/en-us/library/6t169e9c.aspx
// RSP & RBP are not listed below, since they are restored automatically during
// a function call. If that wasn't the case, function calls (RET) would not work.
&[
RU::rbx,
RU::rdi,
RU::rsi,
RU::r12,
RU::r13,
RU::r14,
RU::r15,
]
} else {
&[RU::rbx, RU::r12, RU::r13, RU::r14, RU::r15]
}
} else {
&[RU::rbx, RU::rsi, RU::rdi]
}
@@ -215,7 +256,7 @@ pub fn prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) -> result::Ct
CallConv::Fast | CallConv::Cold | CallConv::SystemV => {
system_v_prologue_epilogue(func, isa)
}
- CallConv::Fastcall => unimplemented!("Windows calling conventions"),
+ CallConv::WindowsFastcall => fastcall_prologue_epilogue(func, isa),
CallConv::Baldrdash => baldrdash_prologue_epilogue(func, isa),
CallConv::Probestack => unimplemented!("probestack calling convention"),
}
@@ -240,6 +281,83 @@ pub fn baldrdash_prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) ->
Ok(())
}
/// Implementation of the fastcall-based Win64 calling convention described at [1]
/// [1] https://msdn.microsoft.com/en-us/library/ms235286.aspx
pub fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) -> result::CtonResult {
if !isa.flags().is_64bit() {
panic!("TODO: windows-fastcall: x86-32 not implemented yet");
}
// [1] "The primary exceptions are the stack pointer and malloc or alloca memory,
// which are aligned to 16 bytes in order to aid performance"
let stack_align = 16;
let word_size = if isa.flags().is_64bit() { 8 } else { 4 };
let reg_type = if isa.flags().is_64bit() {
ir::types::I64
} else {
ir::types::I32
};
let csrs = callee_saved_gprs_used(isa.flags(), func);
// [1] "Space is allocated on the call stack as a shadow store for callees to save"
// This shadow store contains the parameters which are passed through registers (ARG_GPRS)
// and is eventually used by the callee to save & restore the values of the arguments.
//
// [2] https://blogs.msdn.microsoft.com/oldnewthing/20110302-00/?p=11333
// "Although the x64 calling convention reserves spill space for parameters,
// you don't have to use them as such"
//
// The reserved stack area is composed of:
// return address + frame pointer + all callee-saved registers + shadow space
//
// Pushing the return address is an implicit function of the `call`
// instruction. Each of the others we will then push explicitly. Then we
// will adjust the stack pointer to make room for the rest of the required
// space for this frame.
const SHADOW_STORE_SIZE: i32 = 32;
let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32;
// TODO: eventually use the 32 bytes (shadow store) as spill slot. This currently doesn't work
// since cretonne does not support spill slots before incoming args
func.create_stack_slot(ir::StackSlotData {
kind: ir::StackSlotKind::IncomingArg,
size: csr_stack_size as u32,
offset: Some(-(SHADOW_STORE_SIZE + csr_stack_size)),
});
let total_stack_size = layout_stack(&mut func.stack_slots, stack_align)? as i32;
let local_stack_size = i64::from(total_stack_size - csr_stack_size);
// Add CSRs to function signature
let fp_arg = ir::AbiParam::special_reg(
reg_type,
ir::ArgumentPurpose::FramePointer,
RU::rbp as RegUnit,
);
func.signature.params.push(fp_arg);
func.signature.returns.push(fp_arg);
for csr in csrs.iter(GPR) {
let csr_arg = ir::AbiParam::special_reg(reg_type, ir::ArgumentPurpose::CalleeSaved, csr);
func.signature.params.push(csr_arg);
func.signature.returns.push(csr_arg);
}
// Set up the cursor and insert the prologue
let entry_ebb = func.layout.entry_block().expect("missing entry block");
let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_ebb);
insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, isa);
// Reset the cursor and insert the epilogue
let mut pos = pos.at_position(CursorPosition::Nowhere);
insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs);
Ok(())
}
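As a quick cross-check of the slot arithmetic above (a standalone sketch, not crate code): with no callee-saved GPRs in use, `csr_stack_size` is (0 + 2) * 8 = 16 and the slot offset is -(32 + 16) = -48, which is exactly the `ss0 = incoming_arg 16, offset -48` expected by the filetest at the top of this commit.

```rust
// Standalone sketch of the incoming_arg computation in fastcall_prologue_epilogue,
// assuming a 64-bit target (word_size = 8) and the Win64 32-byte shadow store.
fn incoming_arg_slot(num_csrs: usize) -> (u32, i32) {
    const SHADOW_STORE_SIZE: i32 = 32;
    let word_size = 8usize;
    // +2 accounts for the return address pushed by `call` and the saved rbp.
    let csr_stack_size = ((num_csrs + 2) * word_size) as i32;
    (csr_stack_size as u32, -(SHADOW_STORE_SIZE + csr_stack_size))
}

fn main() {
    // No callee-saved registers used: matches `ss0 = incoming_arg 16, offset -48`.
    assert_eq!(incoming_arg_slot(0), (16, -48));
    // Two callee-saved registers would give a 32-byte slot at offset -64.
    assert_eq!(incoming_arg_slot(2), (32, -64));
}
```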
/// Insert a System V-compatible prologue and epilogue.
pub fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) -> result::CtonResult {
// The original 32-bit x86 ELF ABI had a 4-byte aligned stack pointer, but
@@ -261,7 +379,7 @@ pub fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) -> r
// instruction. Each of the others we will then push explicitly. Then we
// will adjust the stack pointer to make room for the rest of the required
// space for this frame.
- let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size as usize) as i32;
+ let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32;
func.create_stack_slot(ir::StackSlotData {
kind: ir::StackSlotKind::IncomingArg,
size: csr_stack_size as u32,
@@ -289,17 +407,18 @@ pub fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) -> r
// Set up the cursor and insert the prologue
let entry_ebb = func.layout.entry_block().expect("missing entry block");
let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_ebb);
- insert_system_v_prologue(&mut pos, local_stack_size, reg_type, &csrs, isa);
+ insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, isa);

// Reset the cursor and insert the epilogue
let mut pos = pos.at_position(CursorPosition::Nowhere);
- insert_system_v_epilogues(&mut pos, local_stack_size, reg_type, &csrs);
+ insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs);

Ok(())
}

/// Insert the prologue for a given function.
/// This is used by common calling conventions such as System V.
- fn insert_system_v_prologue(
+ fn insert_common_prologue(
pos: &mut EncCursor,
stack_size: i64,
reg_type: ir::types::Type,
@@ -374,7 +493,7 @@ fn insert_system_v_prologue(
}

/// Find all `return` instructions and insert epilogues before them.
- fn insert_system_v_epilogues(
+ fn insert_common_epilogues(
pos: &mut EncCursor,
stack_size: i64,
reg_type: ir::types::Type,
@@ -384,14 +503,15 @@ fn insert_system_v_epilogues(
pos.goto_last_inst(ebb);
if let Some(inst) = pos.current_inst() {
if pos.func.dfg[inst].opcode().is_return() {
- insert_system_v_epilogue(inst, stack_size, pos, reg_type, csrs);
+ insert_common_epilogue(inst, stack_size, pos, reg_type, csrs);
}
}
}
}

/// Insert an epilogue given a specific `return` instruction.
/// This is used by common calling conventions such as System V.
- fn insert_system_v_epilogue(
+ fn insert_common_epilogue(
inst: ir::Inst,
stack_size: i64,
pos: &mut EncCursor,

View File

@@ -37,7 +37,7 @@ pub fn builders() -> Result<(settings::Builder, isa::Builder), &'static str> {
if cfg!(any(unix, target_os = "nebulet")) {
flag_builder.set("call_conv", "system_v").unwrap();
} else if cfg!(windows) {
- flag_builder.set("call_conv", "fastcall").unwrap();
+ flag_builder.set("call_conv", "windows_fastcall").unwrap();
} else {
return Err("unrecognized environment");
}

View File

@@ -16,6 +16,9 @@ region = "0.2.0"
libc = { version = "0.2.40", default-features = false }
errno = "0.2.3"
[target.'cfg(target_os = "windows")'.dependencies]
winapi = { version = "0.3", features = ["winbase", "memoryapi"] }
[features]
default = ["std"]
std = ["libc/use_std", "cretonne-codegen/std", "cretonne-module/std", "cretonne-native/std"]

View File

@@ -9,6 +9,8 @@ use cretonne_native;
use std::ffi::CString;
use std::ptr;
use libc;
#[cfg(windows)]
use winapi;
use memory::Memory;

/// A builder for `SimpleJITBackend`.
@@ -344,6 +346,7 @@ impl<'simple_jit_backend> Backend for SimpleJITBackend {
fn finish(self) -> () {}
}
#[cfg(not(windows))]
fn lookup_with_dlsym(name: &str) -> *const u8 {
let c_str = CString::new(name).unwrap();
let c_str_ptr = c_str.as_ptr();
@@ -354,6 +357,38 @@ fn lookup_with_dlsym(name: &str) -> *const u8 {
sym as *const u8
}
#[cfg(windows)]
fn lookup_with_dlsym(name: &str) -> *const u8 {
const MSVCRT_DLL: &[u8] = b"msvcrt.dll\0";
let c_str = CString::new(name).unwrap();
let c_str_ptr = c_str.as_ptr();
unsafe {
let handles = [
// try to find the searched symbol in the currently running executable
ptr::null_mut(),
// try to find the searched symbol in local c runtime
winapi::um::libloaderapi::GetModuleHandleA(MSVCRT_DLL.as_ptr() as *const i8),
];
for handle in &handles {
let addr = winapi::um::libloaderapi::GetProcAddress(*handle, c_str_ptr);
if addr.is_null() {
continue;
}
return addr as *const u8;
}
let msg = if handles[1].is_null() {
"(msvcrt not loaded)"
} else {
""
};
panic!("cannot resolve address of symbol {} {}", name, msg);
}
}
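For context, a hedged standalone sketch of the same Windows lookup path (it assumes the `winapi` crate with its `libloaderapi` feature enabled, and resolves `puts` the way the msvcrt fallback above does):

```rust
use std::ffi::CString;
use winapi::um::libloaderapi::{GetModuleHandleA, GetProcAddress};

fn main() {
    let module = CString::new("msvcrt.dll").unwrap();
    let symbol = CString::new("puts").unwrap();
    unsafe {
        // GetModuleHandleA only succeeds if msvcrt.dll is already loaded in
        // this process; a null handle means the fallback is unavailable.
        let handle = GetModuleHandleA(module.as_ptr());
        if handle.is_null() {
            println!("msvcrt.dll is not loaded in this process");
            return;
        }
        let addr = GetProcAddress(handle, symbol.as_ptr());
        println!("puts resolved to {:p}", addr as *const u8);
    }
}
```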
struct SimpleJITRelocSink {
pub relocs: Vec<RelocRecord>,
}

View File

@@ -23,6 +23,9 @@ extern crate errno;
extern crate region;
extern crate libc;
#[cfg(target_os = "windows")]
extern crate winapi;
mod backend;
mod memory;

View File

@@ -26,6 +26,7 @@ impl PtrLen {
/// Create a new `PtrLen` pointing to at least `size` bytes of memory,
/// suitably sized and aligned for memory protection.
#[cfg(not(target_os = "windows"))]
fn with_size(size: usize) -> Result<Self, String> {
let page_size = region::page::size();
let alloc_size = round_up_to_page_size(size, page_size);
@@ -42,6 +43,32 @@ impl PtrLen {
}
}
}
#[cfg(target_os = "windows")]
fn with_size(size: usize) -> Result<Self, String> {
use winapi::um::memoryapi::VirtualAlloc;
use winapi::um::winnt::{MEM_COMMIT, MEM_RESERVE, PAGE_READWRITE};
let page_size = region::page::size();
// VirtualAlloc always rounds up to the next multiple of the page size
let ptr = unsafe {
VirtualAlloc(
ptr::null_mut(),
size,
MEM_COMMIT | MEM_RESERVE,
PAGE_READWRITE,
)
};
if !ptr.is_null() {
Ok(Self {
ptr: ptr as *mut u8,
len: round_up_to_page_size(size, page_size),
})
} else {
Err(errno::errno().to_string())
}
}
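The `len` stored here can exceed the requested `size` because VirtualAlloc commits whole pages. A minimal sketch of the rounding that `round_up_to_page_size` (already used above) performs, assuming the page size is a power of two; the crate has its own helper, this standalone version is only for illustration:

```rust
// Mirrors the usual power-of-two rounding trick; requires page_size to be a
// power of two (always true for OS page sizes).
fn round_up_to_page_size(size: usize, page_size: usize) -> usize {
    (size + (page_size - 1)) & !(page_size - 1)
}

fn main() {
    assert_eq!(round_up_to_page_size(1, 4096), 4096);
    assert_eq!(round_up_to_page_size(4096, 4096), 4096);
    assert_eq!(round_up_to_page_size(4097, 4096), 8192);
}
```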
}

/// JIT memory manager. This manages pages of suitably aligned and