From 5aa84a744b738a777badf7c892bf8e388c184ede Mon Sep 17 00:00:00 2001 From: Steffen Butzer Date: Wed, 9 May 2018 20:18:30 +0200 Subject: [PATCH] windows fastcall (x64) call convention (#314) * initial set of work for windows fastcall (x64) call convention - call conventions: rename `fastcall` to `windows_fastcall` - add initial set of filetests - ensure arguments are written after the shadow space/store (offset-wise) The shadow space available before the arguments (range 0..32) is not used as spill space yet. * address review feedback --- .../isa/x86/windows_fastcall_x64.cton | 34 +++++ lib/codegen/meta/base/settings.py | 12 +- lib/codegen/src/ir/extfunc.rs | 2 +- lib/codegen/src/isa/x86/abi.rs | 144 ++++++++++++++++-- lib/native/src/lib.rs | 2 +- lib/simplejit/Cargo.toml | 3 + lib/simplejit/src/backend.rs | 35 +++++ lib/simplejit/src/lib.rs | 3 + lib/simplejit/src/memory.rs | 27 ++++ 9 files changed, 246 insertions(+), 16 deletions(-) create mode 100644 cranelift/filetests/isa/x86/windows_fastcall_x64.cton diff --git a/cranelift/filetests/isa/x86/windows_fastcall_x64.cton b/cranelift/filetests/isa/x86/windows_fastcall_x64.cton new file mode 100644 index 0000000000..204bd5bb70 --- /dev/null +++ b/cranelift/filetests/isa/x86/windows_fastcall_x64.cton @@ -0,0 +1,34 @@ +test compile +set is_64bit +set opt_level=best +set is_pic +isa x86 haswell + +; check if for one arg we use the right register +function %one_arg(i64) windows_fastcall { +ebb0(v0: i64): + return +} +; check: function %one_arg(i64 [%rcx], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall { +; nextln: ss0 = incoming_arg 16, offset -48 + +; check if we still use registers for 4 arguments +function %four_args(i64, i64, i64, i64) windows_fastcall { +ebb0(v0: i64, v1: i64, v2: i64, v3: i64): + return +} +; check: function %four_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall { + +; check if float arguments are passed through XMM registers +function 
%four_float_args(f64, f64, f64, f64) windows_fastcall { +ebb0(v0: f64, v1: f64, v2: f64, v3: f64): + return +} +; check: function %four_float_args(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall { + +; check if we use stack space for > 4 arguments +function %five_args(i64, i64, i64, i64, i64) windows_fastcall { +ebb0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64): + return +} +; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall { diff --git a/lib/codegen/meta/base/settings.py b/lib/codegen/meta/base/settings.py index 32df3d63b4..a5107e5b4c 100644 --- a/lib/codegen/meta/base/settings.py +++ b/lib/codegen/meta/base/settings.py @@ -36,14 +36,22 @@ call_conv = EnumSetting( - fast: not-ABI-stable convention for best performance - cold: not-ABI-stable convention for infrequently executed code - system_v: System V-style convention used on many platforms - - fastcall: Windows "fastcall" convention, also used for x64 and ARM + - windows_fastcall: Windows "fastcall" convention, also used for + x64 and ARM - baldrdash: SpiderMonkey WebAssembly convention - probestack: specialized convention for the probestack function The default calling convention may be overridden by individual functions. 
""", - 'fast', 'cold', 'system_v', 'fastcall', 'baldrdash', 'probestack') + + 'fast', + 'cold', + 'system_v', + 'windows_fastcall', + 'baldrdash', + 'probestack' +) # Note that Cretonne doesn't currently need an is_pie flag, because PIE is just # PIC where symbols can't be pre-empted, which can be expressed with the diff --git a/lib/codegen/src/ir/extfunc.rs b/lib/codegen/src/ir/extfunc.rs index 540083321e..95f86be8fd 100644 --- a/lib/codegen/src/ir/extfunc.rs +++ b/lib/codegen/src/ir/extfunc.rs @@ -382,7 +382,7 @@ mod tests { CallConv::Fast, CallConv::Cold, CallConv::SystemV, - CallConv::Fastcall, + CallConv::WindowsFastcall, CallConv::Baldrdash, ] { diff --git a/lib/codegen/src/isa/x86/abi.rs b/lib/codegen/src/isa/x86/abi.rs index 5ab9febb29..15b7d94654 100644 --- a/lib/codegen/src/isa/x86/abi.rs +++ b/lib/codegen/src/isa/x86/abi.rs @@ -22,6 +22,12 @@ static ARG_GPRS: [RU; 6] = [RU::rdi, RU::rsi, RU::rdx, RU::rcx, RU::r8, RU::r9]; /// Return value registers. static RET_GPRS: [RU; 3] = [RU::rax, RU::rdx, RU::rcx]; +/// Argument registers for x86-64, when using windows fastcall +static ARG_GPRS_WIN_FASTCALL_X64: [RU; 4] = [RU::rcx, RU::rdx, RU::r8, RU::r9]; + +/// Return value registers for x86-64, when using windows fastcall +static RET_GPRS_WIN_FASTCALL_X64: [RU; 1] = [RU::rax]; + struct Args { pointer_bytes: u32, pointer_bits: u16, @@ -36,6 +42,14 @@ struct Args { impl Args { fn new(bits: u16, gpr: &'static [RU], fpr_limit: usize, call_conv: CallConv) -> Self { + let offset = if let CallConv::WindowsFastcall = call_conv { + // [1] "The caller is responsible for allocating space for parameters to the callee, + // and must always allocate sufficient space to store four register parameters" + 32 + } else { + 0 + }; + Self { pointer_bytes: u32::from(bits) / 8, pointer_bits: bits, @@ -44,7 +58,7 @@ impl Args { gpr_used: 0, fpr_limit, fpr_used: 0, - offset: 0, + offset, call_conv, } } @@ -120,7 +134,11 @@ pub fn legalize_signature(sig: &mut ir::Signature, flags: 
&shared_settings::Flag if flags.is_64bit() { bits = 64; - args = Args::new(bits, &ARG_GPRS, 8, sig.call_conv); + args = if sig.call_conv == CallConv::WindowsFastcall { + Args::new(bits, &ARG_GPRS_WIN_FASTCALL_X64[..], 4, sig.call_conv) + } else { + Args::new(bits, &ARG_GPRS[..], 8, sig.call_conv) + }; } else { bits = 32; args = Args::new(bits, &[], 0, sig.call_conv); @@ -128,7 +146,13 @@ pub fn legalize_signature(sig: &mut ir::Signature, flags: &shared_settings::Flag legalize_args(&mut sig.params, &mut args); - let mut rets = Args::new(bits, &RET_GPRS, 2, sig.call_conv); + let regs = if sig.call_conv == CallConv::WindowsFastcall { + &RET_GPRS_WIN_FASTCALL_X64[..] + } else { + &RET_GPRS[..] + }; + + let mut rets = Args::new(bits, regs, 2, sig.call_conv); legalize_args(&mut sig.returns, &mut rets); } @@ -161,7 +185,24 @@ pub fn allocatable_registers(_func: &ir::Function, flags: &shared_settings::Flag /// Get the set of callee-saved registers. fn callee_saved_gprs(flags: &shared_settings::Flags) -> &'static [RU] { if flags.is_64bit() { - &[RU::rbx, RU::r12, RU::r13, RU::r14, RU::r15] + if flags.call_conv() == CallConv::WindowsFastcall { + // "registers RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15 are considered nonvolatile + // and must be saved and restored by a function that uses them." + // as per https://msdn.microsoft.com/en-us/library/6t169e9c.aspx + // RSP & RBP are not listed below, since they are restored automatically during + // a function call. If that wasn't the case, function calls (RET) would not work. 
+ &[ + RU::rbx, + RU::rdi, + RU::rsi, + RU::r12, + RU::r13, + RU::r14, + RU::r15, + ] + } else { + &[RU::rbx, RU::r12, RU::r13, RU::r14, RU::r15] + } } else { &[RU::rbx, RU::rsi, RU::rdi] } @@ -215,7 +256,7 @@ pub fn prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) -> result::Ct CallConv::Fast | CallConv::Cold | CallConv::SystemV => { system_v_prologue_epilogue(func, isa) } - CallConv::Fastcall => unimplemented!("Windows calling conventions"), + CallConv::WindowsFastcall => fastcall_prologue_epilogue(func, isa), CallConv::Baldrdash => baldrdash_prologue_epilogue(func, isa), CallConv::Probestack => unimplemented!("probestack calling convention"), } @@ -240,6 +281,83 @@ pub fn baldrdash_prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) -> Ok(()) } +/// Implementation of the fastcall-based Win64 calling convention described at [1] +/// [1] https://msdn.microsoft.com/en-us/library/ms235286.aspx +pub fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) -> result::CtonResult { + if !isa.flags().is_64bit() { + panic!("TODO: windows-fastcall: x86-32 not implemented yet"); + } + + // [1] "The primary exceptions are the stack pointer and malloc or alloca memory, + // which are aligned to 16 bytes in order to aid performance" + let stack_align = 16; + + let word_size = if isa.flags().is_64bit() { 8 } else { 4 }; + let reg_type = if isa.flags().is_64bit() { + ir::types::I64 + } else { + ir::types::I32 + }; + + let csrs = callee_saved_gprs_used(isa.flags(), func); + + // [1] "Space is allocated on the call stack as a shadow store for callees to save" + // This shadow store contains the parameters which are passed through registers (ARG_GPRS) + // and is eventually used by the callee to save & restore the values of the arguments. 
+ // + // [2] https://blogs.msdn.microsoft.com/oldnewthing/20110302-00/?p=11333 + // "Although the x64 calling convention reserves spill space for parameters, + // you don’t have to use them as such" + // + // The reserved stack area is composed of: + // return address + frame pointer + all callee-saved registers + shadow space + // + // Pushing the return address is an implicit function of the `call` + // instruction. Each of the others we will then push explicitly. Then we + // will adjust the stack pointer to make room for the rest of the required + // space for this frame. + const SHADOW_STORE_SIZE: i32 = 32; + let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32; + + // TODO: eventually use the 32 bytes (shadow store) as spill slot. This currently doesn't work + // since cretonne does not support spill slots before incoming args + + func.create_stack_slot(ir::StackSlotData { + kind: ir::StackSlotKind::IncomingArg, + size: csr_stack_size as u32, + offset: Some(-(SHADOW_STORE_SIZE + csr_stack_size)), + }); + + let total_stack_size = layout_stack(&mut func.stack_slots, stack_align)? 
as i32; + let local_stack_size = i64::from(total_stack_size - csr_stack_size); + + // Add CSRs to function signature + let fp_arg = ir::AbiParam::special_reg( + reg_type, + ir::ArgumentPurpose::FramePointer, + RU::rbp as RegUnit, + ); + func.signature.params.push(fp_arg); + func.signature.returns.push(fp_arg); + + for csr in csrs.iter(GPR) { + let csr_arg = ir::AbiParam::special_reg(reg_type, ir::ArgumentPurpose::CalleeSaved, csr); + func.signature.params.push(csr_arg); + func.signature.returns.push(csr_arg); + } + + // Set up the cursor and insert the prologue + let entry_ebb = func.layout.entry_block().expect("missing entry block"); + let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_ebb); + insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, isa); + + // Reset the cursor and insert the epilogue + let mut pos = pos.at_position(CursorPosition::Nowhere); + insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs); + + Ok(()) +} + /// Insert a System V-compatible prologue and epilogue. pub fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) -> result::CtonResult { // The original 32-bit x86 ELF ABI had a 4-byte aligned stack pointer, but @@ -261,7 +379,7 @@ pub fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) -> r // instruction. Each of the others we will then push explicitly. Then we // will adjust the stack pointer to make room for the rest of the required // space for this frame. 
- let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size as usize) as i32; + let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32; func.create_stack_slot(ir::StackSlotData { kind: ir::StackSlotKind::IncomingArg, size: csr_stack_size as u32, @@ -289,17 +407,18 @@ pub fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &TargetIsa) -> r // Set up the cursor and insert the prologue let entry_ebb = func.layout.entry_block().expect("missing entry block"); let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_ebb); - insert_system_v_prologue(&mut pos, local_stack_size, reg_type, &csrs, isa); + insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, isa); // Reset the cursor and insert the epilogue let mut pos = pos.at_position(CursorPosition::Nowhere); - insert_system_v_epilogues(&mut pos, local_stack_size, reg_type, &csrs); + insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs); Ok(()) } /// Insert the prologue for a given function. -fn insert_system_v_prologue( +/// This is used by common calling conventions such as System V. +fn insert_common_prologue( pos: &mut EncCursor, stack_size: i64, reg_type: ir::types::Type, @@ -374,7 +493,7 @@ fn insert_system_v_prologue( } /// Find all `return` instructions and insert epilogues before them. -fn insert_system_v_epilogues( +fn insert_common_epilogues( pos: &mut EncCursor, stack_size: i64, reg_type: ir::types::Type, @@ -384,14 +503,15 @@ fn insert_system_v_epilogues( pos.goto_last_inst(ebb); if let Some(inst) = pos.current_inst() { if pos.func.dfg[inst].opcode().is_return() { - insert_system_v_epilogue(inst, stack_size, pos, reg_type, csrs); + insert_common_epilogue(inst, stack_size, pos, reg_type, csrs); } } } } /// Insert an epilogue given a specific `return` instruction. -fn insert_system_v_epilogue( +/// This is used by common calling conventions such as System V. 
+fn insert_common_epilogue( inst: ir::Inst, stack_size: i64, pos: &mut EncCursor, diff --git a/lib/native/src/lib.rs b/lib/native/src/lib.rs index f5050bea76..734c33d1f6 100644 --- a/lib/native/src/lib.rs +++ b/lib/native/src/lib.rs @@ -37,7 +37,7 @@ pub fn builders() -> Result<(settings::Builder, isa::Builder), &'static str> { if cfg!(any(unix, target_os = "nebulet")) { flag_builder.set("call_conv", "system_v").unwrap(); } else if cfg!(windows) { - flag_builder.set("call_conv", "fastcall").unwrap(); + flag_builder.set("call_conv", "windows_fastcall").unwrap(); } else { return Err("unrecognized environment"); } diff --git a/lib/simplejit/Cargo.toml b/lib/simplejit/Cargo.toml index 6c87388761..d672c1dc63 100644 --- a/lib/simplejit/Cargo.toml +++ b/lib/simplejit/Cargo.toml @@ -16,6 +16,9 @@ region = "0.2.0" libc = { version = "0.2.40", default-features = false } errno = "0.2.3" +[target.'cfg(target_os = "windows")'.dependencies] +winapi = { version = "0.3", features = ["winbase", "memoryapi"] } + [features] default = ["std"] std = ["libc/use_std", "cretonne-codegen/std", "cretonne-module/std", "cretonne-native/std"] diff --git a/lib/simplejit/src/backend.rs b/lib/simplejit/src/backend.rs index c7785e04a7..1fcfd8f93e 100644 --- a/lib/simplejit/src/backend.rs +++ b/lib/simplejit/src/backend.rs @@ -9,6 +9,8 @@ use cretonne_native; use std::ffi::CString; use std::ptr; use libc; +#[cfg(windows)] +use winapi; use memory::Memory; /// A builder for `SimpleJITBackend`. 
@@ -344,6 +346,7 @@ impl<'simple_jit_backend> Backend for SimpleJITBackend { fn finish(self) -> () {} } +#[cfg(not(windows))] fn lookup_with_dlsym(name: &str) -> *const u8 { let c_str = CString::new(name).unwrap(); let c_str_ptr = c_str.as_ptr(); @@ -354,6 +357,38 @@ fn lookup_with_dlsym(name: &str) -> *const u8 { sym as *const u8 } +#[cfg(windows)] +fn lookup_with_dlsym(name: &str) -> *const u8 { + const MSVCRT_DLL: &[u8] = b"msvcrt.dll\0"; + + let c_str = CString::new(name).unwrap(); + let c_str_ptr = c_str.as_ptr(); + + unsafe { + let handles = [ + // try to find the searched symbol in the currently running executable + ptr::null_mut(), + // try to find the searched symbol in local c runtime + winapi::um::libloaderapi::GetModuleHandleA(MSVCRT_DLL.as_ptr() as *const i8), + ]; + + for handle in &handles { + let addr = winapi::um::libloaderapi::GetProcAddress(*handle, c_str_ptr); + if addr.is_null() { + continue; + } + return addr as *const u8; + } + + let msg = if handles[1].is_null() { + "(msvcrt not loaded)" + } else { + "" + }; + panic!("cannot resolve address of symbol {} {}", name, msg); + } +} + struct SimpleJITRelocSink { pub relocs: Vec, } diff --git a/lib/simplejit/src/lib.rs b/lib/simplejit/src/lib.rs index 68a2ca00ea..92c088cd71 100644 --- a/lib/simplejit/src/lib.rs +++ b/lib/simplejit/src/lib.rs @@ -23,6 +23,9 @@ extern crate errno; extern crate region; extern crate libc; +#[cfg(target_os = "windows")] +extern crate winapi; + mod backend; mod memory; diff --git a/lib/simplejit/src/memory.rs b/lib/simplejit/src/memory.rs index 2158352cc8..3ec2995372 100644 --- a/lib/simplejit/src/memory.rs +++ b/lib/simplejit/src/memory.rs @@ -26,6 +26,7 @@ impl PtrLen { /// Create a new `PtrLen` pointing to at least `size` bytes of memory, /// suitably sized and aligned for memory protection. 
+ #[cfg(not(target_os = "windows"))] fn with_size(size: usize) -> Result { let page_size = region::page::size(); let alloc_size = round_up_to_page_size(size, page_size); @@ -42,6 +43,32 @@ impl PtrLen { } } } + + #[cfg(target_os = "windows")] + fn with_size(size: usize) -> Result { + use winapi::um::memoryapi::VirtualAlloc; + use winapi::um::winnt::{MEM_COMMIT, MEM_RESERVE, PAGE_READWRITE}; + + let page_size = region::page::size(); + + // VirtualAlloc always rounds up to the next multiple of the page size + let ptr = unsafe { + VirtualAlloc( + ptr::null_mut(), + size, + MEM_COMMIT | MEM_RESERVE, + PAGE_READWRITE, + ) + }; + if !ptr.is_null() { + Ok(Self { + ptr: ptr as *mut u8, + len: round_up_to_page_size(size, page_size), + }) + } else { + Err(errno::errno().to_string()) + } + } } /// JIT memory manager. This manages pages of suitably aligned and