diff --git a/src/backend.rs b/src/backend.rs
index e74c85af9b..bbf1abe663 100644
--- a/src/backend.rs
+++ b/src/backend.rs
@@ -10,6 +10,7 @@ const WORD_SIZE: u32 = 8;
 
 type GPR = u8;
 
+#[derive(Copy, Clone)]
 struct GPRs {
     bits: u16,
 }
@@ -36,13 +37,19 @@ const R12: u8 = 12;
 const R13: u8 = 13;
 const R14: u8 = 14;
 const R15: u8 = 15;
+const NUM_GPRS: u8 = 16;
 
 impl GPRs {
     fn take(&mut self) -> GPR {
         let lz = self.bits.trailing_zeros();
-        assert!(lz < 32, "ran out of free GPRs");
-        self.bits &= !(1 << lz);
-        lz as GPR
+        assert!(lz < NUM_GPRS as u32, "ran out of free GPRs");
+        let gpr = lz as GPR;
+        self.mark_used(gpr);
+        gpr
+    }
+
+    fn mark_used(&mut self, gpr: GPR) {
+        self.bits &= !(1 << gpr as u16);
     }
 
     fn release(&mut self, gpr: GPR) {
@@ -50,62 +57,80 @@ impl GPRs {
         self.bits |= 1 << gpr;
     }
 
+    fn free_count(&self) -> u32 {
+        self.bits.count_ones()
+    }
+
     fn is_free(&self, gpr: GPR) -> bool {
         (self.bits & (1 << gpr)) != 0
     }
 }
 
+#[derive(Copy, Clone)]
 pub struct Registers {
-    scratch_gprs: GPRs,
+    scratch: GPRs,
+}
+
+impl Default for Registers {
+    fn default() -> Self {
+        Self::new()
+    }
 }
 
 impl Registers {
     pub fn new() -> Self {
         let mut result = Self {
-            scratch_gprs: GPRs::new(),
+            scratch: GPRs::new(),
         };
 
         // Give ourselves a few scratch registers to work with, for now.
-        result.release_scratch_gpr(RAX);
-        result.release_scratch_gpr(RCX);
-        result.release_scratch_gpr(RDX);
+        for &scratch in SCRATCH_REGS {
+            result.release_scratch_gpr(scratch);
+        }
+
         result
     }
 
+    // TODO: Add a function that takes a scratch register if possible
+    //       but otherwise gives a fresh stack location.
     pub fn take_scratch_gpr(&mut self) -> GPR {
-        self.scratch_gprs.take()
+        self.scratch.take()
    }
 
     pub fn release_scratch_gpr(&mut self, gpr: GPR) {
-        self.scratch_gprs.release(gpr);
+        self.scratch.release(gpr);
+    }
+
+    pub fn is_free(&self, gpr: GPR) -> bool {
+        self.scratch.is_free(gpr)
+    }
+
+    pub fn free_scratch(&self) -> u32 {
+        self.scratch.free_count()
     }
 }
 
-/// Describes location of a argument.
-#[derive(Debug)]
-enum ArgLocation {
-    /// Argument is passed via some register.
+/// Describes the location of a value.
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+enum ValueLocation {
+    /// Value exists in a register.
     Reg(GPR),
-    /// Value is passed thru the stack.
+    /// Value exists on the stack. This is an offset relative to the
+    /// first local, and so will have to be adjusted with `adjusted_offset`
+    /// before reading (as RSP may have been changed by `push`/`pop`).
     Stack(i32),
 }
 
 // TODO: This assumes only system-v calling convention.
 // In system-v calling convention the first 6 arguments are passed via registers.
 // All rest arguments are passed on the stack.
-const ARGS_IN_GPRS: &'static [GPR] = &[RDI, RSI, RDX, RCX, R8, R9];
-
-/// Get a location for an argument at the given position.
-fn abi_loc_for_arg(pos: u32) -> ArgLocation {
-    if let Some(&reg) = ARGS_IN_GPRS.get(pos as usize) {
-        ArgLocation::Reg(reg)
-    } else {
-        let stack_pos = pos - ARGS_IN_GPRS.len() as u32;
-        // +2 is because the first argument is located right after the saved frame pointer slot
-        // and the incoming return address.
-        let stack_offset = ((stack_pos + 2) * WORD_SIZE) as i32;
-        ArgLocation::Stack(stack_offset)
-    }
-}
+const ARGS_IN_GPRS: &[GPR] = &[RDI, RSI, RDX, RCX, R8, R9];
+// RAX is reserved for return values. In the future we want a system to allow
+// use of specific registers by saving/restoring them. This would allow using
+// RAX as a scratch register when we're not calling a function, and would also
+// allow us to call instructions that require specific registers.
+//
+// List of scratch registers taken from https://wiki.osdev.org/System_V_ABI
+const SCRATCH_REGS: &[GPR] = &[R10, R11];
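+// With only R10 and R11 released, `GPRs::bits` (a bitset of the *free*
+// registers) is `0b0000_1100_0000_0000`; `take` then returns R10, the lowest
+// set bit, and clears bit 10.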
 
 pub struct CodeGenSession {
     assembler: Assembler,
@@ -138,8 +163,8 @@ impl CodeGenSession {
         Context {
             asm: &mut self.assembler,
             func_starts: &self.func_starts,
-            regs: Registers::new(),
-            sp_depth: StackDepth(0),
+            block_state: Default::default(),
+            locals: Default::default(),
         }
     }
 
@@ -177,14 +202,78 @@ impl TranslatedCodeSection {
     }
 }
 
+// TODO: Immediates? We could implement on-the-fly const folding.
+#[derive(Copy, Clone)]
+enum Value {
+    Local(u32),
+    Temp(GPR),
+}
+
+impl Value {
+    fn location(&self, locals: &Locals) -> ValueLocation {
+        match *self {
+            Value::Local(loc) => local_location(locals, loc),
+            Value::Temp(reg) => ValueLocation::Reg(reg),
+        }
+    }
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+enum StackValue {
+    Local(u32),
+    Temp(GPR),
+    Pop,
+}
+
+impl StackValue {
+    fn location(&self, locals: &Locals) -> Option<ValueLocation> {
+        match *self {
+            StackValue::Local(loc) => Some(local_location(locals, loc)),
+            StackValue::Temp(reg) => Some(ValueLocation::Reg(reg)),
+            StackValue::Pop => None,
+        }
+    }
+}
+
+#[derive(Default)]
+struct Locals {
+    // TODO: Use `ArrayVec` since we have a hard maximum (the number of registers)
+    locs: Vec<ValueLocation>,
+}
+
+#[derive(Default, Clone)]
+pub struct BlockState {
+    stack: Stack,
+    depth: StackDepth,
+    regs: Registers,
+}
+
+fn adjusted_offset(ctx: &mut Context, offset: i32) -> i32 {
+    (ctx.block_state.depth.0 * WORD_SIZE) as i32 + offset
+}
+
+fn local_location(locals: &Locals, index: u32) -> ValueLocation {
+    locals
+        .locs
+        .get(index as usize)
+        .cloned()
+        .unwrap_or(ValueLocation::Stack(
+            (index.saturating_sub(ARGS_IN_GPRS.len() as u32) * WORD_SIZE) as _,
+        ))
+}
+
+type Stack = Vec<StackValue>;
+
 pub struct Context<'a> {
     asm: &'a mut Assembler,
     func_starts: &'a Vec<(Option<AssemblyOffset>, DynamicLabel)>,
-    regs: Registers,
     /// Each push and pop on the value stack increments or decrements this value by 1 respectively.
-    sp_depth: StackDepth,
+    block_state: BlockState,
+    locals: Locals,
 }
 
+impl<'a> Context<'a> {}
+
 /// Label in code.
 #[derive(Debug, Copy, Clone, PartialEq, Eq)]
 pub struct Label(DynamicLabel);
@@ -203,7 +292,7 @@ pub fn define_label(ctx: &mut Context, label: Label) {
 }
 
 /// Offset from starting value of SP counted in words.
-#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
 pub struct StackDepth(u32);
 
 impl StackDepth {
@@ -216,146 +305,298 @@ impl StackDepth {
     }
 }
 
-pub fn current_stack_depth(ctx: &Context) -> StackDepth {
-    ctx.sp_depth
+pub fn current_block_state(ctx: &Context) -> BlockState {
+    ctx.block_state.clone()
 }
 
-pub fn restore_stack_depth(ctx: &mut Context, stack_depth: StackDepth) {
-    ctx.sp_depth = stack_depth;
+pub fn restore_block_state(ctx: &mut Context, block_state: BlockState) {
+    ctx.block_state = block_state;
 }
 
-fn push_i32(ctx: &mut Context, gpr: GPR) {
-    // For now, do an actual push (and pop below). In the future, we could
-    // do on-the-fly register allocation here.
-    ctx.sp_depth.reserve(1);
-    dynasm!(ctx.asm
-        ; push Rq(gpr)
-    );
-    ctx.regs.release_scratch_gpr(gpr);
-}
-
-fn pop_i32(ctx: &mut Context) -> GPR {
-    ctx.sp_depth.free(1);
-    let gpr = ctx.regs.take_scratch_gpr();
-    dynasm!(ctx.asm
-        ; pop Rq(gpr)
-    );
-    gpr
-}
-
+pub fn push_return_value(ctx: &mut Context) {
+    ctx.block_state.stack.push(StackValue::Temp(RAX));
+}
+
+fn push_i32(ctx: &mut Context, value: Value) {
+    let stack_loc = match value {
+        Value::Local(loc) => StackValue::Local(loc),
+        Value::Temp(gpr) => {
+            if ctx.block_state.regs.free_scratch() >= 1 {
+                StackValue::Temp(gpr)
+            } else {
+                ctx.block_state.depth.reserve(1);
+                dynasm!(ctx.asm
+                    ; push Rq(gpr)
+                );
+                ctx.block_state.regs.release_scratch_gpr(gpr);
+                StackValue::Pop
+            }
+        }
+    };
+
+    ctx.block_state.stack.push(stack_loc);
+}
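+
+// Note the interplay with `pop_i32` below: with the two `SCRATCH_REGS`, the
+// first temporary pushed stays in a register as `StackValue::Temp`, while a
+// second finds no spare scratch register and is spilled with a real `push`,
+// recorded as `StackValue::Pop` and re-materialized later with a real `pop`.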
+
+fn pop_i32(ctx: &mut Context) -> Value {
+    match ctx.block_state.stack.pop().expect("Stack is empty") {
+        StackValue::Local(loc) => Value::Local(loc),
+        StackValue::Temp(reg) => Value::Temp(reg),
+        StackValue::Pop => {
+            ctx.block_state.depth.free(1);
+            let gpr = ctx.block_state.regs.take_scratch_gpr();
+            dynasm!(ctx.asm
+                ; pop Rq(gpr)
+            );
+            Value::Temp(gpr)
+        }
+    }
+}
+
+fn pop_i32_into(ctx: &mut Context, dst: ValueLocation) {
+    let val = pop_i32(ctx);
+    let val_loc = val.location(&ctx.locals);
+    copy_value(ctx, val_loc, dst);
+    free_val(ctx, val);
+}
+
+fn free_val(ctx: &mut Context, val: Value) {
+    match val {
+        Value::Temp(reg) => ctx.block_state.regs.release_scratch_gpr(reg),
+        Value::Local(_) => {}
+    }
+}
+
+/// Puts this value into a register so that it can be efficiently read.
+fn into_reg(ctx: &mut Context, val: Value) -> GPR {
+    match val.location(&ctx.locals) {
+        ValueLocation::Stack(offset) => {
+            let offset = adjusted_offset(ctx, offset);
+            let scratch = ctx.block_state.regs.take_scratch_gpr();
+            dynasm!(ctx.asm
+                ; mov Rq(scratch), [rsp + offset]
+            );
+            scratch
+        }
+        ValueLocation::Reg(reg) => reg,
+    }
+}
+
+/// Puts this value into a temporary register so that operations
+/// on that register don't write to a local.
+fn into_temp_reg(ctx: &mut Context, val: Value) -> GPR {
+    match val {
+        Value::Local(loc) => {
+            let scratch = ctx.block_state.regs.take_scratch_gpr();
+
+            match local_location(&ctx.locals, loc) {
+                ValueLocation::Stack(offset) => {
+                    let offset = adjusted_offset(ctx, offset);
+                    dynasm!(ctx.asm
+                        ; mov Rq(scratch), [rsp + offset]
+                    );
+                }
+                ValueLocation::Reg(reg) => {
+                    dynasm!(ctx.asm
+                        ; mov Rq(scratch), Rq(reg)
+                    );
+                }
+            }
+
+            scratch
+        }
+        Value::Temp(reg) => reg,
+    }
+}
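+
+// E.g. for a `Value::Local` held in RDI, `into_reg` returns RDI itself (no
+// copy), while `into_temp_reg` first copies it into a scratch register so
+// that an in-place `add`/`sub` below doesn't clobber the local.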
+
+// TODO: For the commutative instructions we can do operands in either
+//       order, so we can choose the operand order that creates the
+//       least unnecessary temps.
 pub fn i32_add(ctx: &mut Context) {
     let op0 = pop_i32(ctx);
-    let op1 = pop_i32(ctx);
-    dynasm!(ctx.asm
-        ; add Rd(op1), Rd(op0)
-    );
-    push_i32(ctx, op1);
-    ctx.regs.release_scratch_gpr(op0);
+    let tmp = pop_i32(ctx);
+    let op1 = into_temp_reg(ctx, tmp);
+    match op0.location(&ctx.locals) {
+        ValueLocation::Reg(reg) => {
+            dynasm!(ctx.asm
+                ; add Rd(op1), Rd(reg)
+            );
+        }
+        ValueLocation::Stack(offset) => {
+            let offset = adjusted_offset(ctx, offset);
+            dynasm!(ctx.asm
+                ; add Rd(op1), [rsp + offset]
+            );
+        }
+    }
+    ctx.block_state.stack.push(StackValue::Temp(op1));
+    free_val(ctx, op0);
 }
 
 pub fn i32_sub(ctx: &mut Context) {
     let op0 = pop_i32(ctx);
-    let op1 = pop_i32(ctx);
-    dynasm!(ctx.asm
-        ; sub Rd(op1), Rd(op0)
-    );
-    push_i32(ctx, op1);
-    ctx.regs.release_scratch_gpr(op0);
+    let tmp = pop_i32(ctx);
+    let op1 = into_temp_reg(ctx, tmp);
+    match op0.location(&ctx.locals) {
+        ValueLocation::Reg(reg) => {
+            dynasm!(ctx.asm
+                ; sub Rd(op1), Rd(reg)
+            );
+        }
+        ValueLocation::Stack(offset) => {
+            let offset = adjusted_offset(ctx, offset);
+            dynasm!(ctx.asm
+                ; sub Rd(op1), [rsp + offset]
+            );
+        }
+    }
+    ctx.block_state.stack.push(StackValue::Temp(op1));
+    free_val(ctx, op0);
 }
 
 pub fn i32_and(ctx: &mut Context) {
     let op0 = pop_i32(ctx);
-    let op1 = pop_i32(ctx);
-    dynasm!(ctx.asm
-        ; and Rd(op1), Rd(op0)
-    );
-    push_i32(ctx, op1);
-    ctx.regs.release_scratch_gpr(op0);
+    let tmp = pop_i32(ctx);
+    let op1 = into_temp_reg(ctx, tmp);
+    match op0.location(&ctx.locals) {
+        ValueLocation::Reg(reg) => {
+            dynasm!(ctx.asm
+                ; and Rd(op1), Rd(reg)
+            );
+        }
+        ValueLocation::Stack(offset) => {
+            let offset = adjusted_offset(ctx, offset);
+            dynasm!(ctx.asm
+                ; and Rd(op1), [rsp + offset]
+            );
+        }
+    }
+    ctx.block_state.stack.push(StackValue::Temp(op1));
+    free_val(ctx, op0);
 }
 
 pub fn i32_or(ctx: &mut Context) {
     let op0 = pop_i32(ctx);
-    let op1 = pop_i32(ctx);
-    dynasm!(ctx.asm
-        ; or Rd(op1), Rd(op0)
-    );
-    push_i32(ctx, op1);
-    ctx.regs.release_scratch_gpr(op0);
+    let tmp = pop_i32(ctx);
+    let op1 = into_temp_reg(ctx, tmp);
+    match op0.location(&ctx.locals) {
+        ValueLocation::Reg(reg) => {
+            dynasm!(ctx.asm
+                ; or Rd(op1), Rd(reg)
+            );
+        }
+        ValueLocation::Stack(offset) => {
+            let offset = adjusted_offset(ctx, offset);
+            dynasm!(ctx.asm
+                ; or Rd(op1), [rsp + offset]
+            );
+        }
+    }
+    ctx.block_state.stack.push(StackValue::Temp(op1));
+    free_val(ctx, op0);
 }
 
 pub fn i32_xor(ctx: &mut Context) {
     let op0 = pop_i32(ctx);
-    let op1 = pop_i32(ctx);
-    dynasm!(ctx.asm
-        ; xor Rd(op1), Rd(op0)
-    );
-    push_i32(ctx, op1);
-    ctx.regs.release_scratch_gpr(op0);
+    let tmp = pop_i32(ctx);
+    let op1 = into_temp_reg(ctx, tmp);
+    match op0.location(&ctx.locals) {
+        ValueLocation::Reg(reg) => {
+            dynasm!(ctx.asm
+                ; xor Rd(op1), Rd(reg)
+            );
+        }
+        ValueLocation::Stack(offset) => {
+            let offset = adjusted_offset(ctx, offset);
+            dynasm!(ctx.asm
+                ; xor Rd(op1), [rsp + offset]
+            );
+        }
+    }
+    ctx.block_state.stack.push(StackValue::Temp(op1));
+    free_val(ctx, op0);
 }
 
 pub fn i32_mul(ctx: &mut Context) {
     let op0 = pop_i32(ctx);
-    let op1 = pop_i32(ctx);
-    dynasm!(ctx.asm
-        ; imul Rd(op1), Rd(op0)
-    );
-    push_i32(ctx, op1);
-    ctx.regs.release_scratch_gpr(op0);
+    let tmp = pop_i32(ctx);
+    let op1 = into_temp_reg(ctx, tmp);
+    match op0.location(&ctx.locals) {
+        ValueLocation::Reg(reg) => {
+            dynasm!(ctx.asm
+                ; imul Rd(op1), Rd(reg)
+            );
+        }
+        ValueLocation::Stack(offset) => {
+            let offset = adjusted_offset(ctx, offset);
+            dynasm!(ctx.asm
+                ; imul Rd(op1), [rsp + offset]
+            );
+        }
+    }
+    ctx.block_state.stack.push(StackValue::Temp(op1));
+    free_val(ctx, op0);
 }
 
-fn sp_relative_offset(ctx: &mut Context, slot_idx: u32) -> i32 {
-    ((ctx.sp_depth.0 as i32) + slot_idx as i32) * WORD_SIZE as i32
-}
-
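+// For example, `(i32.add (get_local 0) (i32.const 1))`, with local 0 being the
+// first argument (so living in RDI), now emits:
+//
+//     mov r10d, 1      // literal_i32: constant into a scratch register
+//     mov r11, rdi     // into_temp_reg: copy the local so it isn't clobbered
+//     add r11d, r10d   // i32_add: operate on the temporary
+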
 pub fn get_local_i32(ctx: &mut Context, local_idx: u32) {
-    let gpr = ctx.regs.take_scratch_gpr();
-    let offset = sp_relative_offset(ctx, local_idx);
-    dynasm!(ctx.asm
-        ; mov Rq(gpr), [rsp + offset]
-    );
-    push_i32(ctx, gpr);
+    push_i32(ctx, Value::Local(local_idx));
 }
 
+// TODO: We can put locals that were spilled to the stack
+//       back into registers here.
 pub fn set_local_i32(ctx: &mut Context, local_idx: u32) {
-    let gpr = pop_i32(ctx);
-    let offset = sp_relative_offset(ctx, local_idx);
-    dynasm!(ctx.asm
-        ; mov [rsp + offset], Rq(gpr)
-    );
-    ctx.regs.release_scratch_gpr(gpr);
+    let val = pop_i32(ctx);
+    let val_loc = val.location(&ctx.locals);
+    let dst_loc = local_location(&ctx.locals, local_idx);
+    copy_value(ctx, val_loc, dst_loc);
+    free_val(ctx, val);
 }
 
+// TODO: Don't store literals at all, roll them into `Value`.
 pub fn literal_i32(ctx: &mut Context, imm: i32) {
-    let gpr = ctx.regs.take_scratch_gpr();
+    let gpr = ctx.block_state.regs.take_scratch_gpr();
     dynasm!(ctx.asm
         ; mov Rd(gpr), imm
     );
-    push_i32(ctx, gpr);
+    push_i32(ctx, Value::Temp(gpr));
 }
 
 pub fn relop_eq_i32(ctx: &mut Context) {
     let right = pop_i32(ctx);
     let left = pop_i32(ctx);
-    let result = ctx.regs.take_scratch_gpr();
-    dynasm!(ctx.asm
-        ; xor Rq(result), Rq(result)
-        ; cmp Rd(left), Rd(right)
-        ; sete Rb(result)
-    );
-    push_i32(ctx, result);
-    ctx.regs.release_scratch_gpr(left);
-    ctx.regs.release_scratch_gpr(right);
+    let result = ctx.block_state.regs.take_scratch_gpr();
+    let lreg = into_reg(ctx, left);
+    match right.location(&ctx.locals) {
+        ValueLocation::Stack(offset) => {
+            let offset = adjusted_offset(ctx, offset);
+            dynasm!(ctx.asm
+                ; xor Rq(result), Rq(result)
+                ; cmp Rd(lreg), [rsp + offset]
+                ; sete Rb(result)
+            );
+        }
+        ValueLocation::Reg(rreg) => {
+            dynasm!(ctx.asm
+                ; xor Rq(result), Rq(result)
+                ; cmp Rd(lreg), Rd(rreg)
+                ; sete Rb(result)
+            );
+        }
+    }
+    push_i32(ctx, Value::Temp(result));
+    free_val(ctx, left);
+    free_val(ctx, right);
 }
 
 /// Pops i32 predicate and branches to the specified label
 /// if the predicate is equal to zero.
 pub fn pop_and_breq(ctx: &mut Context, label: Label) {
-    let predicate = pop_i32(ctx);
+    let val = pop_i32(ctx);
+    let predicate = into_temp_reg(ctx, val);
     dynasm!(ctx.asm
         ; test Rd(predicate), Rd(predicate)
         ; je =>label.0
     );
-    ctx.regs.release_scratch_gpr(predicate);
+    ctx.block_state.regs.release_scratch_gpr(predicate);
 }
 
 /// Branch unconditionally to the specified label.
@@ -366,122 +607,246 @@ pub fn br(ctx: &mut Context, label: Label) {
 }
 
 pub fn prepare_return_value(ctx: &mut Context) {
-    let ret_gpr = pop_i32(ctx);
-    if ret_gpr != RAX {
-        dynasm!(ctx.asm
-            ; mov Rq(RAX), Rq(ret_gpr)
-        );
-        ctx.regs.release_scratch_gpr(ret_gpr);
-    }
+    pop_i32_into(ctx, ValueLocation::Reg(RAX));
 }
 
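+// x86 has no memory-to-memory `mov`, so the stack-to-stack case in
+// `copy_value` below has to bounce the value through a scratch register.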
-pub fn copy_incoming_arg(ctx: &mut Context, frame_size: u32, arg_pos: u32) {
-    let loc = abi_loc_for_arg(arg_pos);
-
-    // First, ensure the argument is in a register.
-    let reg = match loc {
-        ArgLocation::Reg(reg) => reg,
-        ArgLocation::Stack(offset) => {
-            assert!(
-                ctx.regs.scratch_gprs.is_free(RAX),
-                "we assume that RAX can be used as a scratch register for now",
-            );
-            let offset = offset + (frame_size * WORD_SIZE) as i32;
-            dynasm!(ctx.asm
-                ; mov Rq(RAX), [rsp + offset]
-            );
-            RAX
-        }
-    };
-
-    // And then move a value from a register into local variable area on the stack.
-    let offset = sp_relative_offset(ctx, arg_pos);
-    dynasm!(ctx.asm
-        ; mov [rsp + offset], Rq(reg)
-    );
-}
+fn copy_value(ctx: &mut Context, src: ValueLocation, dst: ValueLocation) {
+    match (src, dst) {
+        (ValueLocation::Stack(in_offset), ValueLocation::Stack(out_offset)) => {
+            let in_offset = adjusted_offset(ctx, in_offset);
+            let out_offset = adjusted_offset(ctx, out_offset);
+            if in_offset != out_offset {
+                let gpr = ctx.block_state.regs.take_scratch_gpr();
+                dynasm!(ctx.asm
+                    ; mov Rq(gpr), [rsp + in_offset]
+                    ; mov [rsp + out_offset], Rq(gpr)
+                );
+                ctx.block_state.regs.release_scratch_gpr(gpr);
+            }
+        }
+        (ValueLocation::Reg(in_reg), ValueLocation::Stack(out_offset)) => {
+            let out_offset = adjusted_offset(ctx, out_offset);
+            dynasm!(ctx.asm
+                ; mov [rsp + out_offset], Rq(in_reg)
+            );
+        }
+        (ValueLocation::Stack(in_offset), ValueLocation::Reg(out_reg)) => {
+            let in_offset = adjusted_offset(ctx, in_offset);
+            dynasm!(ctx.asm
+                ; mov Rq(out_reg), [rsp + in_offset]
+            );
+        }
+        (ValueLocation::Reg(in_reg), ValueLocation::Reg(out_reg)) => {
+            if in_reg != out_reg {
+                dynasm!(ctx.asm
+                    ; mov Rq(out_reg), Rq(in_reg)
+                );
+            }
+        }
+    }
+}
 
 #[must_use]
-fn pass_outgoing_args(ctx: &mut Context, arity: u32) -> i32 {
-    let mut stack_args = Vec::with_capacity((arity as usize).saturating_sub(ARGS_IN_GPRS.len()));
-    for arg_pos in (0..arity).rev() {
-        ctx.sp_depth.free(1);
-
-        let loc = abi_loc_for_arg(arg_pos);
-        match loc {
-            ArgLocation::Reg(gpr) => {
-                dynasm!(ctx.asm
-                    ; pop Rq(gpr)
-                );
-            }
-            ArgLocation::Stack(_) => {
-                let gpr = ctx.regs.take_scratch_gpr();
-                dynasm!(ctx.asm
-                    ; pop Rq(gpr)
-                );
-                stack_args.push(gpr);
-            }
-        }
-    }
-
-    let num_stack_args = stack_args.len() as i32;
-    dynasm!(ctx.asm
-        ; sub rsp, num_stack_args
-    );
-    for (stack_slot, gpr) in stack_args.into_iter().rev().enumerate() {
-        let offset = (stack_slot * WORD_SIZE as usize) as i32;
-        dynasm!(ctx.asm
-            ; mov [rsp + offset], Rq(gpr)
-        );
-        ctx.regs.release_scratch_gpr(gpr);
-    }
-
-    num_stack_args
-}
+pub struct CallCleanup {
+    restore_registers: Vec<GPR>,
+    stack_depth: i32,
+}
 
+/// Make sure that any argument registers that will be used by the call are free
+/// by storing them to the stack.
+///
+/// Unfortunately, we can't elide this store if we're just passing arguments
+/// through, because these registers are caller-saved and so the callee can use
+/// them as scratch space.
+fn free_arg_registers(ctx: &mut Context, count: u32) {
+    if count == 0 {
+        return;
+    }
+
+    for i in 0..ctx.locals.locs.len() {
+        match ctx.locals.locs[i] {
+            ValueLocation::Reg(reg) => {
+                if ARGS_IN_GPRS.contains(&reg) {
+                    // `ValueLocation::Stack` offsets are relative to the first
+                    // local, so store the unadjusted offset and only adjust it
+                    // for the actual `mov`.
+                    let local_offset = (i as u32 * WORD_SIZE) as i32;
+                    let offset = adjusted_offset(ctx, local_offset);
+                    dynasm!(ctx.asm
+                        ; mov [rsp + offset], Rq(reg)
+                    );
+                    ctx.locals.locs[i] = ValueLocation::Stack(local_offset);
+                }
+            }
+            _ => {}
+        }
+    }
+}
+
+fn free_return_register(ctx: &mut Context, count: u32) {
+    if count == 0 {
+        return;
+    }
+
+    for stack_val in &mut ctx.block_state.stack {
+        match stack_val.location(&ctx.locals) {
+            // For now it's impossible for a local to be in RAX but that might be
+            // possible in the future, so we check both cases.
+            Some(ValueLocation::Reg(RAX)) => {
+                let scratch = ctx.block_state.regs.take_scratch_gpr();
+                dynasm!(ctx.asm
+                    ; mov Rq(scratch), rax
+                );
+                *stack_val = StackValue::Temp(scratch);
+            }
+            _ => {}
+        }
+    }
+}
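+
+// Both scratch registers (R10 and R11) are caller-saved in the System V ABI,
+// so any live temporaries held in them must be preserved across a call; that
+// is what `save_volatile` below does.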
+
+// TODO: Use `ArrayVec`?
+/// Saves volatile (i.e. caller-saved) registers before a function call, if
+/// they are used.
+fn save_volatile(ctx: &mut Context) -> Vec<GPR> {
+    let mut out = vec![];
+
+    // TODO: If there are no `StackValue::Pop`s that need to be popped
+    //       before we reach our `Temp` value, we can set the `StackValue`
+    //       for the register to be restored to `StackValue::Pop` (and
+    //       release the register!) instead of restoring it.
+    for &reg in SCRATCH_REGS.iter() {
+        if !ctx.block_state.regs.is_free(reg) {
+            dynasm!(ctx.asm
+                ; push Rq(reg)
+            );
+            out.push(reg);
+        }
+    }
+
+    out
+}
+
+/// Write the arguments for the callee into registers and onto the stack, using
+/// the System V calling convention.
+fn pass_outgoing_args(ctx: &mut Context, arity: u32) -> CallCleanup {
+    let num_stack_args = (arity as usize).saturating_sub(ARGS_IN_GPRS.len()) as i32;
+
+    let out = CallCleanup {
+        stack_depth: num_stack_args,
+        restore_registers: save_volatile(ctx),
+    };
+
+    // We pop the stack arguments first - arguments are passed right-to-left.
+    if num_stack_args > 0 {
+        let size = num_stack_args * WORD_SIZE as i32;
+
+        // Reserve space for the outgoing stack arguments (so we don't
+        // stomp on any locals or the value stack).
+        dynasm!(ctx.asm
+            ; sub rsp, size
+        );
+        ctx.block_state.depth.reserve(num_stack_args as u32);
+
+        for stack_slot in (0..num_stack_args).rev() {
+            // Since the stack offset is from the bottom of the locals
+            // and we want to start from the actual RSP (so `offset = 0`
+            // writes to `[rsp]`), we subtract our current depth.
+            //
+            // We might want to do this in the future by having a separate
+            // `AbsoluteValueLocation` and `RelativeValueLocation`.
+            let offset =
+                stack_slot * WORD_SIZE as i32 - ctx.block_state.depth.0 as i32 * WORD_SIZE as i32;
+            pop_i32_into(ctx, ValueLocation::Stack(offset));
+        }
+    }
+
+    for reg in ARGS_IN_GPRS[..(arity as usize).min(ARGS_IN_GPRS.len())]
+        .iter()
+        .rev()
+    {
+        pop_i32_into(ctx, ValueLocation::Reg(*reg));
+    }
+
+    out
+}
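+
+// E.g. with 8 arguments there are 2 stack arguments: after `sub rsp, 16`, the
+// 8th argument ends up at [rsp + 8] and the 7th at [rsp], which is the layout
+// the callee expects under the System V convention.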
 
-fn post_call_cleanup(ctx: &mut Context, num_stack_args: i32) {
-    dynasm!(ctx.asm
-        ; add rsp, num_stack_args
-    );
-}
+/// Frees up the stack space used for stack-passed arguments and restores the
+/// values of volatile (i.e. caller-saved) registers to the state that they
+/// were in before the call.
+fn post_call_cleanup(ctx: &mut Context, mut cleanup: CallCleanup) {
+    if cleanup.stack_depth > 0 {
+        let size = cleanup.stack_depth * WORD_SIZE as i32;
+        dynasm!(ctx.asm
+            ; add rsp, size
+        );
+    }
+
+    for reg in cleanup.restore_registers.drain(..).rev() {
+        dynasm!(ctx.asm
+            ; pop Rq(reg)
+        );
+    }
+}
 
+/// Call a function with the given index.
 pub fn call_direct(ctx: &mut Context, index: u32, arg_arity: u32, return_arity: u32) {
-    assert!(return_arity == 0 || return_arity == 1);
+    assert!(
+        return_arity == 0 || return_arity == 1,
+        "We don't support multiple return yet"
+    );
 
-    let num_stack_args = pass_outgoing_args(ctx, arg_arity);
+    free_arg_registers(ctx, arg_arity);
+    free_return_register(ctx, return_arity);
+
+    let cleanup = pass_outgoing_args(ctx, arg_arity);
 
     let label = &ctx.func_starts[index as usize].1;
     dynasm!(ctx.asm
         ; call =>*label
     );
 
-    post_call_cleanup(ctx, num_stack_args);
-
-    if return_arity == 1 {
-        dynasm!(ctx.asm
-            ; push rax
-        );
-        ctx.sp_depth.reserve(1);
-    }
+    post_call_cleanup(ctx, cleanup);
 }
 
-pub fn prologue(ctx: &mut Context, stack_slots: u32) {
-    let stack_slots = stack_slots;
+// TODO: Reserve space to store RBX, RBP, and R12..R15 so we can use them
+//       as scratch registers.
+// TODO: Allow use of unused argument registers as scratch registers.
+/// Writes the function prologue and stores the arguments as locals.
+pub fn start_function(ctx: &mut Context, arguments: u32, locals: u32) {
+    let reg_args = &ARGS_IN_GPRS[..(arguments as usize).min(ARGS_IN_GPRS.len())];
+
+    // We need space to spill the register arguments if we call a function that
+    // overwrites these registers, so we add `reg_args.len()` extra stack slots.
+    let locals = locals + reg_args.len() as u32;
 
     // Align stack slots to the nearest even number. This is required
     // by x86-64 ABI.
-    let aligned_stack_slots = (stack_slots + 1) & !1;
-
+    let aligned_stack_slots = (locals + 1) & !1;
     let framesize: i32 = aligned_stack_slots as i32 * WORD_SIZE as i32;
+
+    ctx.locals.locs = reg_args
+        .iter()
+        .cloned()
+        .map(ValueLocation::Reg)
+        .chain(
+            (0..arguments.saturating_sub(ARGS_IN_GPRS.len() as _))
+                // We add 2 here because one stack slot is used for the saved
+                // frame pointer and another for the return address. It's a
+                // magic number but there's not really a way around this.
+                .map(|arg_i| ValueLocation::Stack(((arg_i + 2) * WORD_SIZE) as i32 + framesize)),
+        )
+        .collect();
+
     dynasm!(ctx.asm
         ; push rbp
         ; mov rbp, rsp
-        ; sub rsp, framesize
     );
-    ctx.sp_depth.reserve(aligned_stack_slots - stack_slots);
+
+    if framesize > 0 {
+        dynasm!(ctx.asm
+            ; sub rsp, framesize
+        );
+    }
 }
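+
+// Resulting frame layout, higher addresses first:
+//
+//     [rsp + framesize + 16 + i*8]   i-th stack-passed argument
+//     [rsp + framesize + 8]          return address
+//     [rsp + framesize]              saved RBP
+//     [rsp .. rsp + framesize)       locals, including the spill slots for
+//                                    register-passed arguments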
 
+/// Writes the function epilogue, restoring the stack pointer and returning to
+/// the caller.
 pub fn epilogue(ctx: &mut Context) {
-    // We don't need to clean up the stack - `rsp` is restored and
+    // We don't need to clean up the stack - RSP is restored and
     // the calling function has its own register stack and will
     // stomp on the registers from our stack if necessary.
     dynasm!(ctx.asm
diff --git a/src/function_body.rs b/src/function_body.rs
index e27c8a3bfc..1ccce16f9c 100644
--- a/src/function_body.rs
+++ b/src/function_body.rs
@@ -56,31 +56,22 @@ struct ControlFrame {
     /// becomes polymorphic only after an instruction that never passes control further is executed,
     /// i.e. `unreachable`, `br` (but not `br_if`!), etc.
     stack_polymorphic: bool,
-    /// Relative stack depth at the beginning of the frame.
-    stack_depth: StackDepth,
+    /// State specific to the block (free temp registers, stack, etc.) which
+    /// should be replaced at the end of the block.
+    block_state: BlockState,
     ty: Type,
 }
 
 impl ControlFrame {
-    pub fn new(kind: ControlFrameKind, stack_depth: StackDepth, ty: Type) -> ControlFrame {
+    pub fn new(kind: ControlFrameKind, block_state: BlockState, ty: Type) -> ControlFrame {
         ControlFrame {
             kind,
-            stack_depth,
+            block_state,
             ty,
             stack_polymorphic: false,
         }
     }
 
-    pub fn outgoing_stack_depth(&self) -> StackDepth {
-        let mut outgoing_stack_depth = self.stack_depth;
-        if self.ty != Type::EmptyBlockType {
-            // If there a return value then reserve expected outgoing stack depth value
-            // to account for the result value.
-            outgoing_stack_depth.reserve(1);
-        }
-        outgoing_stack_depth
-    }
-
     /// Marks this control frame as reached stack-polymorphic state.
     pub fn mark_stack_polymorphic(&mut self) {
         self.stack_polymorphic = true;
@@ -103,20 +94,16 @@ pub fn translate(
         Type::EmptyBlockType
     };
 
-    let mut framesize = arg_count;
+    let mut num_locals = 0;
     for local in locals {
         let (count, _ty) = local?;
-        framesize += count;
+        num_locals += count;
     }
 
     let mut ctx = session.new_context(func_idx);
     let operators = body.get_operators_reader()?;
 
-    prologue(&mut ctx, framesize);
-
-    for arg_pos in 0..arg_count {
-        copy_incoming_arg(&mut ctx, framesize, arg_pos);
-    }
+    start_function(&mut ctx, arg_count, num_locals);
 
     let mut control_frames = Vec::new();
 
@@ -127,7 +114,7 @@ pub fn translate(
         ControlFrameKind::Block {
             end_label: epilogue_label,
         },
-        current_stack_depth(&ctx),
+        current_block_state(&ctx),
         return_ty,
     ));
 
@@ -148,7 +135,7 @@ pub fn translate(
 
                 control_frames.push(ControlFrame::new(
                     ControlFrameKind::IfTrue { end_label, if_not },
-                    current_stack_depth(&ctx),
+                    current_block_state(&ctx),
                     ty,
                 ));
             }
@@ -157,7 +144,7 @@ pub fn translate(
                 Some(ControlFrame {
                     kind: ControlFrameKind::IfTrue { if_not, end_label },
                     ty,
-                    stack_depth,
+                    block_state,
                     ..
                 }) => {
                     // Finalize if..else block by jumping to the `end_label`.
@@ -167,7 +154,7 @@ pub fn translate(
                     // 0 it will branch here.
                     // After that reset stack depth to the value before entering `if` block.
                     define_label(&mut ctx, if_not);
-                    restore_stack_depth(&mut ctx, stack_depth);
+                    restore_block_state(&mut ctx, block_state.clone());
 
                     // Carry over the `end_label`, so it will be resolved when the corresponding `end`
                     // is encountered.
@@ -175,7 +162,7 @@ pub fn translate(
                     // Also note that we reset `stack_depth` to the value before entering `if` block.
                     let mut frame = ControlFrame::new(
                         ControlFrameKind::IfFalse { end_label },
-                        stack_depth,
+                        block_state,
                         ty,
                     );
                     control_frames.push(frame);
@@ -199,14 +186,12 @@ pub fn translate(
                     define_label(&mut ctx, if_not);
                 }
 
-                restore_stack_depth(&mut ctx, control_frame.outgoing_stack_depth());
-
-                if control_frames.len() == 0 {
-                    // This is the last control frame. Perform the implicit return here.
-                    if return_ty != Type::EmptyBlockType {
-                        prepare_return_value(&mut ctx);
-                    }
+                // This is the last control frame. Perform the implicit return here.
+                if control_frames.len() == 0 && return_ty != Type::EmptyBlockType {
+                    prepare_return_value(&mut ctx);
                 }
+
+                // restore_block_state(&mut ctx, control_frame.block_state);
             }
             Operator::I32Eq => relop_eq_i32(&mut ctx),
             Operator::I32Add => i32_add(&mut ctx),
@@ -228,6 +213,7 @@ pub fn translate(
                 callee_ty.params.len() as u32,
                 callee_ty.returns.len() as u32,
             );
+            push_return_value(&mut ctx);
         }
         _ => {
             trap(&mut ctx);
diff --git a/src/lib.rs b/src/lib.rs
index 71fecec427..ea1a4697d1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,8 +9,10 @@ extern crate wasmparser;
 #[macro_use]
 extern crate failure_derive;
 extern crate dynasmrt;
+#[cfg(test)]
 #[macro_use]
 extern crate lazy_static;
+#[cfg(test)]
 #[macro_use]
 extern crate quickcheck;
 extern crate wabt;
diff --git a/src/tests.rs b/src/tests.rs
index 7cf63eee11..df593a0911 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -201,7 +201,9 @@ fn function_read_args_spill_to_stack() {
     assert_eq!(
         {
             let translated = translate_wat(code);
-            let out: u32 = unsafe { translated.execute_func(0, (7, 6, 5, 4, 3, 2, 1, 0)) };
+            let out: u32 = unsafe {
+                translated.execute_func(0, (7u32, 6u32, 5u32, 4u32, 3u32, 2u32, 1u32, 0u32))
+            };
             out
         },
         7
@@ -213,6 +215,7 @@ fn function_write_args_spill_to_stack() {
     let code = r#"
 (module
   (func (param i32) (param i32) (param i32) (param i32)
+        (param i32) (param i32) (param i32) (param i32)
         (param i32) (param i32) (param i32) (param i32)
         (result i32)
     (call $called
       (get_local 0)
       (get_local 1)
       (get_local 2)
       (get_local 3)
       (get_local 4)
       (get_local 5)
       (get_local 6)
       (get_local 7)
+      (get_local 8)
+      (get_local 9)
+      (get_local 10)
+      (get_local 11)
     )
   )
   (func $called
+        (param i32) (param i32) (param i32) (param i32)
         (param i32) (param i32) (param i32) (param i32)
         (param i32) (param i32) (param i32) (param i32)
         (result i32)
     (call $assert_zero
-      (get_local 7)
+      (get_local 11)
     )
     (get_local 0)
   )
@@ -251,10 +259,10 @@ fn function_write_args_spill_to_stack() {
     assert_eq!(
         {
             let translated = translate_wat(code);
-            let out: u32 = unsafe { translated.execute_func(0, (7, 6, 5, 4, 3, 2, 1, 0)) };
+            let out: u32 = unsafe { translated.execute_func(0, (11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)) };
             out
         },
-        7
+        11
     );
 }
 
 #[test]