Merge ends, store locals in registers where possible

This commit is contained in:
Jef
2019-01-17 11:07:51 +01:00
parent a7fa7da7d4
commit 74d168ec4b
6 changed files with 155 additions and 50 deletions

View File

@@ -13,6 +13,8 @@ arrayvec = "0.4"
dynasm = "0.2.3"
dynasmrt = "0.2.3"
wasmparser = { path = "./wasmparser.rs" }
memoffset = "0.2"
itertools = "0.8"
capstone = "0.5.0"
failure = "0.1.3"
failure_derive = "0.1.3"

View File

@@ -384,7 +384,7 @@ pub struct Locals {
/// registers that this can contain. If we need to move the argument
/// out of a register (for example, because we're calling a function)
/// we note that down here, so we don't have to move it back afterwards.
register_arguments: ArrayVec<[ArgLoc; ARGS_IN_GPRS.len()]>,
register_locals: ArrayVec<[ArgLoc; ARGS_IN_GPRS.len()]>,
/// The number of arguments stored on the stack.
num_stack_args: u32,
/// The number of local stack slots, i.e. the amount of stack space reserved for locals.
@@ -393,7 +393,7 @@ pub struct Locals {
impl Locals {
fn register(&self, index: u32) -> Option<GPR> {
if index < self.register_arguments.len() as u32 {
if index < self.register_locals.len() as u32 {
Some(ARGS_IN_GPRS[index as usize])
} else {
None
@@ -401,19 +401,19 @@ impl Locals {
}
fn add_pos(&mut self, index: u32, loc: ValueLocation) {
self.register_arguments[index as usize].add_loc(loc);
self.register_locals[index as usize].add_loc(loc);
}
fn set_pos(&mut self, index: u32, loc: ValueLocation) {
self.register_arguments[index as usize] = ArgLoc::from_loc(loc);
self.register_locals[index as usize] = ArgLoc::from_loc(loc);
}
fn get(&self, index: u32) -> ValueLocation {
self.register_arguments
self.register_locals
.get(index as usize)
.map(ArgLoc::best_loc)
.unwrap_or_else(|| {
let stack_index = index - self.register_arguments.len() as u32;
let stack_index = index - self.register_locals.len() as u32;
if stack_index < self.num_stack_args {
ValueLocation::Stack(
((stack_index + self.num_local_stack_slots + 2) * WORD_SIZE) as _,
@@ -426,7 +426,7 @@ impl Locals {
}
fn num_args(&self) -> u32 {
self.register_arguments.len() as u32 + self.num_stack_args
self.register_locals.len() as u32 + self.num_stack_args
}
fn vmctx_index(&self) -> u32 {
@@ -1245,20 +1245,20 @@ impl Context<'_> {
parent_block_state: BlockState,
before_push_return: impl FnOnce(&mut Self),
) {
// TODO: This should currently never be called, but is important for if we want to
// have a more complex stack spilling scheme.
debug_assert_eq!(
self.block_state.depth, parent_block_state.depth,
"Imbalanced pushes and pops"
);
// TODO: This should currently never be called, but is important for if we want to
// have a more complex stack spilling scheme.
// TODO: This should use an `end_locals`-style system where we only do this when
// control flow splits.
if self.block_state.depth != parent_block_state.depth {
dynasm!(self.asm
; add rsp, ((self.block_state.depth.0 - parent_block_state.depth.0) * WORD_SIZE) as i32
);
}
self.restore_locals();
let return_reg = self.block_state.return_register;
let locals = mem::replace(&mut self.block_state.locals, Default::default());
self.block_state = parent_block_state;
@@ -1282,10 +1282,10 @@ impl Context<'_> {
for (src, dst) in self
.block_state
.locals
.register_arguments
.register_locals
.clone()
.iter()
.zip(&locals.register_arguments)
.zip(&locals.register_locals)
{
self.copy_value(src.best_loc(), dst.best_loc());
}
@@ -1293,9 +1293,9 @@ impl Context<'_> {
for (src, dst) in self
.block_state
.locals
.register_arguments
.register_locals
.iter_mut()
.zip(&locals.register_arguments)
.zip(&locals.register_locals)
{
src.union(*dst);
}
@@ -1715,7 +1715,7 @@ impl Context<'_> {
if let Some(cur) = self
.block_state
.locals
.register_arguments
.register_locals
.get_mut(local_idx as usize)
{
*cur = ArgLoc::from_loc(dst_loc);
@@ -1736,7 +1736,7 @@ impl Context<'_> {
if let Some(cur) = self
.block_state
.locals
.register_arguments
.register_locals
.get_mut(local_idx as usize)
{
*cur = ArgLoc::from_loc(dst_loc);
@@ -1821,10 +1821,10 @@ impl Context<'_> {
fn free_arg_registers(&mut self, exclude: Option<u32>) {
// This is bound to the maximum size of the `ArrayVec` and so can be considered to have constant
// runtime
for i in (0..self.block_state.locals.register_arguments.len())
for i in (0..self.block_state.locals.register_locals.len())
.filter(|i| exclude != Some(*i as u32))
{
match self.block_state.locals.register_arguments[i] {
match self.block_state.locals.register_locals[i] {
ArgLoc::Register(reg) => {
if ARGS_IN_GPRS.contains(&reg) {
let offset =
@@ -1832,7 +1832,7 @@ impl Context<'_> {
* WORD_SIZE) as _;
let dst = ValueLocation::Stack(offset);
self.copy_value(ValueLocation::Reg(reg), dst);
self.block_state.locals.register_arguments[i].add_stack(offset);
self.block_state.locals.register_locals[i].add_stack(offset);
}
}
_ => {}
@@ -1989,7 +1989,7 @@ impl Context<'_> {
// If these values were in register they've now been invalidated, since
// the callee can use them as scratch.
for loc in self.block_state.locals.register_arguments.iter_mut() {
for loc in self.block_state.locals.register_locals.iter_mut() {
if let Some(offset) = loc.stack() {
*loc = ArgLoc::Stack(offset);
}
@@ -2097,7 +2097,8 @@ impl Context<'_> {
pub fn start_function(&mut self, arguments: u32, locals: u32) -> FunctionEnd {
// To support `vmctx`
let arguments = arguments + 1;
let reg_args = &ARGS_IN_GPRS[..(arguments as usize).min(ARGS_IN_GPRS.len())];
let (reg_args, locals_in_gprs) = ARGS_IN_GPRS.split_at((arguments as usize).min(ARGS_IN_GPRS.len()));
let reg_locals = &locals_in_gprs[..(locals as usize).min(locals_in_gprs.len())];
// We need space to store the register arguments if we need to call a function
// and overwrite these registers so we add `reg_args.len()`
@@ -2107,8 +2108,8 @@ impl Context<'_> {
let aligned_stack_slots = (stack_slots + 1) & !1;
let frame_size: i32 = aligned_stack_slots as i32 * WORD_SIZE as i32;
self.block_state.locals.register_arguments =
reg_args.iter().cloned().map(ArgLoc::Register).collect();
self.block_state.locals.register_locals =
reg_args.iter().chain(reg_locals).cloned().map(ArgLoc::Register).collect();
self.block_state.locals.num_stack_args = arguments.saturating_sub(ARGS_IN_GPRS.len() as _);
self.block_state.locals.num_local_stack_slots = stack_slots;
self.block_state.return_register = Some(RAX);

View File

@@ -40,6 +40,17 @@ impl ControlFrameKind {
}
}
fn end_labels(&self) -> impl Iterator<Item = Label> {
self.block_end()
.into_iter()
.chain(if let ControlFrameKind::IfTrue { if_not, .. } = self {
// this is `if .. end` construction. Define the `if_not` label.
Some(*if_not)
} else {
None
})
}
fn is_loop(&self) -> bool {
match *self {
ControlFrameKind::Loop { .. } => true,
@@ -174,12 +185,18 @@ pub fn translate(
return_ty,
));
let mut operators = itertools::put_back(operators.into_iter());
// TODO: We want to make this a state machine (maybe requires 1-element lookahead? Not sure) so that we
// can coalesce multiple `end`s and optimise break-at-end-of-block into noop.
// TODO: Does coalescing multiple `end`s matter since at worst this really only elides a single move at
// the end of a function, and this is probably a no-op anyway due to register renaming.
for op in operators {
let op = op?;
loop {
let op = if let Some(op) = operators.next() {
op?
} else {
break;
};
match op {
Operator::End | Operator::Else => {}
@@ -306,28 +323,60 @@ pub fn translate(
//
// This doesn't require lookahead but it does require turning this loop into
// a kind of state machine.
let control_frame = control_frames.pop().expect("control stack is never empty");
let mut control_frame = control_frames.pop().expect("control stack is never empty");
let mut labels = control_frame
.kind
.end_labels()
.collect::<Vec<_>>();
let mut end = control_frame.block_state.end_locals.take();
// Fold `End`s together to prevent unnecessary shuffling of locals
loop {
let op = if let Some(op) = operators.next() {
op?
} else {
break;
};
match op {
Operator::End => {
control_frame =
control_frames.pop().expect("control stack is never empty");
labels.extend(control_frame.kind.end_labels());
end = control_frame.block_state.end_locals.take().or(end);
}
other => {
operators.put_back(Ok(other));
break;
}
}
}
let arity = control_frame.arity();
// Don't bother generating this code if we're in unreachable code
if !control_frame.unreachable {
ctx.return_from_block(arity);
// If there are no remaining frames we've hit the end of the function - we don't need to
// restore locals since no execution will happen after this point.
if !control_frames.is_empty() {
if let Some(end) = end {
ctx.restore_locals_to(&end);
}
}
}
let block_end = control_frame.kind.block_end();
// TODO: What is the correct order of this and the `define_label`? It's clear for `block`s
// but I'm not certain for `if..then..else..end`.
ctx.end_block(control_frame.block_state, |ctx| {
if let Some(block_end) = block_end {
ctx.define_label(block_end);
for label in labels {
ctx.define_label(label);
}
});
if let ControlFrameKind::IfTrue { if_not, .. } = control_frame.kind {
// this is `if .. end` construction. Define the `if_not` label here.
ctx.define_label(if_not);
}
}
Operator::I32Eq => ctx.i32_eq(),
Operator::I32Eqz => ctx.i32_eqz(),

View File

@@ -9,7 +9,10 @@ extern crate failure;
extern crate wasmparser;
#[macro_use]
extern crate failure_derive;
#[macro_use]
extern crate memoffset;
extern crate dynasmrt;
extern crate itertools;
#[cfg(test)]
#[macro_use]
extern crate lazy_static;
@@ -28,4 +31,4 @@ mod translate_sections;
#[cfg(test)]
mod tests;
pub use module::{translate, TranslatedModule, ExecutableModule};
pub use module::{translate, ExecutableModule, TranslatedModule};

View File

@@ -254,6 +254,10 @@ impl VmCtx {
pub fn offset_of_memory() -> usize {
mem::size_of::<Self>()
}
pub fn offset_of_funcs_ptr() -> usize {
offset_of!(Self, table.ptr)
}
}
impl<T> Drop for BoxSlice<T> {

View File

@@ -698,6 +698,18 @@ fn wrong_index() {
);
}
fn iterative_fib_baseline(n: u32) -> u32 {
let (mut a, mut b) = (1, 1);
for _ in 0..n {
let old_a = a;
a = b;
b += old_a;
}
a
}
const FIBONACCI: &str = r#"
(module
(func $fib (param $n i32) (result i32)
@@ -745,25 +757,59 @@ const FIBONACCI: &str = r#"
#[test]
fn fib() {
fn fib(n: u32) -> u32 {
let (mut a, mut b) = (1, 1);
for _ in 0..n {
let old_a = a;
a = b;
b += old_a;
}
a
}
let translated = translate_wat(FIBONACCI);
translated.disassemble();
for x in 0..30 {
assert_eq!(
translated.execute_func::<_, u32>(0, (x,)),
Ok(fib(x)),
Ok(iterative_fib_baseline(x)),
"Failed for x={}",
x
);
}
}
// Generated by Rust for the `fib` function in `bench_fibonacci_baseline`
const FIBONACCI_OPT: &str = r"
(module
(func $fib (param $p0 i32) (result i32)
(local $l1 i32)
(set_local $l1
(i32.const 1))
(block $B0
(br_if $B0
(i32.lt_u
(get_local $p0)
(i32.const 2)))
(set_local $l1
(i32.const 1))
(loop $L1
(set_local $l1
(i32.add
(call $fib
(i32.add
(get_local $p0)
(i32.const -1)))
(get_local $l1)))
(br_if $L1
(i32.gt_u
(tee_local $p0
(i32.add
(get_local $p0)
(i32.const -2)))
(i32.const 1)))))
(get_local $l1)))";
#[test]
fn fib_opt() {
let translated = translate_wat(FIBONACCI_OPT);
translated.disassemble();
for x in 0..30 {
assert_eq!(
translated.execute_func::<_, u32>(0, (x,)),
Ok(iterative_fib_baseline(x)),
"Failed for x={}",
x
);
@@ -940,7 +986,7 @@ fn bench_fibonacci_compile(b: &mut test::Bencher) {
#[bench]
fn bench_fibonacci_run(b: &mut test::Bencher) {
let wasm = wabt::wat2wasm(FIBONACCI).unwrap();
let wasm = wabt::wat2wasm(FIBONACCI_OPT).unwrap();
let module = translate(&wasm).unwrap();
b.iter(|| module.execute_func::<_, u32>(0, (20,)));