diff --git a/Cargo.toml b/Cargo.toml index 08c685fcd4..61149c7b7f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,8 @@ arrayvec = "0.4" dynasm = "0.2.3" dynasmrt = "0.2.3" wasmparser = { path = "./wasmparser.rs" } +memoffset = "0.2" +itertools = "0.8" capstone = "0.5.0" failure = "0.1.3" failure_derive = "0.1.3" diff --git a/src/backend.rs b/src/backend.rs index 0d380064d8..dba915f2f1 100644 --- a/src/backend.rs +++ b/src/backend.rs @@ -384,7 +384,7 @@ pub struct Locals { /// registers that this can contain. If we need to move the argument /// out of a register (for example, because we're calling a function) /// we note that down here, so we don't have to move it back afterwards. - register_arguments: ArrayVec<[ArgLoc; ARGS_IN_GPRS.len()]>, + register_locals: ArrayVec<[ArgLoc; ARGS_IN_GPRS.len()]>, /// The number of arguments stored on the stack. num_stack_args: u32, /// The number of local stack slots, i.e. the amount of stack space reserved for locals. @@ -393,7 +393,7 @@ pub struct Locals { impl Locals { fn register(&self, index: u32) -> Option { - if index < self.register_arguments.len() as u32 { + if index < self.register_locals.len() as u32 { Some(ARGS_IN_GPRS[index as usize]) } else { None @@ -401,19 +401,19 @@ impl Locals { } fn add_pos(&mut self, index: u32, loc: ValueLocation) { - self.register_arguments[index as usize].add_loc(loc); + self.register_locals[index as usize].add_loc(loc); } fn set_pos(&mut self, index: u32, loc: ValueLocation) { - self.register_arguments[index as usize] = ArgLoc::from_loc(loc); + self.register_locals[index as usize] = ArgLoc::from_loc(loc); } fn get(&self, index: u32) -> ValueLocation { - self.register_arguments + self.register_locals .get(index as usize) .map(ArgLoc::best_loc) .unwrap_or_else(|| { - let stack_index = index - self.register_arguments.len() as u32; + let stack_index = index - self.register_locals.len() as u32; if stack_index < self.num_stack_args { ValueLocation::Stack( ((stack_index + 
self.num_local_stack_slots + 2) * WORD_SIZE) as _, @@ -426,7 +426,7 @@ impl Locals { } fn num_args(&self) -> u32 { - self.register_arguments.len() as u32 + self.num_stack_args + self.register_locals.len() as u32 + self.num_stack_args } fn vmctx_index(&self) -> u32 { @@ -1245,20 +1245,20 @@ impl Context<'_> { parent_block_state: BlockState, before_push_return: impl FnOnce(&mut Self), ) { - // TODO: This should currently never be called, but is important for if we want to - // have a more complex stack spilling scheme. debug_assert_eq!( self.block_state.depth, parent_block_state.depth, "Imbalanced pushes and pops" ); + // TODO: This should currently never be called, but is important for if we want to + // have a more complex stack spilling scheme. + // TODO: This should use an `end_locals`-style system where we only do this when + // control flow splits. if self.block_state.depth != parent_block_state.depth { dynasm!(self.asm ; add rsp, ((self.block_state.depth.0 - parent_block_state.depth.0) * WORD_SIZE) as i32 ); } - self.restore_locals(); - let return_reg = self.block_state.return_register; let locals = mem::replace(&mut self.block_state.locals, Default::default()); self.block_state = parent_block_state; @@ -1282,10 +1282,10 @@ impl Context<'_> { for (src, dst) in self .block_state .locals - .register_arguments + .register_locals .clone() .iter() - .zip(&locals.register_arguments) + .zip(&locals.register_locals) { self.copy_value(src.best_loc(), dst.best_loc()); } @@ -1293,9 +1293,9 @@ impl Context<'_> { for (src, dst) in self .block_state .locals - .register_arguments + .register_locals .iter_mut() - .zip(&locals.register_arguments) + .zip(&locals.register_locals) { src.union(*dst); } @@ -1715,7 +1715,7 @@ impl Context<'_> { if let Some(cur) = self .block_state .locals - .register_arguments + .register_locals .get_mut(local_idx as usize) { *cur = ArgLoc::from_loc(dst_loc); @@ -1736,7 +1736,7 @@ impl Context<'_> { if let Some(cur) = self .block_state .locals - 
.register_arguments + .register_locals .get_mut(local_idx as usize) { *cur = ArgLoc::from_loc(dst_loc); @@ -1821,10 +1821,10 @@ impl Context<'_> { fn free_arg_registers(&mut self, exclude: Option<u32>) { // This is bound to the maximum size of the `ArrayVec` amd so can be considered to have constant // runtime - for i in (0..self.block_state.locals.register_arguments.len()) + for i in (0..self.block_state.locals.register_locals.len()) .filter(|i| exclude != Some(*i as u32)) { - match self.block_state.locals.register_arguments[i] { + match self.block_state.locals.register_locals[i] { ArgLoc::Register(reg) => { if ARGS_IN_GPRS.contains(&reg) { let offset = @@ -1832,7 +1832,7 @@ impl Context<'_> { * WORD_SIZE) as _; let dst = ValueLocation::Stack(offset); self.copy_value(ValueLocation::Reg(reg), dst); - self.block_state.locals.register_arguments[i].add_stack(offset); + self.block_state.locals.register_locals[i].add_stack(offset); } } _ => {} @@ -1989,7 +1989,7 @@ impl Context<'_> { // If these values were in register they've now been invalidated, since // the callee can use them as scratch. 
- for loc in self.block_state.locals.register_arguments.iter_mut() { + for loc in self.block_state.locals.register_locals.iter_mut() { if let Some(offset) = loc.stack() { *loc = ArgLoc::Stack(offset); } @@ -2097,7 +2097,8 @@ impl Context<'_> { pub fn start_function(&mut self, arguments: u32, locals: u32) -> FunctionEnd { // To support `vmctx` let arguments = arguments + 1; - let reg_args = &ARGS_IN_GPRS[..(arguments as usize).min(ARGS_IN_GPRS.len())]; + let (reg_args, locals_in_gprs) = ARGS_IN_GPRS.split_at((arguments as usize).min(ARGS_IN_GPRS.len())); + let reg_locals = &locals_in_gprs[..(locals as usize).min(locals_in_gprs.len())]; // We need space to store the register arguments if we need to call a function // and overwrite these registers so we add `reg_args.len()` @@ -2107,8 +2108,8 @@ impl Context<'_> { let aligned_stack_slots = (stack_slots + 1) & !1; let frame_size: i32 = aligned_stack_slots as i32 * WORD_SIZE as i32; - self.block_state.locals.register_arguments = - reg_args.iter().cloned().map(ArgLoc::Register).collect(); + self.block_state.locals.register_locals = + reg_args.iter().chain(reg_locals).cloned().map(ArgLoc::Register).collect(); self.block_state.locals.num_stack_args = arguments.saturating_sub(ARGS_IN_GPRS.len() as _); self.block_state.locals.num_local_stack_slots = stack_slots; self.block_state.return_register = Some(RAX); diff --git a/src/function_body.rs b/src/function_body.rs index 246bc37ccf..e30b92f19a 100644 --- a/src/function_body.rs +++ b/src/function_body.rs @@ -40,6 +40,17 @@ impl ControlFrameKind { } } + fn end_labels(&self) -> impl Iterator { + self.block_end() + .into_iter() + .chain(if let ControlFrameKind::IfTrue { if_not, .. } = self { + // this is `if .. end` construction. Define the `if_not` label. + Some(*if_not) + } else { + None + }) + } + fn is_loop(&self) -> bool { match *self { ControlFrameKind::Loop { .. 
} => true, @@ -174,12 +185,18 @@ pub fn translate( return_ty, )); + let mut operators = itertools::put_back(operators.into_iter()); + // TODO: We want to make this a state machine (maybe requires 1-element lookahead? Not sure) so that we // can coelesce multiple `end`s and optimise break-at-end-of-block into noop. // TODO: Does coelescing multiple `end`s matter since at worst this really only elides a single move at // the end of a function, and this is probably a no-op anyway due to register renaming. - for op in operators { - let op = op?; + loop { + let op = if let Some(op) = operators.next() { + op? + } else { + break; + }; match op { Operator::End | Operator::Else => {} @@ -306,28 +323,60 @@ pub fn translate( // // This doesn't require lookahead but it does require turning this loop into // a kind of state machine. - let control_frame = control_frames.pop().expect("control stack is never empty"); + let mut control_frame = control_frames.pop().expect("control stack is never empty"); + let mut labels = control_frame + .kind + .end_labels() + .collect::<Vec<_>>(); + + let mut end = control_frame.block_state.end_locals.take(); + + // Fold `End`s together to prevent unnecessary shuffling of locals + loop { + let op = if let Some(op) = operators.next() { + op? + } else { + break; + }; + + match op { + Operator::End => { + control_frame = + control_frames.pop().expect("control stack is never empty"); + + labels.extend(control_frame.kind.end_labels()); + + end = control_frame.block_state.end_locals.take().or(end); + } + other => { + operators.put_back(Ok(other)); + break; + } + } + } let arity = control_frame.arity(); // Don't bother generating this code if we're in unreachable code if !control_frame.unreachable { ctx.return_from_block(arity); + + // If there are no remaining frames we've hit the end of the function - we don't need to + // restore locals since no execution will happen after this point. 
+ if !control_frames.is_empty() { + if let Some(end) = end { + ctx.restore_locals_to(&end); + } + } } - let block_end = control_frame.kind.block_end(); // TODO: What is the correct order of this and the `define_label`? It's clear for `block`s // but I'm not certain for `if..then..else..end`. ctx.end_block(control_frame.block_state, |ctx| { - if let Some(block_end) = block_end { - ctx.define_label(block_end); + for label in labels { + ctx.define_label(label); } }); - - if let ControlFrameKind::IfTrue { if_not, .. } = control_frame.kind { - // this is `if .. end` construction. Define the `if_not` label here. - ctx.define_label(if_not); - } } Operator::I32Eq => ctx.i32_eq(), Operator::I32Eqz => ctx.i32_eqz(), diff --git a/src/lib.rs b/src/lib.rs index 6c744a62e5..408211410a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,7 +9,10 @@ extern crate failure; extern crate wasmparser; #[macro_use] extern crate failure_derive; +#[macro_use] +extern crate memoffset; extern crate dynasmrt; +extern crate itertools; #[cfg(test)] #[macro_use] extern crate lazy_static; @@ -28,4 +31,4 @@ mod translate_sections; #[cfg(test)] mod tests; -pub use module::{translate, TranslatedModule, ExecutableModule}; +pub use module::{translate, ExecutableModule, TranslatedModule}; diff --git a/src/module.rs b/src/module.rs index ccfda9df4b..aa27a5e027 100644 --- a/src/module.rs +++ b/src/module.rs @@ -254,6 +254,10 @@ impl VmCtx { pub fn offset_of_memory() -> usize { mem::size_of::() } + + pub fn offset_of_funcs_ptr() -> usize { + offset_of!(Self, table.ptr) + } } impl Drop for BoxSlice { diff --git a/src/tests.rs b/src/tests.rs index 3df0d6e852..b56f3a3ca6 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -698,6 +698,18 @@ fn wrong_index() { ); } + fn iterative_fib_baseline(n: u32) -> u32 { + let (mut a, mut b) = (1, 1); + + for _ in 0..n { + let old_a = a; + a = b; + b += old_a; + } + + a + } + const FIBONACCI: &str = r#" (module (func $fib (param $n i32) (result i32) @@ -745,25 +757,59 @@ const 
FIBONACCI: &str = r#" #[test] fn fib() { - fn fib(n: u32) -> u32 { - let (mut a, mut b) = (1, 1); - - for _ in 0..n { - let old_a = a; - a = b; - b += old_a; - } - - a - } - let translated = translate_wat(FIBONACCI); translated.disassemble(); for x in 0..30 { assert_eq!( translated.execute_func::<_, u32>(0, (x,)), - Ok(fib(x)), + Ok(iterative_fib_baseline(x)), + "Failed for x={}", + x + ); + } +} + +// Generated by Rust for the `fib` function in `bench_fibonacci_baseline` +const FIBONACCI_OPT: &str = r" +(module + (func $fib (param $p0 i32) (result i32) + (local $l1 i32) + (set_local $l1 + (i32.const 1)) + (block $B0 + (br_if $B0 + (i32.lt_u + (get_local $p0) + (i32.const 2))) + (set_local $l1 + (i32.const 1)) + (loop $L1 + (set_local $l1 + (i32.add + (call $fib + (i32.add + (get_local $p0) + (i32.const -1))) + (get_local $l1))) + (br_if $L1 + (i32.gt_u + (tee_local $p0 + (i32.add + (get_local $p0) + (i32.const -2))) + (i32.const 1))))) + (get_local $l1)))"; + +#[test] +fn fib_opt() { + let translated = translate_wat(FIBONACCI_OPT); + translated.disassemble(); + + for x in 0..30 { + assert_eq!( + translated.execute_func::<_, u32>(0, (x,)), + Ok(iterative_fib_baseline(x)), "Failed for x={}", x ); @@ -940,7 +986,7 @@ fn bench_fibonacci_compile(b: &mut test::Bencher) { #[bench] fn bench_fibonacci_run(b: &mut test::Bencher) { - let wasm = wabt::wat2wasm(FIBONACCI).unwrap(); + let wasm = wabt::wat2wasm(FIBONACCI_OPT).unwrap(); let module = translate(&wasm).unwrap(); b.iter(|| module.execute_func::<_, u32>(0, (20,)));