From b10f8cf3228aa609ce525ab8f8d42484c282998b Mon Sep 17 00:00:00 2001 From: Artur Jamro Date: Tue, 6 Aug 2019 17:19:26 -0700 Subject: [PATCH] Partial hashing of module for faster caching (#221) * Simple module compilation cache * Fix base64 encoding bug * Use warn! everywhere in cache system * Remove unused import * Temporary workaround for long path on Windows * Remove unused import for non-windows builds * Add command line argument to enable cache system + apply minor review feedback * Initial implementation of partial module hashing * Proper module hashing for the cache * Use newer version of cranelift --- wasmtime-environ/src/cache.rs | 77 +++++++++++++++++--------- wasmtime-environ/src/cranelift.rs | 8 ++- wasmtime-environ/src/module.rs | 45 +++++++++++---- wasmtime-environ/src/module_environ.rs | 7 +-- 4 files changed, 94 insertions(+), 43 deletions(-) diff --git a/wasmtime-environ/src/cache.rs b/wasmtime-environ/src/cache.rs index 7e8ebb92b8..a32f2e7d88 100644 --- a/wasmtime-environ/src/cache.rs +++ b/wasmtime-environ/src/cache.rs @@ -1,13 +1,17 @@ use crate::address_map::ModuleAddressMap; use crate::compilation::{CodeAndJTOffsets, Compilation, Relocations}; use crate::module::Module; -use cranelift_codegen::ir; -use cranelift_codegen::isa; +use crate::module_environ::FunctionBodyData; +use core::hash::Hasher; +use cranelift_codegen::{ir, isa}; +use cranelift_entity::PrimaryMap; +use cranelift_wasm::DefinedFuncIndex; use directories::ProjectDirs; use lazy_static::lazy_static; use log::{debug, warn}; use serde::de::{self, Deserialize, Deserializer, MapAccess, SeqAccess, Visitor}; use serde::ser::{self, Serialize, SerializeSeq, SerializeStruct, Serializer}; +use sha2::{Digest, Sha256}; use std::ffi::OsString; use std::fmt; use std::fs; @@ -102,36 +106,38 @@ pub struct ModuleCacheData { type ModuleCacheDataTupleType = (Compilation, Relocations, ModuleAddressMap); +struct Sha256Hasher(Sha256); + impl ModuleCacheEntry { - pub fn new( + pub fn new<'data>( module: &Module, + function_body_inputs: &PrimaryMap>, isa: &dyn isa::TargetIsa, compiler_name: &str, generate_debug_info: bool, ) -> Self { let mod_cache_path = if conf::cache_enabled() { - CACHE_DIR.clone().and_then(|p| { - module.hash.map(|hash| { - let compiler_dir = if cfg!(debug_assertions) { - format!( - "{comp_name}-{comp_ver}-{comp_mtime}", - comp_name = compiler_name, - comp_ver = env!("GIT_REV"), - comp_mtime = *SELF_MTIME, - ) - } else { - format!( - "{comp_name}-{comp_ver}", - comp_name = compiler_name, - comp_ver = env!("GIT_REV"), - ) - }; - p.join(isa.name()).join(compiler_dir).join(format!( - "mod-{mod_hash}{mod_dbg}", - mod_hash = base64::encode_config(&hash, base64::URL_SAFE_NO_PAD), // standard encoding uses '/' which can't be used for filename - mod_dbg = if generate_debug_info { ".d" } else { "" }, - )) - }) + CACHE_DIR.clone().map(|p| { + let hash = Sha256Hasher::digest(module, function_body_inputs); + let compiler_dir = if cfg!(debug_assertions) { + format!( + "{comp_name}-{comp_ver}-{comp_mtime}", + comp_name = compiler_name, + comp_ver = env!("GIT_REV"), + comp_mtime = *SELF_MTIME, + ) + } else { + format!( + "{comp_name}-{comp_ver}", + comp_name = compiler_name, + comp_ver = env!("GIT_REV"), + ) + }; + p.join(isa.name()).join(compiler_dir).join(format!( + "mod-{mod_hash}{mod_dbg}", + mod_hash = base64::encode_config(&hash, base64::URL_SAFE_NO_PAD), // standard encoding uses '/' which can't be used for filename + mod_dbg = if generate_debug_info { ".d" } else { "" }, + )) }) } else { None @@ -227,6 +233,27 @@ impl ModuleCacheData { } } +impl Sha256Hasher { + pub fn digest<'data>( + module: &Module, + function_body_inputs: &PrimaryMap>, + ) -> [u8; 32] { + let mut hasher = Self(Sha256::new()); + module.hash_for_cache(function_body_inputs, &mut hasher); + hasher.0.result().into() + } +} + +impl Hasher for Sha256Hasher { + fn finish(&self) -> u64 { + panic!("Sha256Hasher doesn't support finish!"); + } + + fn write(&mut self, bytes: &[u8]) { + self.0.input(bytes); + } +} + //-//////////////////////////////////////////////////////////////////// // Serialization and deserialization of type containing SecondaryMap // //-//////////////////////////////////////////////////////////////////// diff --git a/wasmtime-environ/src/cranelift.rs b/wasmtime-environ/src/cranelift.rs index 83e1734f85..ba8b49c76b 100644 --- a/wasmtime-environ/src/cranelift.rs +++ b/wasmtime-environ/src/cranelift.rs @@ -124,7 +124,13 @@ impl crate::compilation::Compiler for Cranelift { isa: &dyn isa::TargetIsa, generate_debug_info: bool, ) -> Result<(Compilation, Relocations, ModuleAddressMap), CompileError> { - let cache_entry = ModuleCacheEntry::new(module, isa, "cranelift", generate_debug_info); + let cache_entry = ModuleCacheEntry::new( + module, + &function_body_inputs, + isa, + "cranelift", + generate_debug_info, + ); let data = match cache_entry.get_data() { Some(data) => data, diff --git a/wasmtime-environ/src/module.rs b/wasmtime-environ/src/module.rs index 3bcdea93ba..bc7b33138f 100644 --- a/wasmtime-environ/src/module.rs +++ b/wasmtime-environ/src/module.rs @@ -1,6 +1,8 @@ //! Data structures for representing decoded wasm modules. +use crate::module_environ::FunctionBodyData; use crate::tunables::Tunables; +use core::hash::{Hash, Hasher}; use cranelift_codegen::ir; use cranelift_entity::{EntityRef, PrimaryMap}; use cranelift_wasm::{ @@ -13,7 +15,7 @@ use std::string::String; use std::vec::Vec; /// A WebAssembly table initializer. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Hash)] pub struct TableElements { /// The index of a table to initialize. pub table_index: TableIndex, @@ -26,7 +28,7 @@ pub struct TableElements { } /// An entity to export. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] pub enum Export { /// Function export. Function(FuncIndex), @@ -39,7 +41,7 @@ pub enum Export { } /// Implemenation styles for WebAssembly linear memory. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub enum MemoryStyle { /// The actual memory can be resized and moved. Dynamic, @@ -77,7 +79,7 @@ impl MemoryStyle { /// A WebAssembly linear memory description along with our chosen style for /// implementing it. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub struct MemoryPlan { /// The WebAssembly linear memory description. pub memory: Memory, @@ -100,7 +102,7 @@ impl MemoryPlan { } /// Implemenation styles for WebAssembly tables. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub enum TableStyle { /// Signatures are stored in the table and checked in the caller. CallerChecksSignature, @@ -115,7 +117,7 @@ impl TableStyle { /// A WebAssembly table description along with our chosen style for /// implementing it. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Hash)] pub struct TablePlan { /// The WebAssembly table description. pub table: cranelift_wasm::Table, @@ -133,6 +135,7 @@ impl TablePlan { /// A translated WebAssembly module, excluding the function bodies and /// memory initializers. +// WARNING: when modifying, make sure that `hash_for_cache` is still valid! #[derive(Debug)] pub struct Module { /// Unprocessed signatures exactly as provided by `declare_signature()`. @@ -170,10 +173,6 @@ pub struct Module { /// WebAssembly table initializers. pub table_elements: Vec, - - /// Hash of the source wasm code if this module is not synthesized. - /// TODO: this is temporary workaround. Will be replaced with derive macro. - pub hash: Option<[u8; 32]>, } impl Module { @@ -192,7 +191,6 @@ impl Module { exports: IndexMap::new(), start_func: None, table_elements: Vec::new(), - hash: None, } } @@ -283,4 +281,29 @@ impl Module { pub fn is_imported_global(&self, index: GlobalIndex) -> bool { index.index() < self.imported_globals.len() } + + /// Computes hash of the module for the purpose of caching. + pub fn hash_for_cache<'data, H>( + &self, + function_body_inputs: &PrimaryMap>, + state: &mut H, + ) where + H: Hasher, + { + // There's no need to cache names (strings), start function + // and data initializers (for both memory and tables) + self.signatures.hash(state); + self.functions.hash(state); + self.table_plans.hash(state); + self.memory_plans.hash(state); + self.globals.hash(state); + // IndexMap (self.export) iterates over values in order of item inserts + // Let's actually sort the values. + let mut exports = self.exports.values().collect::>(); + exports.sort(); + for val in exports { + val.hash(state); + } + function_body_inputs.hash(state); + } } diff --git a/wasmtime-environ/src/module_environ.rs b/wasmtime-environ/src/module_environ.rs index 8122864b68..a38359be3c 100644 --- a/wasmtime-environ/src/module_environ.rs +++ b/wasmtime-environ/src/module_environ.rs @@ -10,12 +10,12 @@ use cranelift_wasm::{ self, translate_module, DefinedFuncIndex, FuncIndex, Global, GlobalIndex, Memory, MemoryIndex, SignatureIndex, Table, TableIndex, WasmResult, }; -use sha2::{Digest, Sha256}; use std::boxed::Box; use std::string::String; use std::vec::Vec; /// Contains function data: byte code and its offset in the module. +#[derive(Hash)] pub struct FunctionBodyData<'a> { /// Body byte code. pub data: &'a [u8], @@ -80,11 +80,6 @@ impl<'data> ModuleEnvironment<'data> { pub fn translate(mut self, data: &'data [u8]) -> WasmResult> { translate_module(data, &mut self)?; - // TODO: this is temporary workaround and will be replaced with derive macro. - let mut hasher = Sha256::new(); - hasher.input(data); - self.result.module.hash = Some(hasher.result().into()); - Ok(self.result) } }