Expose memory-related options in Config (#1513)
* Expose memory-related options in `Config`

This commit was initially motivated by looking more into #1501, but it ended up ballooning a bit after finding a few issues. The high-level items in this commit are:

* New configuration options are exposed via `wasmtime::Config` to configure the tunable limits of how memories are allocated and such.

* The `MemoryCreator` trait has been updated to accurately reflect the required allocation characteristics that JIT code expects.

* A bug has been fixed in the cranelift wasm code generation where, if no guard page was present, bounds checks weren't accurately performed.

The new `Config` methods allow tuning the memory allocation characteristics of wasmtime. Currently 64-bit platforms will reserve 6GB chunks of memory for each linear memory, but by tweaking various config options you can change how this is allocated, perhaps at the cost of slower JIT code since it needs more bounds checks. The methods are intended to be pretty thoroughly documented as to the effect they have on the JIT code and what values you may wish to select. These new methods have been added to the spectest fuzzer to ensure that various configuration values for these methods don't affect correctness.

The `MemoryCreator` trait previously only allocated memories with a `MemoryType`, but this didn't actually reflect the guarantees that JIT code expected. JIT code is generated with an assumption about the minimum size of the guard region, as well as whether memory is static or dynamic (i.e. whether the base pointer can be relocated). These properties must be upheld by custom allocation engines for JIT code to perform correctly, so extra parameters have been added to `MemoryCreator::new_memory` to reflect this.

Finally, the fuzzing with `Config` turned up an issue where, if no guard pages were present, the wasm code wouldn't correctly bounds-check memory accesses. The issue here was that with a guard page we only need to bounds-check the first byte of the access, but without a guard page we need to bounds-check the last byte of the access. This meant that the code generation needed to account for the size of the memory operation (load/store) and use this as the offset-to-check in the no-guard-page scenario. I've attempted to make the various comments in cranelift a bit more exhaustive too, to hopefully make it a bit clearer for future readers!

Closes #1501

* Review comments

* Update a comment
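As a quick sketch of the new surface area (not part of the diff; the values are illustrative and only the `Config` method names are taken from this commit), a host that can't afford multi-gigabyte reservations might configure an engine like this:

use wasmtime::{Config, Engine, Store};

fn main() {
    let mut config = Config::new();
    // Force every linear memory to be dynamic (a threshold of 0 means no
    // memory qualifies as static) and shrink both guard regions to 64KB,
    // trading extra bounds checks in JIT code for a minimal footprint.
    config
        .static_memory_maximum_size(0)
        .static_memory_guard_size(0x10000)
        .dynamic_memory_guard_size(0x10000);
    let _store = Store::new(&Engine::new(&config));
}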
@@ -114,7 +114,9 @@ fn static_addr(
     let mut pos = FuncCursor::new(func).at_inst(inst);
     pos.use_srcloc(inst);

-    // Start with the bounds check. Trap if `offset + access_size > bound`.
+    // The goal here is to trap if `offset + access_size > bound`.
+    //
+    // This first case is a trivial case where we can easily trap.
     if access_size > bound {
+        // This will simply always trap since `offset >= 0`.
         pos.ins().trap(ir::TrapCode::HeapOutOfBounds);
@@ -129,11 +131,21 @@ fn static_addr(
         return;
     }

-    // Check `offset > limit` which is now known non-negative.
+    // After the trivial case is done we're now mostly interested in trapping
+    // if `offset > bound - access_size`. We know `bound - access_size` here is
+    // non-negative from the above comparison.
+    //
+    // If we can know `bound - access_size >= 4GB` then with a 32-bit offset
+    // we're guaranteed:
+    //
+    //      bound - access_size >= 4GB > offset
+    //
+    // or, in other words, `offset < bound - access_size`, meaning we can't trap
+    // for any value of `offset`.
+    //
+    // With that we have an optimization here where with 32-bit offsets and
+    // `bound - access_size >= 4GB` we can omit a bounds check.
     let limit = bound - access_size;

-    // We may be able to omit the check entirely for 32-bit offsets if the heap bound is 4 GB or
-    // more.
     if offset_ty != ir::types::I32 || limit < 0xffff_ffff {
         let oob = if limit & 1 == 1 {
             // Prefer testing `offset >= limit - 1` when limit is odd because an even number is
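To make the elision condition above concrete, here is a worked example (not part of the diff; a hedged restatement of the same arithmetic in plain Rust):

fn main() {
    // A 4GB static heap bound, as wasmtime configures by default on 64-bit.
    let bound: u64 = 0x1_0000_0000;

    // Guard-page case: `get_heap_addr` (later in this commit) shrinks the
    // checked access size to 1 byte, so `limit` is 0xFFFF_FFFF. No 32-bit
    // offset can exceed it and the conditional trap is omitted entirely.
    assert!(bound - 1 >= 0xffff_ffff);

    // No-guard case: a 4-byte access keeps its full width, `limit` drops to
    // 0xFFFF_FFFC, and an explicit bounds check must still be emitted.
    assert!(bound - 4 < 0xffff_ffff);
}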
@@ -39,6 +39,8 @@ use cranelift_codegen::ir::{
 };
 use cranelift_codegen::packed_option::ReservedValue;
 use cranelift_frontend::{FunctionBuilder, Variable};
+use std::cmp;
+use std::convert::TryFrom;
 use std::vec::Vec;
 use wasmparser::{MemoryImmediate, Operator};
@@ -655,42 +657,42 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         Operator::I16x8Load8x8S {
             memarg: MemoryImmediate { flags: _, offset },
         } => {
-            let (flags, base, offset) = prepare_load(*offset, builder, state, environ)?;
+            let (flags, base, offset) = prepare_load(*offset, 8, builder, state, environ)?;
             let loaded = builder.ins().sload8x8(flags, base, offset);
             state.push1(loaded);
         }
         Operator::I16x8Load8x8U {
             memarg: MemoryImmediate { flags: _, offset },
         } => {
-            let (flags, base, offset) = prepare_load(*offset, builder, state, environ)?;
+            let (flags, base, offset) = prepare_load(*offset, 8, builder, state, environ)?;
             let loaded = builder.ins().uload8x8(flags, base, offset);
             state.push1(loaded);
         }
         Operator::I32x4Load16x4S {
             memarg: MemoryImmediate { flags: _, offset },
         } => {
-            let (flags, base, offset) = prepare_load(*offset, builder, state, environ)?;
+            let (flags, base, offset) = prepare_load(*offset, 8, builder, state, environ)?;
             let loaded = builder.ins().sload16x4(flags, base, offset);
             state.push1(loaded);
         }
         Operator::I32x4Load16x4U {
             memarg: MemoryImmediate { flags: _, offset },
         } => {
-            let (flags, base, offset) = prepare_load(*offset, builder, state, environ)?;
+            let (flags, base, offset) = prepare_load(*offset, 8, builder, state, environ)?;
             let loaded = builder.ins().uload16x4(flags, base, offset);
             state.push1(loaded);
         }
         Operator::I64x2Load32x2S {
             memarg: MemoryImmediate { flags: _, offset },
         } => {
-            let (flags, base, offset) = prepare_load(*offset, builder, state, environ)?;
+            let (flags, base, offset) = prepare_load(*offset, 8, builder, state, environ)?;
             let loaded = builder.ins().sload32x2(flags, base, offset);
             state.push1(loaded);
         }
         Operator::I64x2Load32x2U {
             memarg: MemoryImmediate { flags: _, offset },
         } => {
-            let (flags, base, offset) = prepare_load(*offset, builder, state, environ)?;
+            let (flags, base, offset) = prepare_load(*offset, 8, builder, state, environ)?;
             let loaded = builder.ins().uload32x2(flags, base, offset);
             state.push1(loaded);
         }
@@ -1701,25 +1703,70 @@ fn get_heap_addr(
     heap: ir::Heap,
     addr32: ir::Value,
     offset: u32,
+    width: u32,
     addr_ty: Type,
     builder: &mut FunctionBuilder,
 ) -> (ir::Value, i32) {
-    use core::cmp::min;
-
-    let mut adjusted_offset = u64::from(offset);
     let offset_guard_size: u64 = builder.func.heaps[heap].offset_guard_size.into();

-    // Generate `heap_addr` instructions that are friendly to CSE by checking offsets that are
-    // multiples of the offset-guard size. Add one to make sure that we check the pointer itself
-    // is in bounds.
-    if offset_guard_size != 0 {
-        adjusted_offset = adjusted_offset / offset_guard_size * offset_guard_size;
-    }
-
-    // For accesses on the outer skirts of the offset-guard pages, we expect that we get a trap
-    // even if the access goes beyond the offset-guard pages. This is because the first byte
-    // pointed to is inside the offset-guard pages.
-    let check_size = min(u64::from(u32::MAX), 1 + adjusted_offset) as u32;
+    // How exactly the bounds check is performed here and what it's performed
+    // on is a bit tricky. Generally we want to rely on access violations (e.g.
+    // segfaults) to generate traps since that means we don't have to bounds
+    // check anything explicitly.
+    //
+    // If we don't have a guard page of unmapped memory, though, then we can't
+    // rely on this trapping behavior through segfaults. Instead we need to
+    // bounds-check the entire memory access here which is everything from
+    // `addr32 + offset` to `addr32 + offset + width` (not inclusive). In this
+    // scenario our adjusted offset that we're checking is `offset + width`.
+    //
+    // If we have a guard page, however, then we can perform a further
+    // optimization of the generated code by only checking multiples of the
+    // offset-guard size to be more CSE-friendly. Knowing that we have at least
+    // 1 page of a guard page we're then able to disregard the `width` since we
+    // know it's always less than one page. Our bounds check will be for the
+    // first byte which will either succeed and be guaranteed to fault if it's
+    // actually out of bounds, or the bounds check itself will fail. In any case
+    // we assert that the width is reasonably small for now so this assumption
+    // can be adjusted in the future if we get larger widths.
+    //
+    // Put another way we can say, where `y < offset_guard_size`:
+    //
+    //      n * offset_guard_size + y = offset
+    //
+    // We'll then pass `n * offset_guard_size` as the bounds check value. If
+    // this traps then our `offset` would have trapped anyway. If this check
+    // passes we know
+    //
+    //      addr32 + n * offset_guard_size < bound
+    //
+    // which means
+    //
+    //      addr32 + n * offset_guard_size + y < bound + offset_guard_size
+    //
+    // because `y < offset_guard_size`, which then means:
+    //
+    //      addr32 + offset < bound + offset_guard_size
+    //
+    // Since we know that the guard size bytes are all unmapped we're
+    // guaranteed that `offset` and the `width` bytes after it are either
+    // in-bounds or will hit the guard page, meaning we'll get the desired
+    // semantics we want.
+    //
+    // As one final comment on the bits with the guard size here, another goal
+    // of this is to hit an optimization in `heap_addr` where if the heap size
+    // minus the offset is >= 4GB then bounds checks are 100% eliminated. This
+    // means that with huge guard regions (e.g. our 2GB default) most adjusted
+    // offsets we're checking here are zero. This means that we'll hit the fast
+    // path and emit zero conditional traps for bounds checks.
+    let adjusted_offset = if offset_guard_size == 0 {
+        u64::from(offset) + u64::from(width)
+    } else {
+        assert!(width < 1024);
+        cmp::max(u64::from(offset) / offset_guard_size * offset_guard_size, 1)
+    };
+    debug_assert!(adjusted_offset > 0); // want to bounds check at least 1 byte
+    let check_size = u32::try_from(adjusted_offset).unwrap_or(u32::MAX);
     let base = builder.ins().heap_addr(addr_ty, heap, addr32, check_size);

     // Native load/store instructions take a signed `Offset32` immediate, so adjust the base
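The two branches of `adjusted_offset` are easier to see with numbers plugged in (not part of the diff; a hedged, standalone mirror of the logic above):

fn adjusted_offset(offset: u64, width: u64, offset_guard_size: u64) -> u64 {
    if offset_guard_size == 0 {
        // No guard region: the check must cover the last byte of the
        // access, i.e. everything up to `offset + width`.
        offset + width
    } else {
        // Guard region present: round down to a multiple of the guard size
        // for CSE-friendliness, checking at least 1 byte.
        core::cmp::max(offset / offset_guard_size * offset_guard_size, 1)
    }
}

fn main() {
    // With the 2GB default guard, small immediate offsets all collapse to a
    // 1-byte check, which `heap_addr` can then elide for 4GB-bound heaps.
    assert_eq!(adjusted_offset(16, 4, 0x8000_0000), 1);
    // Without a guard, a 4-byte load at offset 16 must check through byte 20.
    assert_eq!(adjusted_offset(16, 4, 0), 20);
}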
@@ -1736,6 +1783,7 @@ fn get_heap_addr(
 /// Prepare for a load; factors out common functionality between load and load_extend operations.
 fn prepare_load<FE: FuncEnvironment + ?Sized>(
     offset: u32,
+    loaded_bytes: u32,
     builder: &mut FunctionBuilder,
     state: &mut FuncTranslationState,
     environ: &mut FE,
@@ -1744,7 +1792,14 @@ fn prepare_load<FE: FuncEnvironment + ?Sized>(

     // We don't yet support multiple linear memories.
     let heap = state.get_heap(builder.func, 0, environ)?;
-    let (base, offset) = get_heap_addr(heap, addr32, offset, environ.pointer_type(), builder);
+    let (base, offset) = get_heap_addr(
+        heap,
+        addr32,
+        offset,
+        loaded_bytes,
+        environ.pointer_type(),
+        builder,
+    );

     // Note that we don't set `is_aligned` here, even if the load instruction's
     // alignment immediate says it's aligned, because WebAssembly's immediate
@@ -1763,7 +1818,13 @@ fn translate_load<FE: FuncEnvironment + ?Sized>(
     state: &mut FuncTranslationState,
     environ: &mut FE,
 ) -> WasmResult<()> {
-    let (flags, base, offset) = prepare_load(offset, builder, state, environ)?;
+    let (flags, base, offset) = prepare_load(
+        offset,
+        mem_op_size(opcode, result_ty),
+        builder,
+        state,
+        environ,
+    )?;
     let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base);
     state.push1(dfg.first_result(load));
     Ok(())
@@ -1782,7 +1843,14 @@ fn translate_store<FE: FuncEnvironment + ?Sized>(

     // We don't yet support multiple linear memories.
     let heap = state.get_heap(builder.func, 0, environ)?;
-    let (base, offset) = get_heap_addr(heap, addr32, offset, environ.pointer_type(), builder);
+    let (base, offset) = get_heap_addr(
+        heap,
+        addr32,
+        offset,
+        mem_op_size(opcode, val_ty),
+        environ.pointer_type(),
+        builder,
+    );
     // See the comments in `translate_load` about the flags.
     let flags = MemFlags::new();
     builder
@@ -1791,6 +1859,16 @@ fn translate_store<FE: FuncEnvironment + ?Sized>(
     Ok(())
 }

+fn mem_op_size(opcode: ir::Opcode, ty: Type) -> u32 {
+    match opcode {
+        ir::Opcode::Istore8 | ir::Opcode::Sload8 | ir::Opcode::Uload8 => 1,
+        ir::Opcode::Istore16 | ir::Opcode::Sload16 | ir::Opcode::Uload16 => 2,
+        ir::Opcode::Istore32 | ir::Opcode::Sload32 | ir::Opcode::Uload32 => 4,
+        ir::Opcode::Store | ir::Opcode::Load => ty.bytes(),
+        _ => panic!("unknown size of mem op for {:?}", opcode),
+    }
+}
+
 fn translate_icmp(cc: IntCC, builder: &mut FunctionBuilder, state: &mut FuncTranslationState) {
     let (arg0, arg1) = state.pop2();
     let val = builder.ins().icmp(cc, arg0, arg1);
@@ -870,8 +870,59 @@ pub unsafe trait LinearMemory {
 /// Note that this is a relatively new and experimental feature and it is recommended
 /// to be familiar with wasmtime runtime code to use it.
 pub unsafe trait MemoryCreator: Send + Sync {
-    /// Create new LinearMemory
-    fn new_memory(&self, ty: MemoryType) -> Result<Box<dyn LinearMemory>, String>;
+    /// Create a new `LinearMemory` object from the specified parameters.
+    ///
+    /// The type of memory being created is specified by `ty` which indicates
+    /// both the minimum and maximum size, in wasm pages.
+    ///
+    /// The `reserved_size` value indicates the expected size of the
+    /// reservation that is to be made for this memory. If this value is `None`
+    /// then the implementation is free to allocate memory as it sees fit. If
+    /// the value is `Some`, however, then the implementation is expected to
+    /// reserve that many bytes for the memory's allocation, plus the guard
+    /// size at the end. Note that this reservation need only be a virtual
+    /// memory reservation, physical memory does not need to be allocated
+    /// immediately. In this case `grow` should never move the base pointer and
+    /// the maximum size of `ty` is guaranteed to fit within `reserved_size`.
+    ///
+    /// The `guard_size` parameter indicates how many bytes of space, after the
+    /// memory allocation, are expected to be unmapped. JIT code will elide
+    /// bounds checks based on the `guard_size` provided, so for JIT code to
+    /// work correctly the memory returned will need to be properly guarded with
+    /// `guard_size` bytes left unmapped after the base allocation.
+    ///
+    /// Note that the `reserved_size` and `guard_size` options are tuned from
+    /// the various [`Config`](crate::Config) methods about memory
+    /// sizes/guards. Additionally these two values are guaranteed to be
+    /// multiples of the system page size.
+    fn new_memory(
+        &self,
+        ty: MemoryType,
+        reserved_size: Option<u64>,
+        guard_size: u64,
+    ) -> Result<Box<dyn LinearMemory>, String>;
 }

+#[cfg(test)]
+mod tests {
+    use crate::*;
+
+    // Assert that creating a memory via `Memory::new` respects the limits/tunables
+    // in `Config`.
+    #[test]
+    fn respect_tunables() {
+        let mut cfg = Config::new();
+        cfg.static_memory_maximum_size(0)
+            .dynamic_memory_guard_size(0);
+        let store = Store::new(&Engine::new(&cfg));
+        let ty = MemoryType::new(Limits::new(1, None));
+        let mem = Memory::new(&store, ty);
+        assert_eq!(mem.wasmtime_export.memory.offset_guard_size, 0);
+        match mem.wasmtime_export.memory.style {
+            wasmtime_environ::MemoryStyle::Dynamic => {}
+            other => panic!("unexpected style {:?}", other),
+        }
+    }
+}
+
 // Exports
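For reference, a minimal host-side implementation of the updated trait could look like the sketch below. This is not from the commit: it assumes this era's `LinearMemory` shape (`size`/`grow`/`as_ptr`, with sizes in wasm pages), only supports the configuration exercised by the tests in this commit (no reservation, zero guard size), and glosses over the thread-safety a real allocator must provide.

use std::cell::UnsafeCell;
use wasmtime::{LinearMemory, MemoryCreator, MemoryType};

const PAGE: usize = 0x10000; // wasm page size in bytes
const WASM_MAX_PAGES: u32 = 0x10000;

// A toy Vec-backed linear memory; `UnsafeCell` because the trait takes
// `&self` even for `grow`.
struct VecMemory {
    storage: UnsafeCell<Vec<u8>>,
    max_pages: u32,
}

unsafe impl LinearMemory for VecMemory {
    fn size(&self) -> u32 {
        (unsafe { &*self.storage.get() }.len() / PAGE) as u32
    }

    fn grow(&self, delta: u32) -> Option<u32> {
        let old = self.size();
        let new = old.checked_add(delta)?;
        if new > self.max_pages {
            return None;
        }
        // Growing may move the base pointer; that's only permissible
        // because `reserved_size` was `None` when this memory was made.
        unsafe { &mut *self.storage.get() }.resize(new as usize * PAGE, 0);
        Some(old)
    }

    fn as_ptr(&self) -> *mut u8 {
        unsafe { &mut *self.storage.get() }.as_mut_ptr()
    }
}

struct VecMemoryCreator;

unsafe impl MemoryCreator for VecMemoryCreator {
    fn new_memory(
        &self,
        ty: MemoryType,
        reserved_size: Option<u64>,
        guard_size: u64,
    ) -> Result<Box<dyn LinearMemory>, String> {
        // A plain Vec can't reserve address space up front or leave pages
        // unmapped, so reject any configuration that demands either.
        if reserved_size.is_some() || guard_size != 0 {
            return Err("reservations/guard regions unsupported".to_string());
        }
        let min = ty.limits().min();
        let max_pages = ty.limits().max().unwrap_or(WASM_MAX_PAGES);
        Ok(Box::new(VecMemory {
            storage: UnsafeCell::new(vec![0; min as usize * PAGE]),
            max_pages,
        }))
    }
}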
@@ -2,7 +2,8 @@ use crate::externals::MemoryCreator;
 use crate::trampoline::{MemoryCreatorProxy, StoreInstanceHandle};
 use anyhow::{bail, Result};
 use std::cell::RefCell;
-use std::cmp::min;
+use std::cmp;
+use std::convert::TryFrom;
 use std::fmt;
 use std::path::Path;
 use std::rc::{Rc, Weak};
@@ -45,9 +46,9 @@ impl Config {
         if cfg!(windows) {
             // For now, use a smaller footprint on Windows so that we don't
             // outstrip the paging file.
-            tunables.static_memory_bound = min(tunables.static_memory_bound, 0x100);
+            tunables.static_memory_bound = cmp::min(tunables.static_memory_bound, 0x100);
             tunables.static_memory_offset_guard_size =
-                min(tunables.static_memory_offset_guard_size, 0x10000);
+                cmp::min(tunables.static_memory_offset_guard_size, 0x10000);
         }

         let mut flags = settings::builder();
@@ -402,6 +403,183 @@ impl Config {
         self.memory_creator = Some(MemoryCreatorProxy { mem_creator });
         self
     }

+    /// Configures the maximum size, in bytes, where a linear memory is
+    /// considered static, above which it'll be considered dynamic.
+    ///
+    /// This function configures the threshold for whether wasm memories are
+    /// implemented as a dynamically relocatable chunk of memory or a
+    /// statically located chunk of memory. The `max_size` parameter here is
+    /// the size, in bytes, where if the maximum size of a linear memory is
+    /// below `max_size` then it will be statically allocated with enough
+    /// space to never have to move. If the maximum size of a linear memory
+    /// is larger than `max_size` then wasm memory will be dynamically
+    /// located and may move in memory through growth operations.
+    ///
+    /// Specifying a `max_size` of 0 means that all memories will be dynamic
+    /// and may be relocated through `memory.grow`. Also note that if any
+    /// wasm memory's maximum size is below `max_size` then it will still
+    /// reserve `max_size` bytes in the virtual memory space.
+    ///
+    /// ## Static vs Dynamic Memory
+    ///
+    /// Linear memories represent contiguous arrays of bytes, but they can
+    /// also be grown through the API and wasm instructions. When memory is
+    /// grown, if space hasn't been preallocated then growth may involve
+    /// relocating the base pointer in memory. Memories in Wasmtime are
+    /// classified in two different ways:
+    ///
+    /// * **static** - these memories preallocate all the space they'll ever
+    ///   need, meaning that the base pointer of these memories is never
+    ///   moved. Static memories may take more virtual memory space because
+    ///   of pre-reserving space for memories.
+    ///
+    /// * **dynamic** - these memories are not preallocated and may move
+    ///   during growth operations. Dynamic memories consume less virtual
+    ///   memory space because they don't need to preallocate space for
+    ///   future growth.
+    ///
+    /// Static memories can be optimized better in JIT code because once the
+    /// base address is loaded in a function it's known that we never need to
+    /// reload it because it never changes, `memory.grow` is generally a
+    /// pretty fast operation because the wasm memory is never relocated, and
+    /// under some conditions bounds checks can be elided on memory accesses.
+    ///
+    /// Dynamic memories can't be quite as heavily optimized because the base
+    /// address may need to be reloaded more often, they may require
+    /// relocating lots of data on `memory.grow`, and dynamic memories
+    /// require unconditional bounds checks on all memory accesses.
+    ///
+    /// ## Should you use static or dynamic memory?
+    ///
+    /// In general you probably don't need to change the value of this
+    /// property. The defaults here are optimized for each target platform to
+    /// consume a reasonable amount of physical memory while also generating
+    /// speedy machine code.
+    ///
+    /// One of the main reasons you may want to configure this today is if
+    /// your environment can't reserve virtual memory space for each wasm
+    /// linear memory. On 64-bit platforms wasm memories require a 6GB
+    /// reservation by default, and system limits may prevent this in some
+    /// scenarios. In this case you may wish to force memories to be
+    /// allocated dynamically, meaning that the virtual memory footprint of
+    /// creating a wasm memory should be exactly what's used by the wasm
+    /// itself.
+    ///
+    /// For 32-bit memories a static memory must contain at least 4GB of
+    /// reserved address space plus a guard page to elide any bounds checks
+    /// at all. Smaller static memories will use similar bounds checks as
+    /// dynamic memories.
+    ///
+    /// ## Default
+    ///
+    /// The default value for this property depends on the host platform. For
+    /// 64-bit platforms there's lots of address space available, so the
+    /// default configured here is 4GB. WebAssembly linear memories currently
+    /// max out at 4GB which means that on 64-bit platforms Wasmtime by
+    /// default always uses a static memory. This, coupled with a
+    /// sufficiently sized guard region, should produce the fastest JIT code
+    /// on 64-bit platforms, but does require a large address space
+    /// reservation for each wasm memory.
+    ///
+    /// For 32-bit platforms this value defaults to 1GB. This means that wasm
+    /// memories whose maximum size is less than 1GB will be allocated
+    /// statically, otherwise they'll be considered dynamic.
+    pub fn static_memory_maximum_size(&mut self, max_size: u64) -> &mut Self {
+        let max_pages = max_size / u64::from(wasmtime_environ::WASM_PAGE_SIZE);
+        self.tunables.static_memory_bound = u32::try_from(max_pages).unwrap_or(u32::max_value());
+        self
+    }
+
+    /// Configures the size, in bytes, of the guard region used at the end of
+    /// a static memory's address space reservation.
+    ///
+    /// All WebAssembly loads/stores are bounds-checked and generate a trap
+    /// if they're out-of-bounds. Loads and stores are often very performance
+    /// critical, so we want the bounds check to be as fast as possible!
+    /// Accelerating these memory accesses is the motivation for a guard
+    /// after a memory allocation.
+    ///
+    /// Memories (both static and dynamic) can be configured with a guard at
+    /// the end of them which consists of unmapped virtual memory. This
+    /// unmapped memory will trigger a memory access violation (e.g.
+    /// segfault) if accessed. This allows JIT code to elide bounds checks if
+    /// it can prove that an access, if out of bounds, would hit the guard
+    /// region. This means that having such a guard of unmapped memory can
+    /// remove the need for bounds checks in JIT code.
+    ///
+    /// For the difference between static and dynamic memories, see
+    /// [`Config::static_memory_maximum_size`].
+    ///
+    /// ## How big should the guard be?
+    ///
+    /// In general, like with configuring `static_memory_maximum_size`, you
+    /// probably don't want to change this value from the defaults.
+    /// Otherwise, though, the size of the guard region affects the number of
+    /// bounds checks needed for generated wasm code. More specifically,
+    /// loads/stores with immediate offsets will generate bounds checks based
+    /// on how big the guard page is.
+    ///
+    /// For 32-bit memories a 4GB static memory is required to even start
+    /// removing bounds checks. A 4GB guard size will guarantee that the
+    /// module has zero bounds checks for memory accesses. A 2GB guard size
+    /// will eliminate all bounds checks with an immediate offset less than
+    /// 2GB. A guard size of zero means that all memory accesses will still
+    /// have bounds checks.
+    ///
+    /// ## Default
+    ///
+    /// The default value for this property is 2GB on 64-bit platforms. This
+    /// allows eliminating almost all bounds checks on loads/stores with an
+    /// immediate offset of less than 2GB. On 32-bit platforms this defaults
+    /// to 64KB.
+    ///
+    /// ## Static vs Dynamic Guard Size
+    ///
+    /// Note that for now the static memory guard size must be at least as
+    /// large as the dynamic memory guard size, so configuring this property
+    /// to be smaller than the dynamic memory guard size will have no effect.
+    pub fn static_memory_guard_size(&mut self, guard_size: u64) -> &mut Self {
+        let guard_size = round_up_to_pages(guard_size);
+        let guard_size = cmp::max(guard_size, self.tunables.dynamic_memory_offset_guard_size);
+        self.tunables.static_memory_offset_guard_size = guard_size;
+        self
+    }
+
+    /// Configures the size, in bytes, of the guard region used at the end of
+    /// a dynamic memory's address space reservation.
+    ///
+    /// For the difference between static and dynamic memories, see
+    /// [`Config::static_memory_maximum_size`].
+    ///
+    /// For more information about what a guard is, see the documentation on
+    /// [`Config::static_memory_guard_size`].
+    ///
+    /// Note that the size of the guard region for dynamic memories is not
+    /// super critical for performance. Making it reasonably-sized can
+    /// improve generated code slightly, but for maximum performance you'll
+    /// want to lean towards static memories rather than dynamic anyway.
+    ///
+    /// Also note that the dynamic memory guard size must be smaller than the
+    /// static memory guard size, so if a large dynamic memory guard is
+    /// specified then the static memory guard size will also be
+    /// automatically increased.
+    ///
+    /// ## Default
+    ///
+    /// This value defaults to 64KB.
+    pub fn dynamic_memory_guard_size(&mut self, guard_size: u64) -> &mut Self {
+        let guard_size = round_up_to_pages(guard_size);
+        self.tunables.dynamic_memory_offset_guard_size = guard_size;
+        self.tunables.static_memory_offset_guard_size =
+            cmp::max(guard_size, self.tunables.static_memory_offset_guard_size);
+        self
+    }
 }

+fn round_up_to_pages(val: u64) -> u64 {
+    let page_size = region::page::size() as u64;
+    debug_assert!(page_size.is_power_of_two());
+    val.checked_add(page_size - 1)
+        .map(|val| val & !(page_size - 1))
+        .unwrap_or(u64::max_value() / page_size + 1)
+}
+
 impl Default for Config {
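One consequence of the invariant documented above (the static guard must be at least as large as the dynamic guard) is that the two setters interact; a hedged illustration, not from the commit:

use wasmtime::Config;

fn main() {
    let mut config = Config::new();
    // Requesting a 4GB dynamic guard also raises the static guard to at
    // least 4GB, because `dynamic_memory_guard_size` takes the max of the
    // two values.
    config.dynamic_memory_guard_size(4 << 30);
    // A later, smaller static-guard request is clamped back up to the
    // dynamic guard size rather than violating the invariant.
    config.static_memory_guard_size(0x10000);
}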
@@ -5,7 +5,7 @@ use crate::Store;
 use crate::{Limits, MemoryType};
 use anyhow::Result;
 use wasmtime_environ::entity::PrimaryMap;
-use wasmtime_environ::{wasm, EntityIndex, MemoryPlan, Module, WASM_PAGE_SIZE};
+use wasmtime_environ::{wasm, EntityIndex, MemoryPlan, MemoryStyle, Module, WASM_PAGE_SIZE};
 use wasmtime_runtime::{RuntimeLinearMemory, RuntimeMemoryCreator, VMMemoryDefinition};

 use std::sync::Arc;
@@ -21,9 +21,9 @@ pub fn create_handle_with_memory(
         maximum: memory.limits().max(),
         shared: false, // TODO
     };
-    let tunable = Default::default();

-    let memory_plan = wasmtime_environ::MemoryPlan::for_memory(memory, &tunable);
+    let memory_plan =
+        wasmtime_environ::MemoryPlan::for_memory(memory, &store.engine().config().tunables);
     let memory_id = module.local.memory_plans.push(memory_plan);
     module
         .exports
@@ -67,8 +67,12 @@ pub(crate) struct MemoryCreatorProxy {
 impl RuntimeMemoryCreator for MemoryCreatorProxy {
     fn new_memory(&self, plan: &MemoryPlan) -> Result<Box<dyn RuntimeLinearMemory>, String> {
         let ty = MemoryType::new(Limits::new(plan.memory.minimum, plan.memory.maximum));
+        let reserved_size = match plan.style {
+            MemoryStyle::Static { bound } => Some(bound.into()),
+            MemoryStyle::Dynamic => None,
+        };
         self.mem_creator
-            .new_memory(ty)
+            .new_memory(ty, reserved_size, plan.offset_guard_size)
             .map(|mem| Box::new(LinearMemoryProxy { mem }) as Box<dyn RuntimeLinearMemory>)
     }
 }
@@ -60,6 +60,9 @@ WASMTIME_CONFIG_PROP(wasmtime_error_t*, strategy, wasmtime_strategy_t)
 WASMTIME_CONFIG_PROP(void, cranelift_debug_verifier, bool)
 WASMTIME_CONFIG_PROP(void, cranelift_opt_level, wasmtime_opt_level_t)
 WASMTIME_CONFIG_PROP(wasmtime_error_t*, profiler, wasmtime_profiling_strategy_t)
+WASMTIME_CONFIG_PROP(void, static_memory_maximum_size, uint64_t)
+WASMTIME_CONFIG_PROP(void, static_memory_guard_size, uint64_t)
+WASMTIME_CONFIG_PROP(void, dynamic_memory_guard_size, uint64_t)

 WASM_API_EXTERN wasmtime_error_t* wasmtime_config_cache_config_load(wasm_config_t*, const char*);
@@ -146,3 +146,18 @@ pub unsafe extern "C" fn wasmtime_config_cache_config_load(
         |_cfg| {},
     )
 }
+
+#[no_mangle]
+pub extern "C" fn wasmtime_config_static_memory_maximum_size_set(c: &mut wasm_config_t, size: u64) {
+    c.config.static_memory_maximum_size(size);
+}
+
+#[no_mangle]
+pub extern "C" fn wasmtime_config_static_memory_guard_size_set(c: &mut wasm_config_t, size: u64) {
+    c.config.static_memory_guard_size(size);
+}
+
+#[no_mangle]
+pub extern "C" fn wasmtime_config_dynamic_memory_guard_size_set(c: &mut wasm_config_t, size: u64) {
+    c.config.dynamic_memory_guard_size(size);
+}
@@ -48,7 +48,7 @@ impl Default for Tunables {
             /// Size in bytes of the offset guard for dynamic memories.
             ///
             /// Allocate a small guard to optimize common cases but without
-            /// wasting too much memor.
+            /// wasting too much memory.
             dynamic_memory_offset_guard_size: 0x1_0000,

             debug_info: false,
@@ -105,8 +105,13 @@ pub struct Config {
     debug_verifier: bool,
     debug_info: bool,
     canonicalize_nans: bool,
     spectest: usize,
     interruptable: bool,
+
+    // Note that we use 32-bit values here to avoid blowing the 64-bit address
+    // space by requesting ungodly-large sizes/guards.
+    static_memory_maximum_size: Option<u32>,
+    static_memory_guard_size: Option<u32>,
+    dynamic_memory_guard_size: Option<u32>,
 }

 impl Config {
@@ -114,6 +119,9 @@ impl Config {
     pub fn to_wasmtime(&self) -> wasmtime::Config {
         let mut cfg = wasmtime::Config::new();
         cfg.debug_info(self.debug_info)
+            .static_memory_maximum_size(self.static_memory_maximum_size.unwrap_or(0).into())
+            .static_memory_guard_size(self.static_memory_guard_size.unwrap_or(0).into())
+            .dynamic_memory_guard_size(self.dynamic_memory_guard_size.unwrap_or(0).into())
             .cranelift_nan_canonicalization(self.canonicalize_nans)
             .cranelift_debug_verifier(self.debug_verifier)
             .cranelift_opt_level(self.opt_level.to_wasmtime())
src/lib.rs
@@ -152,6 +152,19 @@ struct CommonOptions {
         default_value = "2",
     )]
     opt_level: wasmtime::OptLevel,
+
+    /// Maximum size in bytes of wasm memory before it becomes dynamically
+    /// relocatable instead of up-front-reserved.
+    #[structopt(long)]
+    static_memory_maximum_size: Option<u64>,
+
+    /// Byte size of the guard region after static memories are allocated.
+    #[structopt(long)]
+    static_memory_guard_size: Option<u64>,
+
+    /// Byte size of the guard region after dynamic memories are allocated.
+    #[structopt(long)]
+    dynamic_memory_guard_size: Option<u64>,
 }

 impl CommonOptions {
@@ -178,6 +191,15 @@ impl CommonOptions {
                 }
             }
         }
+        if let Some(max) = self.static_memory_maximum_size {
+            config.static_memory_maximum_size(max);
+        }
+        if let Some(size) = self.static_memory_guard_size {
+            config.static_memory_guard_size(size);
+        }
+        if let Some(size) = self.dynamic_memory_guard_size {
+            config.dynamic_memory_guard_size(size);
+        }
         Ok(config)
     }
@@ -108,7 +108,14 @@ mod not_for_windows {
     }

     unsafe impl MemoryCreator for CustomMemoryCreator {
-        fn new_memory(&self, ty: MemoryType) -> Result<Box<dyn LinearMemory>, String> {
+        fn new_memory(
+            &self,
+            ty: MemoryType,
+            reserved_size: Option<u64>,
+            guard_size: u64,
+        ) -> Result<Box<dyn LinearMemory>, String> {
+            assert_eq!(guard_size, 0);
+            assert!(reserved_size.is_none());
             let max = ty.limits().max().unwrap_or(WASM_MAX_PAGES);
             unsafe {
                 let mem = Box::new(CustomMemory::new(
@@ -122,14 +129,19 @@ mod not_for_windows {
         }
     }

+    fn config() -> (Store, Arc<CustomMemoryCreator>) {
+        let mem_creator = Arc::new(CustomMemoryCreator::new());
+        let mut config = Config::new();
+        config
+            .with_host_memory(mem_creator.clone())
+            .static_memory_maximum_size(0)
+            .dynamic_memory_guard_size(0);
+        (Store::new(&Engine::new(&config)), mem_creator)
+    }
+
     #[test]
     fn host_memory() -> anyhow::Result<()> {
-        let mem_creator = Arc::new(CustomMemoryCreator::new());
-        let mut config = Config::default();
-        config.with_host_memory(mem_creator.clone());
-        let engine = Engine::new(&config);
-        let store = Store::new(&engine);
-
+        let (store, mem_creator) = config();
         let module = Module::new(
             &store,
             r#"
@@ -147,12 +159,7 @@ mod not_for_windows {

     #[test]
     fn host_memory_grow() -> anyhow::Result<()> {
-        let mem_creator = Arc::new(CustomMemoryCreator::new());
-        let mut config = Config::default();
-        config.with_host_memory(mem_creator.clone());
-        let engine = Engine::new(&config);
-        let store = Store::new(&engine);
-
+        let (store, mem_creator) = config();
         let module = Module::new(
             &store,
             r#"