From ab1cf3df2da2544fb55ada5541f69efb4ca613bd Mon Sep 17 00:00:00 2001
From: Nick Fitzgerald
Date: Tue, 2 Aug 2022 15:53:44 -0700
Subject: [PATCH] Use a `SmallVec` for `ABIArg`s (#4584)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of a regular `Vec`.

These vectors are usually very small; for example, here is the histogram of
sizes when running Sightglass's `pulldown-cmark` benchmark:

```
;; Number of samples = 10332
;; Min = 0
;; Max = 11
;;
;; Mean = 2.496128532713901
;; Standard deviation = 2.2859559855427243
;; Variance = 5.225594767838607
;;
;; Each ∎ is a count of 62
;;
0 .. 1 [ 3134 ]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
1 .. 2 [ 2032 ]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
2 .. 3 [ 159 ]: ∎∎
3 .. 4 [ 838 ]: ∎∎∎∎∎∎∎∎∎∎∎∎∎
4 .. 5 [ 970 ]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
5 .. 6 [ 2566 ]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
6 .. 7 [ 303 ]: ∎∎∎∎
7 .. 8 [ 272 ]: ∎∎∎∎
8 .. 9 [ 40 ]:
9 .. 10 [ 18 ]:
```

By using a `SmallVec` with an inline capacity of 6 we avoid the vast majority
of heap allocations and get some nice benchmark wins of up to ~1.11x faster
compilation.

Sightglass Benchmark Results

```
compilation :: nanoseconds :: benchmarks/spidermonkey/benchmark.wasm

  Δ = 340361395.90 ± 63384608.15 (confidence = 99%)

  main.so is 0.88x to 0.92x faster than smallvec.so!
  smallvec.so is 1.09x to 1.13x faster than main.so!

  [3101467423 3425524333.41 4060621653] main.so
  [2820915877 3085162937.51 3375167352] smallvec.so

compilation :: cycles :: benchmarks/spidermonkey/benchmark.wasm

  Δ = 988446098.59 ± 184075718.89 (confidence = 99%)

  main.so is 0.88x to 0.92x faster than smallvec.so!
  smallvec.so is 1.09x to 1.13x faster than main.so!

  [9006994951 9948091070.66 11792481990] main.so
  [8192243090 8959644972.07 9801848982] smallvec.so

compilation :: nanoseconds :: benchmarks/bz2/benchmark.wasm

  Δ = 7854567.87 ± 2215491.16 (confidence = 99%)

  main.so is 0.89x to 0.94x faster than smallvec.so!
  smallvec.so is 1.07x to 1.12x faster than main.so!

  [80354527 93864666.76 119789198] main.so
  [77554917 86010098.89 94726994] smallvec.so

compilation :: cycles :: benchmarks/bz2/benchmark.wasm

  Δ = 22810509.85 ± 6434024.63 (confidence = 99%)

  main.so is 0.89x to 0.94x faster than smallvec.so!
  smallvec.so is 1.07x to 1.12x faster than main.so!

  [233358190 272593088.57 347880715] main.so
  [225227821 249782578.72 275097380] smallvec.so

compilation :: nanoseconds :: benchmarks/pulldown-cmark/benchmark.wasm

  Δ = 10849521.41 ± 4324757.85 (confidence = 99%)

  main.so is 0.90x to 0.96x faster than smallvec.so!
  smallvec.so is 1.04x to 1.10x faster than main.so!

  [133875427 156859544.47 222455440] main.so
  [126073854 146010023.06 181611647] smallvec.so

compilation :: cycles :: benchmarks/pulldown-cmark/benchmark.wasm

  Δ = 31508176.97 ± 12559561.91 (confidence = 99%)

  main.so is 0.90x to 0.96x faster than smallvec.so!
  smallvec.so is 1.04x to 1.10x faster than main.so!

[388788638 455536988.31 646034523] main.so [366132033 424028811.34 527419755] smallvec.so ``` --- cranelift/codegen/src/isa/aarch64/abi.rs | 4 ++-- cranelift/codegen/src/isa/s390x/abi.rs | 4 ++-- cranelift/codegen/src/isa/x64/abi.rs | 4 ++-- cranelift/codegen/src/machinst/abi_impl.rs | 9 ++++++--- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs index 0fcc181aa0..e57866c307 100644 --- a/cranelift/codegen/src/isa/aarch64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -85,7 +85,7 @@ impl ABIMachineSpec for AArch64MachineDeps { params: &[ir::AbiParam], args_or_rets: ArgsOrRets, add_ret_area_ptr: bool, - ) -> CodegenResult<(Vec, i64, Option)> { + ) -> CodegenResult<(ABIArgVec, i64, Option)> { let is_apple_cc = call_conv.extends_apple_aarch64(); // See AArch64 ABI (https://github.com/ARM-software/abi-aa/blob/2021Q1/aapcs64/aapcs64.rst#64parameter-passing), sections 6.4. @@ -105,7 +105,7 @@ impl ABIMachineSpec for AArch64MachineDeps { let mut next_xreg = 0; let mut next_vreg = 0; let mut next_stack: u64 = 0; - let mut ret = vec![]; + let mut ret = ABIArgVec::new(); let (max_per_class_reg_vals, mut remaining_reg_vals) = match args_or_rets { ArgsOrRets::Args => (8, 16), // x0-x7 and v0-v7 diff --git a/cranelift/codegen/src/isa/s390x/abi.rs b/cranelift/codegen/src/isa/s390x/abi.rs index de95a733f2..75ee9557b6 100644 --- a/cranelift/codegen/src/isa/s390x/abi.rs +++ b/cranelift/codegen/src/isa/s390x/abi.rs @@ -224,12 +224,12 @@ impl ABIMachineSpec for S390xMachineDeps { params: &[ir::AbiParam], args_or_rets: ArgsOrRets, add_ret_area_ptr: bool, - ) -> CodegenResult<(Vec, i64, Option)> { + ) -> CodegenResult<(ABIArgVec, i64, Option)> { let mut next_gpr = 0; let mut next_fpr = 0; let mut next_vr = 0; let mut next_stack: u64 = 0; - let mut ret = vec![]; + let mut ret = ABIArgVec::new(); if args_or_rets == ArgsOrRets::Args { next_stack = REG_SAVE_AREA_SIZE as u64; diff --git 
a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs index 616ea12b45..9202e2c82f 100644 --- a/cranelift/codegen/src/isa/x64/abi.rs +++ b/cranelift/codegen/src/isa/x64/abi.rs @@ -47,14 +47,14 @@ impl ABIMachineSpec for X64ABIMachineSpec { params: &[ir::AbiParam], args_or_rets: ArgsOrRets, add_ret_area_ptr: bool, - ) -> CodegenResult<(Vec, i64, Option)> { + ) -> CodegenResult<(ABIArgVec, i64, Option)> { let is_fastcall = call_conv.extends_windows_fastcall(); let mut next_gpr = 0; let mut next_vreg = 0; let mut next_stack: u64 = 0; let mut next_param_idx = 0; // Fastcall cares about overall param index - let mut ret = vec![]; + let mut ret = ABIArgVec::new(); if args_or_rets == ArgsOrRets::Args && is_fastcall { // Fastcall always reserves 32 bytes of shadow space corresponding to diff --git a/cranelift/codegen/src/machinst/abi_impl.rs b/cranelift/codegen/src/machinst/abi_impl.rs index 203efe78cd..dd2e860386 100644 --- a/cranelift/codegen/src/machinst/abi_impl.rs +++ b/cranelift/codegen/src/machinst/abi_impl.rs @@ -310,7 +310,7 @@ pub trait ABIMachineSpec { params: &[ir::AbiParam], args_or_rets: ArgsOrRets, add_ret_area_ptr: bool, - ) -> CodegenResult<(Vec, i64, Option)>; + ) -> CodegenResult<(ABIArgVec, i64, Option)>; /// Returns the offset from FP to the argument area, i.e., jumping over the saved FP, return /// address, and maybe other standard elements depending on ABI (e.g. Wasm TLS reg). @@ -499,15 +499,18 @@ pub trait ABIMachineSpec { ) -> ir::ArgumentExtension; } +// A vector of `ABIArg`s with inline capacity, since they are typically small. +pub type ABIArgVec = SmallVec<[ABIArg; 6]>; + /// ABI information shared between body (callee) and caller. #[derive(Clone)] pub struct ABISig { /// Argument locations (regs or stack slots). Stack offsets are relative to /// SP on entry to function. - args: Vec, + args: ABIArgVec, /// Return-value locations. Stack offsets are relative to the return-area /// pointer. 
- rets: Vec, + rets: ABIArgVec, /// Space on stack used to store arguments. sized_stack_arg_space: i64, /// Space on stack used to store return values.