Cranelift: Deduplicate ABI signatures during lowering (#4829)

* Cranelift: Deduplicate ABI signatures during lowering

This commit creates the `SigSet` type, which interns and deduplicates the ABI
signatures that we create from `ir::Signature`s. The ABI signatures are now
referred to indirectly via a `Sig` (which is a `cranelift_entity` ID), and we
pass around a `SigSet` to anything that needs to access the actual underlying
`SigData` (which is what `ABISig` used to be).
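
As a rough sketch of the idea (simplified stand-in types, not the actual Cranelift definitions), an interning set like this hands out small, copyable IDs and collapses structurally equal signatures onto one entry:

```rust
use std::collections::HashMap;

/// Simplified stand-in for `SigData` (the lowered ABI signature).
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
struct SigData {
    param_bytes: u32,
    ret_bytes: u32,
}

/// Simplified stand-in for a `cranelift_entity`-style ID like `Sig`.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct Sig(u32);

/// Interns `SigData`s so that identical signatures share a single `Sig`.
#[derive(Default)]
struct SigSet {
    sigs: Vec<SigData>,
    dedup: HashMap<SigData, Sig>,
}

impl SigSet {
    /// Return the existing `Sig` for `data`, or intern it and mint a new one.
    fn intern(&mut self, data: SigData) -> Sig {
        if let Some(&sig) = self.dedup.get(&data) {
            return sig;
        }
        let sig = Sig(self.sigs.len() as u32);
        self.sigs.push(data.clone());
        self.dedup.insert(data, sig);
        sig
    }

    /// Everything that needs the underlying data goes through the set.
    fn get(&self, sig: Sig) -> &SigData {
        &self.sigs[sig.0 as usize]
    }
}

fn main() {
    let mut sigs = SigSet::default();
    let a = sigs.intern(SigData { param_bytes: 16, ret_bytes: 8 });
    let b = sigs.intern(SigData { param_bytes: 16, ret_bytes: 8 });
    assert_eq!(a, b); // duplicate signatures collapse onto one ID
    println!("{:?} -> {:?}", a, sigs.get(a));
}
```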

I had to change a couple of methods to return a `SmallInstVec` instead of
emitting instructions directly, to work around shared and exclusive borrows of
the lowering context that would otherwise overlap. I don't expect any of these
to heap allocate in practice.
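
To illustrate the borrow conflict (a minimal sketch with made-up types, not the real lowering context): once an ABI method also needs a shared borrow of the signature data held in the context, it cannot simultaneously take the exclusive borrow that emitting requires, so it returns the instructions and lets the caller emit them afterwards:

```rust
/// Simplified stand-ins, not the real Cranelift types. The real methods
/// return a `SmallInstVec` (a `SmallVec`), so these short sequences
/// normally stay on the stack rather than heap allocating.
#[derive(Debug)]
struct Inst(u32);

struct SigSet {
    num_params: u32,
}

struct Abi;

impl Abi {
    /// Returns the instructions instead of emitting them, so the caller can
    /// drop the shared borrows of `sigs` (and `self`) before taking the
    /// exclusive borrow that emission needs.
    fn gen_copy_arg_to_regs(&self, sigs: &SigSet, i: u32) -> Vec<Inst> {
        assert!(i < sigs.num_params);
        vec![Inst(i)]
    }
}

struct LowerCtx {
    sigs: SigSet,
    abi: Abi,
    buffer: Vec<Inst>,
}

impl LowerCtx {
    fn emit(&mut self, inst: Inst) {
        self.buffer.push(inst);
    }

    fn lower_args(&mut self) {
        for i in 0..self.sigs.num_params {
            // The shared borrows of `self.abi` and `self.sigs` end here...
            let insts = self.abi.gen_copy_arg_to_regs(&self.sigs, i);
            // ...so the exclusive borrow taken by `emit` never overlaps them.
            for inst in insts {
                self.emit(inst);
            }
        }
    }
}

fn main() {
    let mut ctx = LowerCtx {
        sigs: SigSet { num_params: 2 },
        abi: Abi,
        buffer: Vec::new(),
    };
    ctx.lower_args();
    println!("{:?}", ctx.buffer);
}
```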

This does not remove the often-unnecessary allocations caused by
`ensure_struct_return_ptr_is_returned`. That is left for follow-up work.

This also opens the door for further shuffling of signature data into more
efficient representations in the future, now that we have `SigSet` to store it
all in one place and it is threaded through all the code. We could potentially
move each signature's parameter and return vectors into one big vector shared
between all signatures, for example, which could cut down on allocations and
shrink the size of `SigData` since those `SmallVec`s have pretty large inline
capacity.
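
One possible shape for that follow-up, purely as an illustration (the field and type names below are invented, not Cranelift's): each `SigData` stores ranges into shared vectors owned by the `SigSet` instead of its own `SmallVec`s:

```rust
use std::ops::Range;

/// Stand-in for a lowered parameter or return slot.
#[derive(Clone, Debug)]
struct AbiParam {
    offset: u32,
}

/// Instead of owning its own `SmallVec`s, each signature records ranges into
/// the big vectors shared by every signature in the set, shrinking `SigData`
/// down to a handful of indices.
#[derive(Clone, Debug)]
struct SigData {
    params: Range<u32>,
    returns: Range<u32>,
}

#[derive(Default)]
struct SigSet {
    all_params: Vec<AbiParam>,
    all_returns: Vec<AbiParam>,
    sigs: Vec<SigData>,
}

impl SigSet {
    fn push_sig(&mut self, params: Vec<AbiParam>, returns: Vec<AbiParam>) -> usize {
        let p_start = self.all_params.len() as u32;
        self.all_params.extend(params);
        let r_start = self.all_returns.len() as u32;
        self.all_returns.extend(returns);
        self.sigs.push(SigData {
            params: p_start..self.all_params.len() as u32,
            returns: r_start..self.all_returns.len() as u32,
        });
        self.sigs.len() - 1
    }

    fn params(&self, sig: usize) -> &[AbiParam] {
        let r = &self.sigs[sig].params;
        &self.all_params[r.start as usize..r.end as usize]
    }
}

fn main() {
    let mut sigs = SigSet::default();
    let s = sigs.push_sig(
        vec![AbiParam { offset: 0 }, AbiParam { offset: 8 }],
        vec![AbiParam { offset: 0 }],
    );
    println!("{:?}", sigs.params(s));
}
```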

Overall, this refactoring gives a 1-7% speedup for compilation on
`pulldown-cmark`:

```
compilation :: cycles :: benchmarks/pulldown-cmark/benchmark.wasm

  Δ = 8754213.66 ± 7526266.23 (confidence = 99%)

  dedupe.so is 1.01x to 1.07x faster than main.so!

  [191003295 234620642.20 280597986] dedupe.so
  [197626699 243374855.86 321816763] main.so

compilation :: cycles :: benchmarks/bz2/benchmark.wasm

  No difference in performance.

  [170406200 194299792.68 253001201] dedupe.so
  [172071888 193230743.11 223608329] main.so

compilation :: cycles :: benchmarks/spidermonkey/benchmark.wasm

  No difference in performance.

  [3870997347 4437735062.59 5216007266] dedupe.so
  [4019924063 4424595349.24 4965088931] main.so
```

* Use full path instead of import to avoid warnings in some build configurations

Those warnings would then cause CI to fail.
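
For example (an illustrative sketch, not the actual code touched by this commit), when an item is only referenced under some `cfg`, naming it by its full path avoids the `unused_imports` warning in the configurations that never use it:

```rust
mod foo {
    pub fn helper() -> u32 {
        42
    }
}

// Suppose this function only exists on some targets. A module-level
// `use foo::helper;` would be flagged as an unused import in every other
// configuration, and warnings fail CI; spelling out the full path at the
// call site keeps all configurations warning-free.
#[cfg(unix)]
fn platform_specific() -> u32 {
    crate::foo::helper()
}

fn main() {
    #[cfg(unix)]
    println!("{}", platform_specific());
    // Keep `helper` referenced on every target so this example itself
    // builds cleanly everywhere.
    println!("{}", foo::helper());
}
```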

* Move `SigSet` into `VCode`
Nick Fitzgerald
2022-08-31 13:39:32 -07:00
committed by GitHub
parent 62c5af68b5
commit f18a1f1488
15 changed files with 455 additions and 196 deletions


@@ -10,14 +10,13 @@ use crate::fx::{FxHashMap, FxHashSet};
 use crate::inst_predicates::{has_lowering_side_effect, is_constant_64bit};
 use crate::ir::{
     types::{FFLAGS, IFLAGS},
-    ArgumentPurpose, Block, Constant, ConstantData, DataFlowGraph, Function, GlobalValue,
-    GlobalValueData, Immediate, Inst, InstructionData, MemFlags, Opcode, Type, Value, ValueDef,
-    ValueLabelAssignments, ValueLabelStart,
+    ArgumentPurpose, Block, Constant, ConstantData, DataFlowGraph, ExternalName, Function,
+    GlobalValue, GlobalValueData, Immediate, Inst, InstructionData, MemFlags, Opcode, RelSourceLoc,
+    Type, Value, ValueDef, ValueLabelAssignments, ValueLabelStart,
 };
-use crate::ir::{ExternalName, RelSourceLoc};
 use crate::machinst::{
     non_writable_value_regs, writable_value_regs, BlockIndex, BlockLoweringOrder, Callee,
-    LoweredBlock, MachLabel, Reg, VCode, VCodeBuilder, VCodeConstant, VCodeConstantData,
+    LoweredBlock, MachLabel, Reg, SigSet, VCode, VCodeBuilder, VCodeConstant, VCodeConstantData,
     VCodeConstants, VCodeInst, ValueRegs, Writable,
 };
 use crate::{trace, CodegenResult};
@@ -345,9 +344,11 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
         abi: Callee<I::ABIMachineSpec>,
         emit_info: I::Info,
         block_order: BlockLoweringOrder,
+        sigs: SigSet,
     ) -> CodegenResult<Lower<'func, I>> {
         let constants = VCodeConstants::with_capacity(f.dfg.constants.len());
         let mut vcode = VCodeBuilder::new(
+            sigs,
             abi,
             emit_info,
             block_order,
@@ -445,6 +446,14 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
         })
     }

+    pub fn sigs(&self) -> &SigSet {
+        self.vcode.sigs()
+    }
+
+    pub fn sigs_mut(&mut self) -> &mut SigSet {
+        self.vcode.sigs_mut()
+    }
+
     /// Pre-analysis: compute `value_ir_uses`. See comment on
     /// `ValueUseState` for a description of what this analysis
     /// computes.
@@ -571,7 +580,12 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
                 continue;
             }
             let regs = writable_value_regs(self.value_regs[*param]);
-            for insn in self.vcode.abi().gen_copy_arg_to_regs(i, regs).into_iter() {
+            for insn in self
+                .vcode
+                .abi()
+                .gen_copy_arg_to_regs(self.sigs(), i, regs)
+                .into_iter()
+            {
                 self.emit(insn);
             }
             if self.abi().signature().params[i].purpose == ArgumentPurpose::StructReturn {
@@ -593,7 +607,7 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
                 ));
             }
         }
-        if let Some(insn) = self.vcode.abi().gen_retval_area_setup() {
+        if let Some(insn) = self.vcode.abi().gen_retval_area_setup(self.sigs()) {
            self.emit(insn);
         }
     }
@@ -606,13 +620,13 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
             for insn in self
                 .vcode
                 .abi()
-                .gen_copy_regs_to_retval(i, regs)
+                .gen_copy_regs_to_retval(self.sigs(), i, regs)
                 .into_iter()
             {
                 self.emit(insn);
             }
         }
-        let inst = self.vcode.abi().gen_ret();
+        let inst = self.vcode.abi().gen_ret(self.sigs());
         self.emit(inst);

         // Hack: generate a virtual instruction that uses vmctx in
@@ -906,11 +920,11 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
         let temps = self
             .vcode
             .abi()
-            .temps_needed()
+            .temps_needed(self.sigs())
             .into_iter()
             .map(|temp_ty| self.alloc_tmp(temp_ty).only_reg().unwrap())
             .collect::<Vec<_>>();
-        self.vcode.abi().init(temps);
+        self.vcode.init_abi(temps);

         // Get the pinned reg here (we only parameterize this function on `B`,
         // not the whole `Lower` impl).
@@ -1006,10 +1020,15 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
     }

     /// Get the `Callee`.
-    pub fn abi(&mut self) -> &mut Callee<I::ABIMachineSpec> {
+    pub fn abi(&self) -> &Callee<I::ABIMachineSpec> {
         self.vcode.abi()
     }

+    /// Get the `Callee`.
+    pub fn abi_mut(&mut self) -> &mut Callee<I::ABIMachineSpec> {
+        self.vcode.abi_mut()
+    }
+
     /// Get the (virtual) register that receives the return value. A return
     /// instruction should lower into a sequence that fills this register. (Why
     /// not allow the backend to specify its own result register for the return?