Provide optimised codegen for small libc calls

2018-10-18 15:07:08 +01:00
parent 07c98f5330
commit e999c53eaa
1 changed files with 161 additions and 7 deletions
--- a/lib/frontend/src/frontend.rs
+++ b/lib/frontend/src/frontend.rs
@@ -6,7 +6,8 @@ use cranelift_codegen::ir::function::DisplayFunction;
 use cranelift_codegen::ir::{
    types, AbiParam, DataFlowGraph, Ebb, ExtFuncData, ExternalName, FuncRef, Function, GlobalValue,
    GlobalValueData, Heap, HeapData, Inst, InstBuilder, InstBuilderBase, InstructionData,
-    JumpTable, JumpTableData, LibCall, SigRef, Signature, StackSlot, StackSlotData, Type, Value,
+    JumpTable, JumpTableData, LibCall, MemFlags, SigRef, Signature, StackSlot, StackSlotData, Type,
+    Value,
 };
 use cranelift_codegen::isa::TargetIsa;
 use cranelift_codegen::packed_option::PackedOption;
@@ -567,10 +568,51 @@ impl<'a> FunctionBuilder<'a> {
        self.ins().call(libc_memcpy, &[dest, src, size]);
    }

+    /// Optimised memcpy for small copys.
+    pub fn emit_small_memcpy(
+        &mut self,
+        isa: &TargetIsa,
+        dest: Value,
+        src: Value,
+        size: u64,
+        dest_align: u8,
+        src_align: u8,
+    ) {
+        // Currently the result of guess work, not actual profiling.
+        const THRESHOLD: u64 = 4;
+
+        let access_size = greatest_divisible_power_of_two(size);
+        assert!(
+            access_size.is_power_of_two(),
+            "`size` is not a power of two"
+        );
+        assert!(
+            access_size >= ::std::cmp::min(src_align, dest_align) as u64,
+            "`size` is smaller than `dest` and `src`'s alignment value."
+        );
+        let load_and_store_amount = size / access_size;
+
+        if load_and_store_amount > THRESHOLD {
+            let size_value = self.ins().iconst(isa.pointer_type(), size as i64);
+            self.call_memcpy(isa, dest, src, size_value);
+            return;
+        }
+
+        let mut flags = MemFlags::new();
+        flags.set_aligned();
+
+        for i in 0..load_and_store_amount {
+            let offset = (access_size * i) as i32;
+            let int_type = Type::int(access_size as u16).unwrap();
+            let value = self.ins().load(int_type, flags, src, offset);
+            self.ins().store(flags, value, dest, offset);
+        }
+    }
+
    /// Calls libc.memset
    ///
-    /// Writes `len` bytes of value `ch` to memory starting at `buffer`.
-    pub fn call_memset(&mut self, isa: &TargetIsa, buffer: Value, ch: Value, len: Value) {
+    /// Writes `size` bytes of value `ch` to memory starting at `buffer`.
+    pub fn call_memset(&mut self, isa: &TargetIsa, buffer: Value, ch: Value, size: Value) {
        let pointer_type = isa.pointer_type();
        let signature = {
            let mut s = Signature::new(isa.flags().call_conv());
@@ -587,14 +629,68 @@ impl<'a> FunctionBuilder<'a> {
        });

        let ch = self.ins().uextend(types::I32, ch);
-        self.ins().call(libc_memset, &[buffer, ch, len]);
+        self.ins().call(libc_memset, &[buffer, ch, size]);
+    }
+
+    /// Calls libc.memset
+    ///
+    /// Writes `size` bytes of value `ch` to memory starting at `buffer`.
+    pub fn emit_small_memset(
+        &mut self,
+        isa: &TargetIsa,
+        buffer: Value,
+        ch: u32,
+        size: u64,
+        buffer_align: u8,
+    ) {
+        // Currently the result of guess work, not actual profiling.
+        const THRESHOLD: u64 = 4;
+
+        let access_size = greatest_divisible_power_of_two(size);
+        assert!(
+            access_size.is_power_of_two(),
+            "`size` is not a power of two"
+        );
+        assert!(
+            access_size >= buffer_align as u64,
+            "`size` is smaller than `dest` and `src`'s alignment value."
+        );
+        let load_and_store_amount = size / access_size;
+
+        if load_and_store_amount > THRESHOLD {
+            let ch = self.ins().iconst(types::I32, ch as i64);
+            let size = self.ins().iconst(isa.pointer_type(), size as i64);
+            self.call_memset(isa, buffer, ch, size);
+        } else {
+            let mut flags = MemFlags::new();
+            flags.set_aligned();
+
+            let ch = ch as u64;
+            let int_type = Type::int(access_size as u16).unwrap();
+            let raw_value = if int_type == types::I64 {
+                (ch << 32) | (ch << 16) | (ch << 8) | ch
+            } else if int_type == types::I32 {
+                (ch << 16) | (ch << 8) | ch
+            } else if int_type == types::I16 {
+                (ch << 8) | ch
+            } else {
+                assert_eq!(int_type, types::I8);
+                ch
+            };
+
+            let value = self.ins().iconst(int_type, raw_value as i64);
+            for i in 0..load_and_store_amount {
+                let offset = (access_size * i) as i32;
+                self.ins().store(flags, value, buffer, offset);
+            }
+        }
    }

    /// Calls libc.memmove
    ///
-    /// Copies `len` bytes from memory starting at `source` to memory starting
+    /// Copies `size` bytes from memory starting at `source` to memory starting
    /// at `dest`. `source` is always read before writing to `dest`.
-    pub fn call_memmove(&mut self, isa: &TargetIsa, dest: Value, source: Value, num: Value) {
+    pub fn call_memmove(&mut self, isa: &TargetIsa, dest: Value, source: Value, size: Value) {
        let pointer_type = isa.pointer_type();
        let signature = {
            let mut s = Signature::new(isa.flags().call_conv());
@@ -610,8 +706,57 @@ impl<'a> FunctionBuilder<'a> {
            colocated: false,
        });

-        self.ins().call(libc_memmove, &[dest, source, num]);
+        self.ins().call(libc_memmove, &[dest, source, size]);
    }
+
+    /// Optimised memmove for small moves.
+    pub fn emit_small_memmove(
+        &mut self,
+        isa: &TargetIsa,
+        dest: Value,
+        src: Value,
+        size: u64,
+        dest_align: u8,
+        src_align: u8,
+    ) {
+        // Currently the result of guess work, not actual profiling.
+        const THRESHOLD: u64 = 4;
+
+        let access_size = greatest_divisible_power_of_two(size);
+        assert!(
+            access_size.is_power_of_two(),
+            "`size` is not a power of two"
+        );
+        assert!(
+            access_size >= ::std::cmp::min(src_align, dest_align) as u64,
+            "`size` is smaller than `dest` and `src`'s alignment value."
+        );
+        let load_and_store_amount = size / access_size;
+
+        if load_and_store_amount > THRESHOLD {
+            let size_value = self.ins().iconst(isa.pointer_type(), size as i64);
+            self.call_memmove(isa, dest, src, size_value);
+            return;
+        }
+
+        let mut flags = MemFlags::new();
+        flags.set_aligned();
+
+        // Load all of the memory first in case `dest` overlaps.
+        let registers: Vec<_> = (0..load_and_store_amount)
+            .map(|i| {
+                let offset = (access_size * i) as i32;
+                (self.ins().load(types::I8, flags, src, offset), offset)
+            }).collect();
+
+        for (value, offset) in registers {
+            self.ins().store(flags, value, dest, offset);
+        }
+    }
+}
+
+fn greatest_divisible_power_of_two(size: u64) -> u64 {
+    (size as i64 & -(size as i64)) as u64
 }

 // Helper functions
@@ -648,6 +793,7 @@ impl<'a> FunctionBuilder<'a> {

 #[cfg(test)]
 mod tests {
+    use super::greatest_divisible_power_of_two;
    use cranelift_codegen::entity::EntityRef;
    use cranelift_codegen::ir::types::*;
    use cranelift_codegen::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature};
@@ -821,4 +967,12 @@ ebb0:
 "
        );
    }
+
+    #[test]
+    fn test_greatest_divisible_power_of_two() {
+        assert_eq!(64, greatest_divisible_power_of_two(64));
+        assert_eq!(16, greatest_divisible_power_of_two(48));
+        assert_eq!(8, greatest_divisible_power_of_two(24));
+        assert_eq!(1, greatest_divisible_power_of_two(25));
+    }
 }