x64: Enable load-coalescing for SSE/AVX instructions (#5841)
* x64: Enable load-coalescing for SSE/AVX instructions

  This commit unlocks the ability to fold loads into operands of SSE and AVX
  instructions. This is beneficial both for function size, when it happens, and
  for reducing register pressure. Previously this was not done because most SSE
  instructions require memory to be aligned. AVX instructions, however, do not
  have alignment requirements.

  The solution implemented here is one recommended by Chris, which is to add a
  new `XmmMemAligned` newtype wrapper around `XmmMem`. All SSE instructions are
  now annotated as requiring an `XmmMemAligned` operand, except for a few new
  instruction styles used specifically for instructions that don't require
  alignment (e.g. `movdqu`, `*sd`, and `*ss` instructions). All existing
  instruction helpers continue to take `XmmMem`, however. This way, if an AVX
  lowering is chosen, it can be used as-is. If an SSE lowering is chosen,
  however, then an automatic conversion from `XmmMem` to `XmmMemAligned` kicks
  in. This automatic conversion only fails for unaligned addresses, in which
  case a load instruction is emitted and the operand becomes a temporary
  register instead. A number of prior `Xmm` arguments have now been converted
  to `XmmMem` as well.

  One change from this commit is that loading an unaligned operand for an SSE
  instruction previously used the "correct type" of load, e.g. `movups` for
  f32x4 or `movupd` for f64x2, but now the loading happens in a context without
  type information, so the `movdqu` instruction is generated. According to
  [this stack overflow question][question] it looks like modern processors
  won't penalize this "wrong" choice of type when the operand is then used for
  f32- or f64-oriented instructions.

  Finally, this commit improves the reuse of logic in the `put_in_*_mem*`
  helpers to share code with `sinkable_load` and avoid duplication. With this
  in place, various ISLE rules have been updated as well. In the tests it can
  be seen that AVX instructions are now automatically load-coalesced and use
  memory operands in a few cases.

  [question]: https://stackoverflow.com/questions/40854819/is-there-any-situation-where-using-movdqu-and-movupd-is-better-than-movups

* Fix tests

* Fix move-and-extend to be unaligned

  These don't have the alignment requirements that other xmm instructions have.
  Additionally, add some ISA tests to ensure that their output is tested.

* Review comments
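For readers unfamiliar with the newtype approach described above, the sketch below illustrates the idea in isolation: construction of the aligned wrapper is fallible, so an operand only becomes `XmmMemAligned` when alignment can be proven, and callers fall back to an explicit unaligned load otherwise. The constructor shape and the `addr_is_known_aligned` predicate are illustrative assumptions, not the actual implementation; the real conversion helpers appear in the diff below.

    // Illustrative sketch only; the real `XmmMemAligned` lives in the x64
    // backend and its alignment reasoning differs in detail.
    pub struct XmmMemAligned(RegMem);

    impl XmmMemAligned {
        /// Fallible constructor: registers always qualify, memory operands
        /// qualify only when the address is known to be suitably aligned.
        pub fn new(rm: RegMem) -> Option<XmmMemAligned> {
            match &rm {
                RegMem::Reg { .. } => Some(XmmMemAligned(rm)),
                // `addr_is_known_aligned` is a hypothetical stand-in for
                // whatever alignment check the backend actually performs.
                RegMem::Mem { addr } if addr_is_known_aligned(addr) => Some(XmmMemAligned(rm)),
                _ => None,
            }
        }
    }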
@@ -100,23 +100,9 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
            if let Some(imm) = to_simm32(c as i64) {
                return imm.to_reg_mem_imm();
            }

            // A load from the constant pool is better than a
            // rematerialization into a register, because it reduces
            // register pressure.
            let vcode_constant = self.emit_u64_le_const(c);
            return RegMemImm::mem(SyntheticAmode::ConstantOffset(vcode_constant));
        }

        if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst {
            if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, src_insn) {
                self.lower_ctx.sink_inst(src_insn);
                let amode = lower_to_amode(self.lower_ctx, addr_input, offset);
                return RegMemImm::mem(amode);
            }
        }

        RegMemImm::reg(self.put_in_reg(val))
        self.put_in_reg_mem(val).into()
    }

    fn put_in_xmm_mem_imm(&mut self, val: Value) -> XmmMemImm {
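The `.into()` in the new final line of this hunk relies on a `RegMem` value widening into a `RegMemImm`. A hedged sketch of that conversion is shown here for orientation; the helper name is hypothetical, and only the `RegMemImm::reg`/`RegMemImm::mem` constructors are taken from the surrounding code.

    // Hypothetical standalone version of the RegMem -> RegMemImm widening that
    // `self.put_in_reg_mem(val).into()` performs; the real conversion is a
    // `From`/`Into` impl in the backend and may differ in detail.
    fn widen_reg_mem(rm: RegMem) -> RegMemImm {
        match rm {
            RegMem::Reg { reg } => RegMemImm::reg(reg),
            RegMem::Mem { addr } => RegMemImm::mem(addr),
        }
    }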
@@ -150,7 +136,7 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
                .unwrap();
        }

        XmmMem::new(RegMem::reg(self.put_in_reg(val))).unwrap()
        XmmMem::new(self.put_in_reg_mem(val)).unwrap()
    }

    fn put_in_reg_mem(&mut self, val: Value) -> RegMem {
@@ -164,12 +150,8 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
            return RegMem::mem(SyntheticAmode::ConstantOffset(vcode_constant));
        }

        if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst {
            if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, src_insn) {
                self.lower_ctx.sink_inst(src_insn);
                let amode = lower_to_amode(self.lower_ctx, addr_input, offset);
                return RegMem::mem(amode);
            }
        if let Some(load) = self.sinkable_load(val) {
            return self.sink_load(&load);
        }

        RegMem::reg(self.put_in_reg(val))
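The inline pattern deleted in this hunk is what `sinkable_load`/`sink_load` now encapsulate. Below is a condensed sketch of that shared logic; the function name and exact signature are illustrative, while the individual calls mirror the code removed above.

    // Illustrative condensation of the removed inline logic; the real helpers
    // are `sinkable_load` and `sink_load` on the ISLE context.
    fn sink_unique_load(
        cx: &mut IsleContext<'_, '_, MInst, X64Backend>,
        val: Value,
    ) -> Option<RegMem> {
        let inputs = cx.lower_ctx.get_value_as_source_or_const(val);
        // Only a load whose result has exactly one use may be sunk into the
        // using instruction's memory operand.
        if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst {
            if let Some((addr_input, offset)) = is_mergeable_load(cx.lower_ctx, src_insn) {
                cx.lower_ctx.sink_inst(src_insn);
                let amode = lower_to_amode(cx.lower_ctx, addr_input, offset);
                return Some(RegMem::mem(amode));
            }
        }
        None
    }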
@@ -446,7 +428,7 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {

    #[inline]
    fn xmm_mem_to_xmm_mem_imm(&mut self, r: &XmmMem) -> XmmMemImm {
        r.clone().into()
        XmmMemImm::new(r.clone().to_reg_mem().into()).unwrap()
    }

    #[inline]
@@ -997,10 +979,40 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
            }
        }
    }

    fn xmm_mem_to_xmm_mem_aligned(&mut self, arg: &XmmMem) -> XmmMemAligned {
        match XmmMemAligned::new(arg.clone().into()) {
            Some(aligned) => aligned,
            None => match arg.clone().into() {
                RegMem::Mem { addr } => self.load_xmm_unaligned(addr).into(),
                _ => unreachable!(),
            },
        }
    }

    fn xmm_mem_imm_to_xmm_mem_aligned_imm(&mut self, arg: &XmmMemImm) -> XmmMemAlignedImm {
        match XmmMemAlignedImm::new(arg.clone().into()) {
            Some(aligned) => aligned,
            None => match arg.clone().into() {
                RegMemImm::Mem { addr } => self.load_xmm_unaligned(addr).into(),
                _ => unreachable!(),
            },
        }
    }
}

impl IsleContext<'_, '_, MInst, X64Backend> {
    isle_prelude_method_helpers!(X64Caller);

    fn load_xmm_unaligned(&mut self, addr: SyntheticAmode) -> Xmm {
        let tmp = self.lower_ctx.alloc_tmp(types::F32X4).only_reg().unwrap();
        self.lower_ctx.emit(MInst::XmmUnaryRmRUnaligned {
            op: SseOpcode::Movdqu,
            src: XmmMem::new(RegMem::mem(addr)).unwrap(),
            dst: Writable::from_reg(Xmm::new(tmp.to_reg()).unwrap()),
        });
        Xmm::new(tmp.to_reg()).unwrap()
    }
}
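Putting the pieces together, here is a hedged usage sketch; the helper below is hypothetical and not part of the change. An AVX lowering would consume an `XmmMem` directly, while an SSE lowering routes it through the conversion added above, which either proves alignment or emits the `movdqu` fallback via `load_xmm_unaligned` and degrades the operand to a temporary register.

    // Hypothetical illustration of how an SSE lowering path is expected to
    // consume operands after this change.
    fn sse_operand(
        cx: &mut IsleContext<'_, '_, MInst, X64Backend>,
        src: &XmmMem,
    ) -> XmmMemAligned {
        // The SSE path must never see unaligned memory, so the automatic
        // conversion applies here; the AVX path would use `src` as-is.
        cx.xmm_mem_to_xmm_mem_aligned(src)
    }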

// Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we