aarch64: Add more lowerings for the CLIF fma (#6150)

This commit adds new lowerings to the AArch64 backend of the element-based `fmla` and `fmls` instructions. These instructions have one of the multiplicands as an implicit broadcast of a single lane of another register and can help remove `shuffle` or `dup` instructions that would otherwise be used to implement them.
2023-04-05 12:22:55 -05:00
parent bf741955f0
commit 967543eb43
8 changed files with 321 additions and 15 deletions
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -2914,6 +2914,45 @@ impl MachInstEmit for Inst {
                };
                sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
            }
+            &Inst::VecFmlaElem {
+                rd,
+                ri,
+                rn,
+                rm,
+                alu_op,
+                size,
+                idx,
+            } => {
+                let rd = allocs.next_writable(rd);
+                let ri = allocs.next(ri);
+                debug_assert_eq!(rd.to_reg(), ri);
+                let rn = allocs.next(rn);
+                let rm = allocs.next(rm);
+                let idx = u32::from(idx);
+
+                let (q, _size) = size.enc_size();
+                let o2 = match alu_op {
+                    VecALUModOp::Fmla => 0b0,
+                    VecALUModOp::Fmls => 0b1,
+                    _ => unreachable!(),
+                };
+
+                let (h, l) = match size {
+                    VectorSize::Size32x4 => {
+                        assert!(idx < 4);
+                        (idx >> 1, idx & 1)
+                    }
+                    VectorSize::Size64x2 => {
+                        assert!(idx < 2);
+                        (idx, 0)
+                    }
+                    _ => unreachable!(),
+                };
+
+                let top11 = 0b000_011111_00 | (q << 9) | (size.enc_float_size() << 1) | l;
+                let bit15_10 = 0b000100 | (o2 << 4) | (h << 1);
+                sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
+            }
            &Inst::VecLoadReplicate {
                rd,
                rn,