aarch64: Add more lowerings for the CLIF fma (#6150)

This commit adds new lowerings to the AArch64 backend of the
element-based `fmla` and `fmls` instructions. These instructions have
one of the multiplicands as an implicit broadcast of a single lane of
another register and can help remove `shuffle` or `dup` instructions
that would otherwise be used to implement them.
This commit is contained in:
Alex Crichton
2023-04-05 12:22:55 -05:00
committed by GitHub
parent bf741955f0
commit 967543eb43
8 changed files with 321 additions and 15 deletions

View File

@@ -2914,6 +2914,45 @@ impl MachInstEmit for Inst {
};
sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
}
// Emit the element-indexed ("by element") vector form of `fmla` / `fmls`,
// where `idx` selects the single lane used as one of the multiplicands.
&Inst::VecFmlaElem {
rd,
ri,
rn,
rm,
alu_op,
size,
idx,
} => {
// Operands are pulled from `allocs` in the order rd, ri, rn, rm; keep the
// calls in this order.
let rd = allocs.next_writable(rd);
let ri = allocs.next(ri);
// The input `ri` and the destination `rd` are expected to be the same
// register (checked in debug builds only) -- presumably because the
// instruction accumulates into its destination.
debug_assert_eq!(rd.to_reg(), ri);
let rn = allocs.next(rn);
let rm = allocs.next(rm);
// Widen the lane index to `u32` so it can be shifted/OR-ed into the
// encoding fields below.
let idx = u32::from(idx);
// Only the first component (`q`) of `enc_size()` is used; the second is
// deliberately ignored (the size bit comes from `enc_float_size()` instead).
let (q, _size) = size.enc_size();
// Opcode selector bit: 0 for multiply-add, 1 for multiply-subtract. Any other
// `VecALUModOp` is considered invalid for this instruction.
let o2 = match alu_op {
VecALUModOp::Fmla => 0b0,
VecALUModOp::Fmls => 0b1,
_ => unreachable!(),
};
// Split the lane index into the two index bits `h` (high) and `l` (low).
// Note these are `assert!`s, so an out-of-range index panics in release
// builds as well.
let (h, l) = match size {
VectorSize::Size32x4 => {
// Four lanes: two index bits, `h` is bit 1 of `idx` and `l` is bit 0.
assert!(idx < 4);
(idx >> 1, idx & 1)
}
VectorSize::Size64x2 => {
// Two lanes: the single index bit goes in `h`; `l` is always 0.
assert!(idx < 2);
(idx, 0)
}
// Other vector sizes are not handled by this emitter arm.
_ => unreachable!(),
};
// `top11`: fixed base pattern with `q` at bit 9, the float-size bit at bit 1
// and `l` at bit 0 of this 11-bit field.
// NOTE(review): by its name and its use in `enc_vec_rrr`, this field is
// presumably instruction bits 31..21 -- confirm against the helper.
let top11 = 0b000_011111_00 | (q << 9) | (size.enc_float_size() << 1) | l;
// `bit15_10`: fixed base pattern with `o2` at bit 4 and `h` at bit 1 of this
// 6-bit field (presumably instruction bits 15..10, per the name).
let bit15_10 = 0b000100 | (o2 << 4) | (h << 1);
// Combine the fields with the three registers and emit the instruction word.
sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
}
&Inst::VecLoadReplicate {
rd,
rn,