Enable simd_extmul_* for AArch64

Lower simd_extmul_[low/high][signed/unsigned] to [s|u]widen inputs to
an imul node.
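For context, an extending multiply widens its inputs before multiplying, so the product always fits in the wider lane. A minimal scalar model of one signed i8x16-to-i16x8 output lane (the helper below is illustrative, not part of this change):

fn extmul_low_s_lane(x: [i8; 16], y: [i8; 16], i: usize) -> i16 {
    debug_assert!(i < 8); // only the low eight lanes feed extmul_low
    // Widen both lanes, then multiply in the wider type; the product
    // cannot overflow: i8::MIN * i8::MIN == 16384 <= i16::MAX.
    (x[i] as i16) * (y[i] as i16)
}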

Copyright (c) 2021, Arm Limited.
Sam Parker
2021-07-08 16:39:27 +01:00
parent 65378422bf
commit 541a4ee428
8 changed files with 745 additions and 269 deletions


@@ -1253,6 +1253,166 @@ pub(crate) fn maybe_input_insn_via_conv<C: LowerCtx<I = Inst>>(
None
}
/// Match an `imul` whose inputs are both defined by the same widening
/// opcode (`ext_op`). On success, return the long multiply that fuses the
/// widen and the multiply, the two narrow input registers, and whether the
/// high halves of the inputs are used.
pub(crate) fn match_vec_long_mul<C: LowerCtx<I = Inst>>(
    c: &mut C,
    insn: IRInst,
    ext_op: Opcode,
) -> Option<(VecRRRLongOp, regalloc::Reg, regalloc::Reg, bool)> {
    let inputs = insn_inputs(c, insn);
    if let Some(lhs) = maybe_input_insn(c, inputs[0], ext_op) {
        if let Some(rhs) = maybe_input_insn(c, inputs[1], ext_op) {
            let lhs_input = insn_inputs(c, lhs)[0];
            let rhs_input = insn_inputs(c, rhs)[0];
            let rn = put_input_in_reg(c, lhs_input, NarrowValueMode::None);
            let rm = put_input_in_reg(c, rhs_input, NarrowValueMode::None);
            let lane_type = c.output_ty(insn, 0).lane_type();
            match (lane_type, ext_op) {
                (I16, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull8, rn, rm, false)),
                (I16, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull8, rn, rm, true)),
                (I16, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull8, rn, rm, false)),
                (I16, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull8, rn, rm, true)),
                (I32, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull16, rn, rm, false)),
                (I32, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull16, rn, rm, true)),
                (I32, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull16, rn, rm, false)),
                (I32, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull16, rn, rm, true)),
                (I64, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull32, rn, rm, false)),
                (I64, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull32, rn, rm, true)),
                (I64, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull32, rn, rm, false)),
                (I64, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull32, rn, rm, true)),
                _ => {}
            };
        }
    }
    None
}
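// A sketch of how the `imul` lowering might consume the helper above; this
// call-site shape is an assumption for illustration, not code from this
// diff (`c`, `insn` and `rd` play the same roles as in the surrounding
// lowering code).
fn example_lower_vec_imul<C: LowerCtx<I = Inst>>(c: &mut C, insn: IRInst, rd: Writable<Reg>) {
    // Try each widening opcode in turn; `high_half` selects the forms that
    // read the upper input lanes (smull2/umull2).
    for ext_op in &[
        Opcode::SwidenLow,
        Opcode::SwidenHigh,
        Opcode::UwidenLow,
        Opcode::UwidenHigh,
    ] {
        if let Some((alu_op, rn, rm, high_half)) = match_vec_long_mul(c, insn, *ext_op) {
            c.emit(Inst::VecRRRLong {
                alu_op,
                rd,
                rn,
                rm,
                high_half,
            });
            return;
        }
    }
}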
/// Lower an I64X2 `imul`, which has no single-instruction equivalent on
/// AArch64, as a sequence of 32-bit vector operations.
pub(crate) fn lower_i64x2_mul<C: LowerCtx<I = Inst>>(c: &mut C, insn: IRInst) {
    let inputs = insn_inputs(c, insn);
    let outputs = insn_outputs(c, insn);
    let rd = get_output_reg(c, outputs[0]).regs()[0];
    let rn = put_input_in_regs(c, inputs[0]).regs()[0];
    let rm = put_input_in_regs(c, inputs[1]).regs()[0];
    let tmp1 = c.alloc_tmp(I64X2).only_reg().unwrap();
    let tmp2 = c.alloc_tmp(I64X2).only_reg().unwrap();

    // This I64X2 multiplication is performed with several 32-bit
    // operations.

    // Two 64-bit numbers x and y can be represented as:
    //   x = a + 2^32(b)
    //   y = c + 2^32(d)
    // Their 64-bit product is then:
    //   x * y = ac + 2^32(ad + bc) + 2^64(bd)
    // Note that the `2^64(bd)` term can be dropped: it lies entirely above
    // the low 64 bits that the result keeps.

    // This sequence implements an I64X2 multiply, where the registers
    // `rn` and `rm` are split up into 32-bit components:
    //   rn = |d|c|b|a|
    //   rm = |h|g|f|e|
    //
    //   rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
    //
    // The sequence is:
    //   rev64 rd.4s, rm.4s
    //   mul rd.4s, rd.4s, rn.4s
    //   xtn tmp1.2s, rn.2d
    //   addp rd.4s, rd.4s, rd.4s
    //   xtn tmp2.2s, rm.2d
    //   shll rd.2d, rd.2s, #32
    //   umlal rd.2d, tmp2.2s, tmp1.2s

    // Reverse the 32-bit elements within each 64-bit word.
    //   rd = |g|h|e|f|
    c.emit(Inst::VecMisc {
        op: VecMisc2::Rev64,
        rd,
        rn: rm,
        size: VectorSize::Size32x4,
    });

    // Calculate the high half components.
    //   rd = |dg|ch|be|af|
    //
    // Note that this 32-bit multiply of the high half discards the bits
    // that would overflow, just as 64-bit operations would. The Shll below
    // would shift out those overflow bits anyway.
    c.emit(Inst::VecRRR {
        alu_op: VecALUOp::Mul,
        rd,
        rn: rd.to_reg(),
        rm: rn,
        size: VectorSize::Size32x4,
    });

    // Extract the low half components of rn.
    //   tmp1 = |c|a|
    c.emit(Inst::VecRRNarrow {
        op: VecRRNarrowOp::Xtn64,
        rd: tmp1,
        rn,
        high_half: false,
    });

    // Sum the respective high half components.
    //   rd = |dg+ch|be+af|dg+ch|be+af|
    c.emit(Inst::VecRRR {
        alu_op: VecALUOp::Addp,
        rd,
        rn: rd.to_reg(),
        rm: rd.to_reg(),
        size: VectorSize::Size32x4,
    });

    // Extract the low half components of rm.
    //   tmp2 = |g|e|
    c.emit(Inst::VecRRNarrow {
        op: VecRRNarrowOp::Xtn64,
        rd: tmp2,
        rn: rm,
        high_half: false,
    });

    // Shift the high-half sums into the top half of each 64-bit lane.
    //   rd = |(dg+ch) << 32|(be+af) << 32|
    c.emit(Inst::VecRRLong {
        op: VecRRLongOp::Shll32,
        rd,
        rn: rd.to_reg(),
        high_half: false,
    });

    // Multiply the low half components together and accumulate into the
    // shifted high-half sums.
    //   rd = |rd[1] + cg|rd[0] + ae|
    c.emit(Inst::VecRRRLong {
        alu_op: VecRRRLongOp::Umlal32,
        rd,
        rn: tmp2.to_reg(),
        rm: tmp1.to_reg(),
        high_half: false,
    });
}
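// To check the arithmetic above, here is a scalar model of the same
// per-lane decomposition (an illustration, not part of this commit): split
// each operand into 32-bit halves and keep only the terms that survive in
// the low 64 bits of the product.
fn mul64_via_32bit_halves(x: u64, y: u64) -> u64 {
    let (a, b) = (x & 0xffff_ffff, x >> 32); // x = a + 2^32 * b
    let (c, d) = (y & 0xffff_ffff, y >> 32); // y = c + 2^32 * d
    // x * y mod 2^64 = ac + 2^32(ad + bc); the 2^64(bd) term vanishes.
    let cross = a.wrapping_mul(d).wrapping_add(b.wrapping_mul(c));
    a.wrapping_mul(c).wrapping_add(cross << 32)
}
// For any x and y: mul64_via_32bit_halves(x, y) == x.wrapping_mul(y).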
/// Specifies what [lower_icmp] should do when lowering
#[derive(Debug, Clone, PartialEq)]
pub(crate) enum IcmpOutput {