Enable simd_extmul_* for AArch64

Lower simd_extmul_[low|high]_[signed|unsigned] to [s|u]widen ops on the
inputs feeding an imul node.

Copyright (c) 2021, Arm Limited.
Author: Sam Parker
Date:   2021-07-08 16:39:27 +01:00
parent 65378422bf
commit 541a4ee428
8 changed files with 745 additions and 269 deletions
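
As a rough scalar model of what one lane of the lowered pattern computes
(illustrative only; the function below is not a Cranelift API), extmul widens
the chosen half of each input to double the lane width and multiplies in the
wider type, so the product cannot overflow. On AArch64 that pattern maps
directly onto smull/umull (and smull2/umull2 for the high halves), which the
new VecRRRLong instruction below emits:

    // Scalar sketch of extmul_low (signed, i8x16 -> i16x8) semantics.
    fn extmul_low_signed_i8x16(a: [i8; 16], b: [i8; 16]) -> [i16; 8] {
        let mut out = [0i16; 8];
        for i in 0..8 {
            // widen the low-half lanes first, then multiply at i16
            out[i] = (a[i] as i16) * (b[i] as i16);
        }
        out
    }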


@@ -287,6 +287,22 @@ fn enc_vec_rrr(top11: u32, rm: Reg, bit15_10: u32, rn: Reg, rd: Writable<Reg>) -
| machreg_to_vec(rd.to_reg())
}
+fn enc_vec_rrr_long(q: u32, u: u32, size: u32, bit14: u32, rm: Reg, rn: Reg, rd: Writable<Reg>) -> u32 {
+    debug_assert_eq!(q & 0b1, q);
+    debug_assert_eq!(u & 0b1, u);
+    debug_assert_eq!(size & 0b11, size);
+    debug_assert_eq!(bit14 & 0b1, bit14);
+    0b0_0_0_01110_00_1_00000_100000_00000_00000
+        | q << 30
+        | u << 29
+        | size << 22
+        | bit14 << 14
+        | (machreg_to_vec(rm) << 16)
+        | (machreg_to_vec(rn) << 5)
+        | machreg_to_vec(rd.to_reg())
+}
fn enc_bit_rr(size: u32, opcode2: u32, opcode1: u32, rn: Reg, rd: Writable<Reg>) -> u32 {
(0b01011010110 << 21)
| size << 31
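
As a sanity check on enc_vec_rrr_long, the first smull test vector from the
binemit tests in this commit works out as follows (a worked example added for
this write-up, not code from the commit):

    // smull v16.8h, v12.8b, v1.8b:
    // q=0 (low half), u=0 (signed), size=0b00 (8-bit lanes), bit14=1 (mull).
    let word: u32 = 0x0E20_8000 // value of the 0b0_0_0_01110_00_... literal
        | 0 << 30               // q
        | 0 << 29               // u
        | 0b00 << 22            // size
        | 1 << 14               // bit14
        | (1 << 16)             // rm = v1
        | (12 << 5)             // rn = v12
        | 16;                   // rd = v16
    assert_eq!(word, 0x0E21_C190); // little-endian bytes: "90C1210E"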
@@ -2173,6 +2189,26 @@ impl MachInstEmit for Inst {
sink.put4(enc_vec_rr_pair(bits_12_16, rd, rn));
}
+            &Inst::VecRRRLong {
+                rd,
+                rn,
+                rm,
+                alu_op,
+                high_half,
+            } => {
+                let (u, size, bit14) = match alu_op {
+                    VecRRRLongOp::Smull8 => (0b0, 0b00, 0b1),
+                    VecRRRLongOp::Smull16 => (0b0, 0b01, 0b1),
+                    VecRRRLongOp::Smull32 => (0b0, 0b10, 0b1),
+                    VecRRRLongOp::Umull8 => (0b1, 0b00, 0b1),
+                    VecRRRLongOp::Umull16 => (0b1, 0b01, 0b1),
+                    VecRRRLongOp::Umull32 => (0b1, 0b10, 0b1),
+                    VecRRRLongOp::Umlal8 => (0b1, 0b00, 0b0),
+                    VecRRRLongOp::Umlal16 => (0b1, 0b01, 0b0),
+                    VecRRRLongOp::Umlal32 => (0b1, 0b10, 0b0),
+                };
+                sink.put4(enc_vec_rrr_long(high_half as u32, u, size, bit14, rm, rn, rd));
+            }
&Inst::VecRRR {
rd,
rn,
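
The same helper also produces the accumulating forms: u=1 with bit14=0
selects umlal/umlal2 rather than umull/umull2. Checking it against the umlal2
test vector from this commit (again a worked example, not code from the
commit):

    // umlal2 v1.8h, v5.16b, v15.16b:
    // high_half=true so q=1, u=1 (unsigned), size=0b00, bit14=0 (accumulate).
    let word: u32 = 0x0E20_8000
        | 1 << 30        // q (high half)
        | 1 << 29        // u
        | 0b00 << 22     // size
        | 0 << 14        // bit14: mlal, not mull
        | (15 << 16)     // rm = v15
        | (5 << 5)       // rn = v5
        | 1;             // rd = v1
    assert_eq!(word, 0x6E2F_80A1); // little-endian bytes: "A1802F6E"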
@@ -2242,13 +2278,7 @@ impl MachInstEmit for Inst {
VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
-                    VecALUOp::Umlal => {
-                        debug_assert!(!size.is_128bits());
-                        (0b001_01110_00_1 | enc_size << 1, 0b100000)
-                    }
VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
-                    VecALUOp::Smull => (0b000_01110_00_1 | enc_size << 1, 0b110000),
-                    VecALUOp::Smull2 => (0b010_01110_00_1 | enc_size << 1, 0b110000),
VecALUOp::Sqrdmulh => {
debug_assert!(
size.lane_size() == ScalarSize::Size16
@@ -2258,12 +2288,12 @@ impl MachInstEmit for Inst {
(0b001_01110_00_1 | enc_size << 1, 0b101101)
}
};
-                let top11 = match alu_op {
-                    VecALUOp::Smull | VecALUOp::Smull2 => top11,
-                    _ if is_float => top11 | (q << 9) | enc_float_size << 1,
-                    _ => top11 | (q << 9),
+                let top11 = if is_float {
+                    top11 | enc_float_size << 1
+                } else {
+                    top11
                 };
-                sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
+                sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
}
&Inst::VecLoadReplicate { rd, rn, size } => {
let (q, size) = size.enc_size();
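
Note how high_half feeds straight into q (bit 30) in the emission arm above:
the low- and high-half forms of one operation differ only in that bit, which
the test vectors below confirm (worked example, not code from the commit):

    // smull v16.8h, v12.8b, v1.8b encodes as 0x0E21C190 ("90C1210E");
    // setting q gives smull2 v16.8h, v12.16b, v1.16b ("90C1214E").
    assert_eq!(0x0E21_C190u32 | 1 << 30, 0x4E21_C190);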


@@ -3651,18 +3651,6 @@ fn test_aarch64_binemit() {
"addp v8.4s, v12.4s, v14.4s",
));
-    insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Umlal,
-            rd: writable_vreg(9),
-            rn: vreg(20),
-            rm: vreg(17),
-            size: VectorSize::Size32x2,
-        },
-        "8982B12E",
-        "umlal v9.2d, v20.2s, v17.2s",
-    ));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Zip1,
@@ -3712,77 +3700,221 @@ fn test_aarch64_binemit() {
));
insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Smull,
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Smull8,
rd: writable_vreg(16),
rn: vreg(12),
rm: vreg(1),
-            size: VectorSize::Size8x16,
+            high_half: false
},
"90C1210E",
"smull v16.8h, v12.8b, v1.8b",
));
insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Smull,
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umull8,
+            rd: writable_vreg(15),
+            rn: vreg(11),
+            rm: vreg(2),
+            high_half: false
+        },
+        "6FC1222E",
+        "umull v15.8h, v11.8b, v2.8b",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umlal8,
+            rd: writable_vreg(4),
+            rn: vreg(8),
+            rm: vreg(16),
+            high_half: false
+        },
+        "0481302E",
+        "umlal v4.8h, v8.8b, v16.8b",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Smull16,
rd: writable_vreg(2),
rn: vreg(13),
rm: vreg(6),
-            size: VectorSize::Size16x8,
+            high_half: false,
},
"A2C1660E",
"smull v2.4s, v13.4h, v6.4h",
));
insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Smull,
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umull16,
+            rd: writable_vreg(3),
+            rn: vreg(14),
+            rm: vreg(7),
+            high_half: false,
+        },
+        "C3C1672E",
+        "umull v3.4s, v14.4h, v7.4h",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umlal16,
+            rd: writable_vreg(7),
+            rn: vreg(14),
+            rm: vreg(21),
+            high_half: false,
+        },
+        "C781752E",
+        "umlal v7.4s, v14.4h, v21.4h",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Smull32,
rd: writable_vreg(8),
rn: vreg(12),
rm: vreg(14),
-            size: VectorSize::Size32x4,
+            high_half: false,
},
"88C1AE0E",
"smull v8.2d, v12.2s, v14.2s",
));
insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Smull2,
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umull32,
+            rd: writable_vreg(9),
+            rn: vreg(5),
+            rm: vreg(6),
+            high_half: false,
+        },
+        "A9C0A62E",
+        "umull v9.2d, v5.2s, v6.2s",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umlal32,
+            rd: writable_vreg(9),
+            rn: vreg(20),
+            rm: vreg(17),
+            high_half: false,
+        },
+        "8982B12E",
+        "umlal v9.2d, v20.2s, v17.2s",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Smull8,
rd: writable_vreg(16),
rn: vreg(12),
rm: vreg(1),
-            size: VectorSize::Size8x16,
+            high_half: true,
},
"90C1214E",
"smull2 v16.8h, v12.16b, v1.16b",
));
insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Smull2,
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umull8,
+            rd: writable_vreg(29),
+            rn: vreg(22),
+            rm: vreg(10),
+            high_half: true,
+        },
+        "DDC22A6E",
+        "umull2 v29.8h, v22.16b, v10.16b",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umlal8,
+            rd: writable_vreg(1),
+            rn: vreg(5),
+            rm: vreg(15),
+            high_half: true,
+        },
+        "A1802F6E",
+        "umlal2 v1.8h, v5.16b, v15.16b",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Smull16,
rd: writable_vreg(2),
rn: vreg(13),
rm: vreg(6),
-            size: VectorSize::Size16x8,
+            high_half: true,
},
"A2C1664E",
"smull2 v2.4s, v13.8h, v6.8h",
));
insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Smull2,
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umull16,
+            rd: writable_vreg(19),
+            rn: vreg(18),
+            rm: vreg(17),
+            high_half: true,
+        },
+        "53C2716E",
+        "umull2 v19.4s, v18.8h, v17.8h",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umlal16,
+            rd: writable_vreg(11),
+            rn: vreg(10),
+            rm: vreg(12),
+            high_half: true,
+        },
+        "4B816C6E",
+        "umlal2 v11.4s, v10.8h, v12.8h",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Smull32,
rd: writable_vreg(8),
rn: vreg(12),
rm: vreg(14),
-            size: VectorSize::Size32x4,
+            high_half: true,
},
"88C1AE4E",
"smull2 v8.2d, v12.4s, v14.4s",
));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umull32,
+            rd: writable_vreg(4),
+            rn: vreg(12),
+            rm: vreg(16),
+            high_half: true,
+        },
+        "84C1B06E",
+        "umull2 v4.2d, v12.4s, v16.4s",
+    ));
+    insns.push((
+        Inst::VecRRRLong {
+            alu_op: VecRRRLongOp::Umlal32,
+            rd: writable_vreg(10),
+            rn: vreg(29),
+            rm: vreg(2),
+            high_half: true,
+        },
+        "AA83A26E",
+        "umlal2 v10.2d, v29.4s, v2.4s",
+    ));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Sqrdmulh,
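
For illustration, the low-half widening accumulate from the tests above can
be constructed directly (a usage sketch, not code from the commit). Its
expected bytes, "8982B12E", are identical to those of the deleted VecRRR
Umlal test, so the new VecRRRLong path preserves the old umlal encoding
exactly:

    let inst = Inst::VecRRRLong {
        alu_op: VecRRRLongOp::Umlal32,
        rd: writable_vreg(9),
        rn: vreg(20),
        rm: vreg(17),
        high_half: false,
    };
    // pretty-prints as: umlal v9.2d, v20.2s, v17.2s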


@@ -303,14 +303,8 @@ pub enum VecALUOp {
Fmul,
/// Add pairwise
Addp,
-    /// Unsigned multiply add long
-    Umlal,
/// Zip vectors (primary) [meaning, high halves]
Zip1,
-    /// Signed multiply long (low halves)
-    Smull,
-    /// Signed multiply long (high halves)
-    Smull2,
/// Signed saturating rounding doubling multiply returning high half
Sqrdmulh,
}
@@ -402,6 +396,23 @@ pub enum VecRRNarrowOp {
Fcvtn64,
}
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum VecRRRLongOp {
+    /// Signed multiply long.
+    Smull8,
+    Smull16,
+    Smull32,
+    /// Unsigned multiply long.
+    Umull8,
+    Umull16,
+    Umull32,
+    /// Unsigned multiply add long.
+    Umlal8,
+    Umlal16,
+    Umlal32,
+}
/// A vector operation on a pair of elements with one register.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum VecPairOp {
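
The VecRRRLongOp variants above encode the input lane width; the output lanes
are twice as wide. A hypothetical helper (the lowering changes live in files
not shown in this excerpt) might pick a variant like this:

    // Hypothetical sketch only; not code from this commit.
    fn smull_op_for_input_lane_bits(bits: u32) -> VecRRRLongOp {
        match bits {
            8 => VecRRRLongOp::Smull8,
            16 => VecRRRLongOp::Smull16,
            32 => VecRRRLongOp::Smull32,
            _ => unreachable!("smull takes 8-, 16- or 32-bit input lanes"),
        }
    }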
@@ -1087,6 +1098,16 @@ pub enum Inst {
rn: Reg,
},
+    /// 2-operand vector instruction that produces a result with twice the
+    /// lane width and half the number of lanes.
+    VecRRRLong {
+        alu_op: VecRRRLongOp,
+        rd: Writable<Reg>,
+        rn: Reg,
+        rm: Reg,
+        high_half: bool,
+    },
/// A vector ALU op.
VecRRR {
alu_op: VecALUOp,
@@ -2134,10 +2155,22 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_def(rd);
collector.add_use(rn);
}
+        &Inst::VecRRRLong {
+            alu_op, rd, rn, rm, ..
+        } => {
+            match alu_op {
+                VecRRRLongOp::Umlal8
+                | VecRRRLongOp::Umlal16
+                | VecRRRLongOp::Umlal32 => collector.add_mod(rd),
+                _ => collector.add_def(rd),
+            };
+            collector.add_use(rn);
+            collector.add_use(rm);
+        }
&Inst::VecRRR {
alu_op, rd, rn, rm, ..
} => {
-            if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal {
+            if alu_op == VecALUOp::Bsl {
collector.add_mod(rd);
} else {
collector.add_def(rd);
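
The add_mod (rather than add_def) treatment of rd for the Umlal variants
above reflects that UMLAL accumulates: the destination is read as well as
written, so the register allocator must keep its previous value alive. A
scalar model of one lane (illustrative only, not a Cranelift API):

    // umlal: acc' = acc + widen(a) * widen(b)
    fn umlal_lane_u16(acc: u32, a: u16, b: u16) -> u32 {
        acc.wrapping_add(a as u32 * b as u32)
    }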
@@ -2944,6 +2977,22 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
map_def(mapper, rd);
map_use(mapper, rn);
}
+        &mut Inst::VecRRRLong {
+            alu_op,
+            ref mut rd,
+            ref mut rn,
+            ref mut rm,
+            ..
+        } => {
+            match alu_op {
+                VecRRRLongOp::Umlal8
+                | VecRRRLongOp::Umlal16
+                | VecRRRLongOp::Umlal32 => map_mod(mapper, rd),
+                _ => map_def(mapper, rd),
+            };
+            map_use(mapper, rn);
+            map_use(mapper, rm);
+        }
&mut Inst::VecRRR {
alu_op,
ref mut rd,
@@ -2951,7 +3000,7 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
ref mut rm,
..
} => {
-            if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal {
+            if alu_op == VecALUOp::Bsl {
map_mod(mapper, rd);
} else {
map_def(mapper, rd);
@@ -4147,24 +4196,62 @@ impl Inst {
VecALUOp::Fmin => ("fmin", size),
VecALUOp::Fmul => ("fmul", size),
VecALUOp::Addp => ("addp", size),
-                    VecALUOp::Umlal => ("umlal", size),
VecALUOp::Zip1 => ("zip1", size),
-                    VecALUOp::Smull => ("smull", size),
-                    VecALUOp::Smull2 => ("smull2", size),
VecALUOp::Sqrdmulh => ("sqrdmulh", size),
};
-                let rd_size = match alu_op {
-                    VecALUOp::Umlal | VecALUOp::Smull | VecALUOp::Smull2 => size.widen(),
-                    _ => size,
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+                let rn = show_vreg_vector(rn, mb_rru, size);
+                let rm = show_vreg_vector(rm, mb_rru, size);
format!("{} {}, {}, {}", op, rd, rn, rm)
}
+            &Inst::VecRRRLong {
+                rd,
+                rn,
+                rm,
+                alu_op,
+                high_half,
+            } => {
+                let (op, dest_size, src_size) = match (alu_op, high_half) {
+                    (VecRRRLongOp::Smull8, false) =>
+                        ("smull", VectorSize::Size16x8, VectorSize::Size8x8),
+                    (VecRRRLongOp::Smull8, true) =>
+                        ("smull2", VectorSize::Size16x8, VectorSize::Size8x16),
+                    (VecRRRLongOp::Smull16, false) =>
+                        ("smull", VectorSize::Size32x4, VectorSize::Size16x4),
+                    (VecRRRLongOp::Smull16, true) =>
+                        ("smull2", VectorSize::Size32x4, VectorSize::Size16x8),
+                    (VecRRRLongOp::Smull32, false) =>
+                        ("smull", VectorSize::Size64x2, VectorSize::Size32x2),
+                    (VecRRRLongOp::Smull32, true) =>
+                        ("smull2", VectorSize::Size64x2, VectorSize::Size32x4),
+                    (VecRRRLongOp::Umull8, false) =>
+                        ("umull", VectorSize::Size16x8, VectorSize::Size8x8),
+                    (VecRRRLongOp::Umull8, true) =>
+                        ("umull2", VectorSize::Size16x8, VectorSize::Size8x16),
+                    (VecRRRLongOp::Umull16, false) =>
+                        ("umull", VectorSize::Size32x4, VectorSize::Size16x4),
+                    (VecRRRLongOp::Umull16, true) =>
+                        ("umull2", VectorSize::Size32x4, VectorSize::Size16x8),
+                    (VecRRRLongOp::Umull32, false) =>
+                        ("umull", VectorSize::Size64x2, VectorSize::Size32x2),
+                    (VecRRRLongOp::Umull32, true) =>
+                        ("umull2", VectorSize::Size64x2, VectorSize::Size32x4),
+                    (VecRRRLongOp::Umlal8, false) =>
+                        ("umlal", VectorSize::Size16x8, VectorSize::Size8x8),
+                    (VecRRRLongOp::Umlal8, true) =>
+                        ("umlal2", VectorSize::Size16x8, VectorSize::Size8x16),
+                    (VecRRRLongOp::Umlal16, false) =>
+                        ("umlal", VectorSize::Size32x4, VectorSize::Size16x4),
+                    (VecRRRLongOp::Umlal16, true) =>
+                        ("umlal2", VectorSize::Size32x4, VectorSize::Size16x8),
+                    (VecRRRLongOp::Umlal32, false) =>
+                        ("umlal", VectorSize::Size64x2, VectorSize::Size32x2),
+                    (VecRRRLongOp::Umlal32, true) =>
+                        ("umlal2", VectorSize::Size64x2, VectorSize::Size32x4),
};
-                let rn_size = match alu_op {
-                    VecALUOp::Smull => size.halve(),
-                    _ => size,
-                };
-                let rm_size = rn_size;
-                let rd = show_vreg_vector(rd.to_reg(), mb_rru, rd_size);
-                let rn = show_vreg_vector(rn, mb_rru, rn_size);
-                let rm = show_vreg_vector(rm, mb_rru, rm_size);
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, dest_size);
+                let rn = show_vreg_vector(rn, mb_rru, src_size);
+                let rm = show_vreg_vector(rm, mb_rru, src_size);
+                format!("{} {}, {}, {}", op, rd, rn, rm)
+            }
&Inst::VecMisc { op, rd, rn, size } => {