Merge pull request #3070 from sparker-arm/simd-extmul-aarch64
Enable simd_extmul_* for AArch64
This commit is contained in:
@@ -287,6 +287,30 @@ fn enc_vec_rrr(top11: u32, rm: Reg, bit15_10: u32, rn: Reg, rd: Writable<Reg>) -
|
|||||||
| machreg_to_vec(rd.to_reg())
|
| machreg_to_vec(rd.to_reg())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn enc_vec_rrr_long(
|
||||||
|
q: u32,
|
||||||
|
u: u32,
|
||||||
|
size: u32,
|
||||||
|
bit14: u32,
|
||||||
|
rm: Reg,
|
||||||
|
rn: Reg,
|
||||||
|
rd: Writable<Reg>,
|
||||||
|
) -> u32 {
|
||||||
|
debug_assert_eq!(q & 0b1, q);
|
||||||
|
debug_assert_eq!(u & 0b1, u);
|
||||||
|
debug_assert_eq!(size & 0b11, size);
|
||||||
|
debug_assert_eq!(bit14 & 0b1, bit14);
|
||||||
|
|
||||||
|
0b0_0_0_01110_00_1_00000_100000_00000_00000
|
||||||
|
| q << 30
|
||||||
|
| u << 29
|
||||||
|
| size << 22
|
||||||
|
| bit14 << 14
|
||||||
|
| (machreg_to_vec(rm) << 16)
|
||||||
|
| (machreg_to_vec(rn) << 5)
|
||||||
|
| machreg_to_vec(rd.to_reg())
|
||||||
|
}
|
||||||
|
|
||||||
fn enc_bit_rr(size: u32, opcode2: u32, opcode1: u32, rn: Reg, rd: Writable<Reg>) -> u32 {
|
fn enc_bit_rr(size: u32, opcode2: u32, opcode1: u32, rn: Reg, rd: Writable<Reg>) -> u32 {
|
||||||
(0b01011010110 << 21)
|
(0b01011010110 << 21)
|
||||||
| size << 31
|
| size << 31
|
||||||
@@ -2173,6 +2197,34 @@ impl MachInstEmit for Inst {
|
|||||||
|
|
||||||
sink.put4(enc_vec_rr_pair(bits_12_16, rd, rn));
|
sink.put4(enc_vec_rr_pair(bits_12_16, rd, rn));
|
||||||
}
|
}
|
||||||
|
&Inst::VecRRRLong {
|
||||||
|
rd,
|
||||||
|
rn,
|
||||||
|
rm,
|
||||||
|
alu_op,
|
||||||
|
high_half,
|
||||||
|
} => {
|
||||||
|
let (u, size, bit14) = match alu_op {
|
||||||
|
VecRRRLongOp::Smull8 => (0b0, 0b00, 0b1),
|
||||||
|
VecRRRLongOp::Smull16 => (0b0, 0b01, 0b1),
|
||||||
|
VecRRRLongOp::Smull32 => (0b0, 0b10, 0b1),
|
||||||
|
VecRRRLongOp::Umull8 => (0b1, 0b00, 0b1),
|
||||||
|
VecRRRLongOp::Umull16 => (0b1, 0b01, 0b1),
|
||||||
|
VecRRRLongOp::Umull32 => (0b1, 0b10, 0b1),
|
||||||
|
VecRRRLongOp::Umlal8 => (0b1, 0b00, 0b0),
|
||||||
|
VecRRRLongOp::Umlal16 => (0b1, 0b01, 0b0),
|
||||||
|
VecRRRLongOp::Umlal32 => (0b1, 0b10, 0b0),
|
||||||
|
};
|
||||||
|
sink.put4(enc_vec_rrr_long(
|
||||||
|
high_half as u32,
|
||||||
|
u,
|
||||||
|
size,
|
||||||
|
bit14,
|
||||||
|
rm,
|
||||||
|
rn,
|
||||||
|
rd,
|
||||||
|
));
|
||||||
|
}
|
||||||
&Inst::VecRRR {
|
&Inst::VecRRR {
|
||||||
rd,
|
rd,
|
||||||
rn,
|
rn,
|
||||||
@@ -2242,13 +2294,7 @@ impl MachInstEmit for Inst {
|
|||||||
VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
|
VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
|
||||||
VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
|
VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
|
||||||
VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
|
VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
|
||||||
VecALUOp::Umlal => {
|
|
||||||
debug_assert!(!size.is_128bits());
|
|
||||||
(0b001_01110_00_1 | enc_size << 1, 0b100000)
|
|
||||||
}
|
|
||||||
VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
|
VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
|
||||||
VecALUOp::Smull => (0b000_01110_00_1 | enc_size << 1, 0b110000),
|
|
||||||
VecALUOp::Smull2 => (0b010_01110_00_1 | enc_size << 1, 0b110000),
|
|
||||||
VecALUOp::Sqrdmulh => {
|
VecALUOp::Sqrdmulh => {
|
||||||
debug_assert!(
|
debug_assert!(
|
||||||
size.lane_size() == ScalarSize::Size16
|
size.lane_size() == ScalarSize::Size16
|
||||||
@@ -2258,12 +2304,12 @@ impl MachInstEmit for Inst {
|
|||||||
(0b001_01110_00_1 | enc_size << 1, 0b101101)
|
(0b001_01110_00_1 | enc_size << 1, 0b101101)
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
let top11 = match alu_op {
|
let top11 = if is_float {
|
||||||
VecALUOp::Smull | VecALUOp::Smull2 => top11,
|
top11 | enc_float_size << 1
|
||||||
_ if is_float => top11 | (q << 9) | enc_float_size << 1,
|
} else {
|
||||||
_ => top11 | (q << 9),
|
top11
|
||||||
};
|
};
|
||||||
sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
|
sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
|
||||||
}
|
}
|
||||||
&Inst::VecLoadReplicate { rd, rn, size } => {
|
&Inst::VecLoadReplicate { rd, rn, size } => {
|
||||||
let (q, size) = size.enc_size();
|
let (q, size) = size.enc_size();
|
||||||
|
|||||||
@@ -3651,18 +3651,6 @@ fn test_aarch64_binemit() {
|
|||||||
"addp v8.4s, v12.4s, v14.4s",
|
"addp v8.4s, v12.4s, v14.4s",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
|
||||||
Inst::VecRRR {
|
|
||||||
alu_op: VecALUOp::Umlal,
|
|
||||||
rd: writable_vreg(9),
|
|
||||||
rn: vreg(20),
|
|
||||||
rm: vreg(17),
|
|
||||||
size: VectorSize::Size32x2,
|
|
||||||
},
|
|
||||||
"8982B12E",
|
|
||||||
"umlal v9.2d, v20.2s, v17.2s",
|
|
||||||
));
|
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::VecRRR {
|
Inst::VecRRR {
|
||||||
alu_op: VecALUOp::Zip1,
|
alu_op: VecALUOp::Zip1,
|
||||||
@@ -3712,77 +3700,221 @@ fn test_aarch64_binemit() {
|
|||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::VecRRR {
|
Inst::VecRRRLong {
|
||||||
alu_op: VecALUOp::Smull,
|
alu_op: VecRRRLongOp::Smull8,
|
||||||
rd: writable_vreg(16),
|
rd: writable_vreg(16),
|
||||||
rn: vreg(12),
|
rn: vreg(12),
|
||||||
rm: vreg(1),
|
rm: vreg(1),
|
||||||
size: VectorSize::Size8x16,
|
high_half: false,
|
||||||
},
|
},
|
||||||
"90C1210E",
|
"90C1210E",
|
||||||
"smull v16.8h, v12.8b, v1.8b",
|
"smull v16.8h, v12.8b, v1.8b",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::VecRRR {
|
Inst::VecRRRLong {
|
||||||
alu_op: VecALUOp::Smull,
|
alu_op: VecRRRLongOp::Umull8,
|
||||||
|
rd: writable_vreg(15),
|
||||||
|
rn: vreg(11),
|
||||||
|
rm: vreg(2),
|
||||||
|
high_half: false,
|
||||||
|
},
|
||||||
|
"6FC1222E",
|
||||||
|
"umull v15.8h, v11.8b, v2.8b",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::VecRRRLong {
|
||||||
|
alu_op: VecRRRLongOp::Umlal8,
|
||||||
|
rd: writable_vreg(4),
|
||||||
|
rn: vreg(8),
|
||||||
|
rm: vreg(16),
|
||||||
|
high_half: false,
|
||||||
|
},
|
||||||
|
"0481302E",
|
||||||
|
"umlal v4.8h, v8.8b, v16.8b",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::VecRRRLong {
|
||||||
|
alu_op: VecRRRLongOp::Smull16,
|
||||||
rd: writable_vreg(2),
|
rd: writable_vreg(2),
|
||||||
rn: vreg(13),
|
rn: vreg(13),
|
||||||
rm: vreg(6),
|
rm: vreg(6),
|
||||||
size: VectorSize::Size16x8,
|
high_half: false,
|
||||||
},
|
},
|
||||||
"A2C1660E",
|
"A2C1660E",
|
||||||
"smull v2.4s, v13.4h, v6.4h",
|
"smull v2.4s, v13.4h, v6.4h",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::VecRRR {
|
Inst::VecRRRLong {
|
||||||
alu_op: VecALUOp::Smull,
|
alu_op: VecRRRLongOp::Umull16,
|
||||||
|
rd: writable_vreg(3),
|
||||||
|
rn: vreg(14),
|
||||||
|
rm: vreg(7),
|
||||||
|
high_half: false,
|
||||||
|
},
|
||||||
|
"C3C1672E",
|
||||||
|
"umull v3.4s, v14.4h, v7.4h",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::VecRRRLong {
|
||||||
|
alu_op: VecRRRLongOp::Umlal16,
|
||||||
|
rd: writable_vreg(7),
|
||||||
|
rn: vreg(14),
|
||||||
|
rm: vreg(21),
|
||||||
|
high_half: false,
|
||||||
|
},
|
||||||
|
"C781752E",
|
||||||
|
"umlal v7.4s, v14.4h, v21.4h",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::VecRRRLong {
|
||||||
|
alu_op: VecRRRLongOp::Smull32,
|
||||||
rd: writable_vreg(8),
|
rd: writable_vreg(8),
|
||||||
rn: vreg(12),
|
rn: vreg(12),
|
||||||
rm: vreg(14),
|
rm: vreg(14),
|
||||||
size: VectorSize::Size32x4,
|
high_half: false,
|
||||||
},
|
},
|
||||||
"88C1AE0E",
|
"88C1AE0E",
|
||||||
"smull v8.2d, v12.2s, v14.2s",
|
"smull v8.2d, v12.2s, v14.2s",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::VecRRR {
|
Inst::VecRRRLong {
|
||||||
alu_op: VecALUOp::Smull2,
|
alu_op: VecRRRLongOp::Umull32,
|
||||||
|
rd: writable_vreg(9),
|
||||||
|
rn: vreg(5),
|
||||||
|
rm: vreg(6),
|
||||||
|
high_half: false,
|
||||||
|
},
|
||||||
|
"A9C0A62E",
|
||||||
|
"umull v9.2d, v5.2s, v6.2s",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::VecRRRLong {
|
||||||
|
alu_op: VecRRRLongOp::Umlal32,
|
||||||
|
rd: writable_vreg(9),
|
||||||
|
rn: vreg(20),
|
||||||
|
rm: vreg(17),
|
||||||
|
high_half: false,
|
||||||
|
},
|
||||||
|
"8982B12E",
|
||||||
|
"umlal v9.2d, v20.2s, v17.2s",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::VecRRRLong {
|
||||||
|
alu_op: VecRRRLongOp::Smull8,
|
||||||
rd: writable_vreg(16),
|
rd: writable_vreg(16),
|
||||||
rn: vreg(12),
|
rn: vreg(12),
|
||||||
rm: vreg(1),
|
rm: vreg(1),
|
||||||
size: VectorSize::Size8x16,
|
high_half: true,
|
||||||
},
|
},
|
||||||
"90C1214E",
|
"90C1214E",
|
||||||
"smull2 v16.8h, v12.16b, v1.16b",
|
"smull2 v16.8h, v12.16b, v1.16b",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::VecRRR {
|
Inst::VecRRRLong {
|
||||||
alu_op: VecALUOp::Smull2,
|
alu_op: VecRRRLongOp::Umull8,
|
||||||
|
rd: writable_vreg(29),
|
||||||
|
rn: vreg(22),
|
||||||
|
rm: vreg(10),
|
||||||
|
high_half: true,
|
||||||
|
},
|
||||||
|
"DDC22A6E",
|
||||||
|
"umull2 v29.8h, v22.16b, v10.16b",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::VecRRRLong {
|
||||||
|
alu_op: VecRRRLongOp::Umlal8,
|
||||||
|
rd: writable_vreg(1),
|
||||||
|
rn: vreg(5),
|
||||||
|
rm: vreg(15),
|
||||||
|
high_half: true,
|
||||||
|
},
|
||||||
|
"A1802F6E",
|
||||||
|
"umlal2 v1.8h, v5.16b, v15.16b",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::VecRRRLong {
|
||||||
|
alu_op: VecRRRLongOp::Smull16,
|
||||||
rd: writable_vreg(2),
|
rd: writable_vreg(2),
|
||||||
rn: vreg(13),
|
rn: vreg(13),
|
||||||
rm: vreg(6),
|
rm: vreg(6),
|
||||||
size: VectorSize::Size16x8,
|
high_half: true,
|
||||||
},
|
},
|
||||||
"A2C1664E",
|
"A2C1664E",
|
||||||
"smull2 v2.4s, v13.8h, v6.8h",
|
"smull2 v2.4s, v13.8h, v6.8h",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::VecRRR {
|
Inst::VecRRRLong {
|
||||||
alu_op: VecALUOp::Smull2,
|
alu_op: VecRRRLongOp::Umull16,
|
||||||
|
rd: writable_vreg(19),
|
||||||
|
rn: vreg(18),
|
||||||
|
rm: vreg(17),
|
||||||
|
high_half: true,
|
||||||
|
},
|
||||||
|
"53C2716E",
|
||||||
|
"umull2 v19.4s, v18.8h, v17.8h",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::VecRRRLong {
|
||||||
|
alu_op: VecRRRLongOp::Umlal16,
|
||||||
|
rd: writable_vreg(11),
|
||||||
|
rn: vreg(10),
|
||||||
|
rm: vreg(12),
|
||||||
|
high_half: true,
|
||||||
|
},
|
||||||
|
"4B816C6E",
|
||||||
|
"umlal2 v11.4s, v10.8h, v12.8h",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::VecRRRLong {
|
||||||
|
alu_op: VecRRRLongOp::Smull32,
|
||||||
rd: writable_vreg(8),
|
rd: writable_vreg(8),
|
||||||
rn: vreg(12),
|
rn: vreg(12),
|
||||||
rm: vreg(14),
|
rm: vreg(14),
|
||||||
size: VectorSize::Size32x4,
|
high_half: true,
|
||||||
},
|
},
|
||||||
"88C1AE4E",
|
"88C1AE4E",
|
||||||
"smull2 v8.2d, v12.4s, v14.4s",
|
"smull2 v8.2d, v12.4s, v14.4s",
|
||||||
));
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::VecRRRLong {
|
||||||
|
alu_op: VecRRRLongOp::Umull32,
|
||||||
|
rd: writable_vreg(4),
|
||||||
|
rn: vreg(12),
|
||||||
|
rm: vreg(16),
|
||||||
|
high_half: true,
|
||||||
|
},
|
||||||
|
"84C1B06E",
|
||||||
|
"umull2 v4.2d, v12.4s, v16.4s",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::VecRRRLong {
|
||||||
|
alu_op: VecRRRLongOp::Umlal32,
|
||||||
|
rd: writable_vreg(10),
|
||||||
|
rn: vreg(29),
|
||||||
|
rm: vreg(2),
|
||||||
|
high_half: true,
|
||||||
|
},
|
||||||
|
"AA83A26E",
|
||||||
|
"umlal2 v10.2d, v29.4s, v2.4s",
|
||||||
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::VecRRR {
|
Inst::VecRRR {
|
||||||
alu_op: VecALUOp::Sqrdmulh,
|
alu_op: VecALUOp::Sqrdmulh,
|
||||||
|
|||||||
@@ -303,14 +303,8 @@ pub enum VecALUOp {
|
|||||||
Fmul,
|
Fmul,
|
||||||
/// Add pairwise
|
/// Add pairwise
|
||||||
Addp,
|
Addp,
|
||||||
/// Unsigned multiply add long
|
|
||||||
Umlal,
|
|
||||||
/// Zip vectors (primary) [meaning, high halves]
|
/// Zip vectors (primary) [meaning, high halves]
|
||||||
Zip1,
|
Zip1,
|
||||||
/// Signed multiply long (low halves)
|
|
||||||
Smull,
|
|
||||||
/// Signed multiply long (high halves)
|
|
||||||
Smull2,
|
|
||||||
/// Signed saturating rounding doubling multiply returning high half
|
/// Signed saturating rounding doubling multiply returning high half
|
||||||
Sqrdmulh,
|
Sqrdmulh,
|
||||||
}
|
}
|
||||||
@@ -402,6 +396,22 @@ pub enum VecRRNarrowOp {
|
|||||||
Fcvtn64,
|
Fcvtn64,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
|
||||||
|
pub enum VecRRRLongOp {
|
||||||
|
/// Signed multiply long.
|
||||||
|
Smull8,
|
||||||
|
Smull16,
|
||||||
|
Smull32,
|
||||||
|
/// Unsigned multiply long.
|
||||||
|
Umull8,
|
||||||
|
Umull16,
|
||||||
|
Umull32,
|
||||||
|
/// Unsigned multiply add long
|
||||||
|
Umlal8,
|
||||||
|
Umlal16,
|
||||||
|
Umlal32,
|
||||||
|
}
|
||||||
|
|
||||||
/// A vector operation on a pair of elements with one register.
|
/// A vector operation on a pair of elements with one register.
|
||||||
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
|
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
|
||||||
pub enum VecPairOp {
|
pub enum VecPairOp {
|
||||||
@@ -1087,6 +1097,16 @@ pub enum Inst {
|
|||||||
rn: Reg,
|
rn: Reg,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/// 2-operand vector instruction that produces a result with twice the
|
||||||
|
/// lane width and half the number of lanes.
|
||||||
|
VecRRRLong {
|
||||||
|
alu_op: VecRRRLongOp,
|
||||||
|
rd: Writable<Reg>,
|
||||||
|
rn: Reg,
|
||||||
|
rm: Reg,
|
||||||
|
high_half: bool,
|
||||||
|
},
|
||||||
|
|
||||||
/// A vector ALU op.
|
/// A vector ALU op.
|
||||||
VecRRR {
|
VecRRR {
|
||||||
alu_op: VecALUOp,
|
alu_op: VecALUOp,
|
||||||
@@ -2134,10 +2154,22 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
|
|||||||
collector.add_def(rd);
|
collector.add_def(rd);
|
||||||
collector.add_use(rn);
|
collector.add_use(rn);
|
||||||
}
|
}
|
||||||
|
&Inst::VecRRRLong {
|
||||||
|
alu_op, rd, rn, rm, ..
|
||||||
|
} => {
|
||||||
|
match alu_op {
|
||||||
|
VecRRRLongOp::Umlal8 | VecRRRLongOp::Umlal16 | VecRRRLongOp::Umlal32 => {
|
||||||
|
collector.add_mod(rd)
|
||||||
|
}
|
||||||
|
_ => collector.add_def(rd),
|
||||||
|
};
|
||||||
|
collector.add_use(rn);
|
||||||
|
collector.add_use(rm);
|
||||||
|
}
|
||||||
&Inst::VecRRR {
|
&Inst::VecRRR {
|
||||||
alu_op, rd, rn, rm, ..
|
alu_op, rd, rn, rm, ..
|
||||||
} => {
|
} => {
|
||||||
if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal {
|
if alu_op == VecALUOp::Bsl {
|
||||||
collector.add_mod(rd);
|
collector.add_mod(rd);
|
||||||
} else {
|
} else {
|
||||||
collector.add_def(rd);
|
collector.add_def(rd);
|
||||||
@@ -2944,6 +2976,22 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
|
|||||||
map_def(mapper, rd);
|
map_def(mapper, rd);
|
||||||
map_use(mapper, rn);
|
map_use(mapper, rn);
|
||||||
}
|
}
|
||||||
|
&mut Inst::VecRRRLong {
|
||||||
|
alu_op,
|
||||||
|
ref mut rd,
|
||||||
|
ref mut rn,
|
||||||
|
ref mut rm,
|
||||||
|
..
|
||||||
|
} => {
|
||||||
|
match alu_op {
|
||||||
|
VecRRRLongOp::Umlal8 | VecRRRLongOp::Umlal16 | VecRRRLongOp::Umlal32 => {
|
||||||
|
map_mod(mapper, rd)
|
||||||
|
}
|
||||||
|
_ => map_def(mapper, rd),
|
||||||
|
};
|
||||||
|
map_use(mapper, rn);
|
||||||
|
map_use(mapper, rm);
|
||||||
|
}
|
||||||
&mut Inst::VecRRR {
|
&mut Inst::VecRRR {
|
||||||
alu_op,
|
alu_op,
|
||||||
ref mut rd,
|
ref mut rd,
|
||||||
@@ -2951,7 +2999,7 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
|
|||||||
ref mut rm,
|
ref mut rm,
|
||||||
..
|
..
|
||||||
} => {
|
} => {
|
||||||
if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Umlal {
|
if alu_op == VecALUOp::Bsl {
|
||||||
map_mod(mapper, rd);
|
map_mod(mapper, rd);
|
||||||
} else {
|
} else {
|
||||||
map_def(mapper, rd);
|
map_def(mapper, rd);
|
||||||
@@ -4147,24 +4195,80 @@ impl Inst {
|
|||||||
VecALUOp::Fmin => ("fmin", size),
|
VecALUOp::Fmin => ("fmin", size),
|
||||||
VecALUOp::Fmul => ("fmul", size),
|
VecALUOp::Fmul => ("fmul", size),
|
||||||
VecALUOp::Addp => ("addp", size),
|
VecALUOp::Addp => ("addp", size),
|
||||||
VecALUOp::Umlal => ("umlal", size),
|
|
||||||
VecALUOp::Zip1 => ("zip1", size),
|
VecALUOp::Zip1 => ("zip1", size),
|
||||||
VecALUOp::Smull => ("smull", size),
|
|
||||||
VecALUOp::Smull2 => ("smull2", size),
|
|
||||||
VecALUOp::Sqrdmulh => ("sqrdmulh", size),
|
VecALUOp::Sqrdmulh => ("sqrdmulh", size),
|
||||||
};
|
};
|
||||||
let rd_size = match alu_op {
|
let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
|
||||||
VecALUOp::Umlal | VecALUOp::Smull | VecALUOp::Smull2 => size.widen(),
|
let rn = show_vreg_vector(rn, mb_rru, size);
|
||||||
_ => size,
|
let rm = show_vreg_vector(rm, mb_rru, size);
|
||||||
|
format!("{} {}, {}, {}", op, rd, rn, rm)
|
||||||
|
}
|
||||||
|
&Inst::VecRRRLong {
|
||||||
|
rd,
|
||||||
|
rn,
|
||||||
|
rm,
|
||||||
|
alu_op,
|
||||||
|
high_half,
|
||||||
|
} => {
|
||||||
|
let (op, dest_size, src_size) = match (alu_op, high_half) {
|
||||||
|
(VecRRRLongOp::Smull8, false) => {
|
||||||
|
("smull", VectorSize::Size16x8, VectorSize::Size8x8)
|
||||||
|
}
|
||||||
|
(VecRRRLongOp::Smull8, true) => {
|
||||||
|
("smull2", VectorSize::Size16x8, VectorSize::Size8x16)
|
||||||
|
}
|
||||||
|
(VecRRRLongOp::Smull16, false) => {
|
||||||
|
("smull", VectorSize::Size32x4, VectorSize::Size16x4)
|
||||||
|
}
|
||||||
|
(VecRRRLongOp::Smull16, true) => {
|
||||||
|
("smull2", VectorSize::Size32x4, VectorSize::Size16x8)
|
||||||
|
}
|
||||||
|
(VecRRRLongOp::Smull32, false) => {
|
||||||
|
("smull", VectorSize::Size64x2, VectorSize::Size32x2)
|
||||||
|
}
|
||||||
|
(VecRRRLongOp::Smull32, true) => {
|
||||||
|
("smull2", VectorSize::Size64x2, VectorSize::Size32x4)
|
||||||
|
}
|
||||||
|
(VecRRRLongOp::Umull8, false) => {
|
||||||
|
("umull", VectorSize::Size16x8, VectorSize::Size8x8)
|
||||||
|
}
|
||||||
|
(VecRRRLongOp::Umull8, true) => {
|
||||||
|
("umull2", VectorSize::Size16x8, VectorSize::Size8x16)
|
||||||
|
}
|
||||||
|
(VecRRRLongOp::Umull16, false) => {
|
||||||
|
("umull", VectorSize::Size32x4, VectorSize::Size16x4)
|
||||||
|
}
|
||||||
|
(VecRRRLongOp::Umull16, true) => {
|
||||||
|
("umull2", VectorSize::Size32x4, VectorSize::Size16x8)
|
||||||
|
}
|
||||||
|
(VecRRRLongOp::Umull32, false) => {
|
||||||
|
("umull", VectorSize::Size64x2, VectorSize::Size32x2)
|
||||||
|
}
|
||||||
|
(VecRRRLongOp::Umull32, true) => {
|
||||||
|
("umull2", VectorSize::Size64x2, VectorSize::Size32x4)
|
||||||
|
}
|
||||||
|
(VecRRRLongOp::Umlal8, false) => {
|
||||||
|
("umlal", VectorSize::Size16x8, VectorSize::Size8x8)
|
||||||
|
}
|
||||||
|
(VecRRRLongOp::Umlal8, true) => {
|
||||||
|
("umlal2", VectorSize::Size16x8, VectorSize::Size8x16)
|
||||||
|
}
|
||||||
|
(VecRRRLongOp::Umlal16, false) => {
|
||||||
|
("umlal", VectorSize::Size32x4, VectorSize::Size16x4)
|
||||||
|
}
|
||||||
|
(VecRRRLongOp::Umlal16, true) => {
|
||||||
|
("umlal2", VectorSize::Size32x4, VectorSize::Size16x8)
|
||||||
|
}
|
||||||
|
(VecRRRLongOp::Umlal32, false) => {
|
||||||
|
("umlal", VectorSize::Size64x2, VectorSize::Size32x2)
|
||||||
|
}
|
||||||
|
(VecRRRLongOp::Umlal32, true) => {
|
||||||
|
("umlal2", VectorSize::Size64x2, VectorSize::Size32x4)
|
||||||
|
}
|
||||||
};
|
};
|
||||||
let rn_size = match alu_op {
|
let rd = show_vreg_vector(rd.to_reg(), mb_rru, dest_size);
|
||||||
VecALUOp::Smull => size.halve(),
|
let rn = show_vreg_vector(rn, mb_rru, src_size);
|
||||||
_ => size,
|
let rm = show_vreg_vector(rm, mb_rru, src_size);
|
||||||
};
|
|
||||||
let rm_size = rn_size;
|
|
||||||
let rd = show_vreg_vector(rd.to_reg(), mb_rru, rd_size);
|
|
||||||
let rn = show_vreg_vector(rn, mb_rru, rn_size);
|
|
||||||
let rm = show_vreg_vector(rm, mb_rru, rm_size);
|
|
||||||
format!("{} {}, {}, {}", op, rd, rn, rm)
|
format!("{} {}, {}, {}", op, rd, rn, rm)
|
||||||
}
|
}
|
||||||
&Inst::VecMisc { op, rd, rn, size } => {
|
&Inst::VecMisc { op, rd, rn, size } => {
|
||||||
|
|||||||
@@ -1253,6 +1253,153 @@ pub(crate) fn maybe_input_insn_via_conv<C: LowerCtx<I = Inst>>(
|
|||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Pattern match an extending vector multiplication.
|
||||||
|
/// Returns a tuple of the opcode to use, the two input registers and whether
|
||||||
|
/// it's the 'high half' version of the instruction.
|
||||||
|
pub(crate) fn match_vec_long_mul<C: LowerCtx<I = Inst>>(
|
||||||
|
c: &mut C,
|
||||||
|
insn: IRInst,
|
||||||
|
ext_op: Opcode,
|
||||||
|
) -> Option<(VecRRRLongOp, regalloc::Reg, regalloc::Reg, bool)> {
|
||||||
|
let inputs = insn_inputs(c, insn);
|
||||||
|
if let Some(lhs) = maybe_input_insn(c, inputs[0], ext_op) {
|
||||||
|
if let Some(rhs) = maybe_input_insn(c, inputs[1], ext_op) {
|
||||||
|
let lhs_input = insn_inputs(c, lhs)[0];
|
||||||
|
let rhs_input = insn_inputs(c, rhs)[0];
|
||||||
|
let rn = put_input_in_reg(c, lhs_input, NarrowValueMode::None);
|
||||||
|
let rm = put_input_in_reg(c, rhs_input, NarrowValueMode::None);
|
||||||
|
let lane_type = c.output_ty(insn, 0).lane_type();
|
||||||
|
match (lane_type, ext_op) {
|
||||||
|
(I16, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull8, rn, rm, false)),
|
||||||
|
(I16, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull8, rn, rm, true)),
|
||||||
|
(I16, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull8, rn, rm, false)),
|
||||||
|
(I16, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull8, rn, rm, true)),
|
||||||
|
(I32, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull16, rn, rm, false)),
|
||||||
|
(I32, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull16, rn, rm, true)),
|
||||||
|
(I32, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull16, rn, rm, false)),
|
||||||
|
(I32, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull16, rn, rm, true)),
|
||||||
|
(I64, Opcode::SwidenLow) => return Some((VecRRRLongOp::Smull32, rn, rm, false)),
|
||||||
|
(I64, Opcode::SwidenHigh) => return Some((VecRRRLongOp::Smull32, rn, rm, true)),
|
||||||
|
(I64, Opcode::UwidenLow) => return Some((VecRRRLongOp::Umull32, rn, rm, false)),
|
||||||
|
(I64, Opcode::UwidenHigh) => return Some((VecRRRLongOp::Umull32, rn, rm, true)),
|
||||||
|
_ => {}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn lower_i64x2_mul<C: LowerCtx<I = Inst>>(c: &mut C, insn: IRInst) {
|
||||||
|
let inputs = insn_inputs(c, insn);
|
||||||
|
let outputs = insn_outputs(c, insn);
|
||||||
|
let rd = get_output_reg(c, outputs[0]).regs()[0];
|
||||||
|
let rn = put_input_in_regs(c, inputs[0]).regs()[0];
|
||||||
|
let rm = put_input_in_regs(c, inputs[1]).regs()[0];
|
||||||
|
|
||||||
|
let tmp1 = c.alloc_tmp(I64X2).only_reg().unwrap();
|
||||||
|
let tmp2 = c.alloc_tmp(I64X2).only_reg().unwrap();
|
||||||
|
|
||||||
|
// This I64X2 multiplication is performed with several 32-bit
|
||||||
|
// operations.
|
||||||
|
|
||||||
|
// 64-bit numbers x and y, can be represented as:
|
||||||
|
// x = a + 2^32(b)
|
||||||
|
// y = c + 2^32(d)
|
||||||
|
|
||||||
|
// A 64-bit multiplication is:
|
||||||
|
// x * y = ac + 2^32(ad + bc) + 2^64(bd)
|
||||||
|
// note: `2^64(bd)` can be ignored, the value is too large to fit in
|
||||||
|
// 64 bits.
|
||||||
|
|
||||||
|
// This sequence implements a I64X2 multiply, where the registers
|
||||||
|
// `rn` and `rm` are split up into 32-bit components:
|
||||||
|
// rn = |d|c|b|a|
|
||||||
|
// rm = |h|g|f|e|
|
||||||
|
//
|
||||||
|
// rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
|
||||||
|
//
|
||||||
|
// The sequence is:
|
||||||
|
// rev64 rd.4s, rm.4s
|
||||||
|
// mul rd.4s, rd.4s, rn.4s
|
||||||
|
// xtn tmp1.2s, rn.2d
|
||||||
|
// addp rd.4s, rd.4s, rd.4s
|
||||||
|
// xtn tmp2.2s, rm.2d
|
||||||
|
// shll rd.2d, rd.2s, #32
|
||||||
|
// umlal rd.2d, tmp2.2s, tmp1.2s
|
||||||
|
|
||||||
|
// Reverse the 32-bit elements in the 64-bit words.
|
||||||
|
// rd = |g|h|e|f|
|
||||||
|
c.emit(Inst::VecMisc {
|
||||||
|
op: VecMisc2::Rev64,
|
||||||
|
rd,
|
||||||
|
rn: rm,
|
||||||
|
size: VectorSize::Size32x4,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Calculate the high half components.
|
||||||
|
// rd = |dg|ch|be|af|
|
||||||
|
//
|
||||||
|
// Note that this 32-bit multiply of the high half
|
||||||
|
// discards the bits that would overflow, same as
|
||||||
|
// if 64-bit operations were used. Also the Shll
|
||||||
|
// below would shift out the overflow bits anyway.
|
||||||
|
c.emit(Inst::VecRRR {
|
||||||
|
alu_op: VecALUOp::Mul,
|
||||||
|
rd,
|
||||||
|
rn: rd.to_reg(),
|
||||||
|
rm: rn,
|
||||||
|
size: VectorSize::Size32x4,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Extract the low half components of rn.
|
||||||
|
// tmp1 = |c|a|
|
||||||
|
c.emit(Inst::VecRRNarrow {
|
||||||
|
op: VecRRNarrowOp::Xtn64,
|
||||||
|
rd: tmp1,
|
||||||
|
rn,
|
||||||
|
high_half: false,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Sum the respective high half components.
|
||||||
|
// rd = |dg+ch|be+af||dg+ch|be+af|
|
||||||
|
c.emit(Inst::VecRRR {
|
||||||
|
alu_op: VecALUOp::Addp,
|
||||||
|
rd: rd,
|
||||||
|
rn: rd.to_reg(),
|
||||||
|
rm: rd.to_reg(),
|
||||||
|
size: VectorSize::Size32x4,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Extract the low half components of rm.
|
||||||
|
// tmp2 = |g|e|
|
||||||
|
c.emit(Inst::VecRRNarrow {
|
||||||
|
op: VecRRNarrowOp::Xtn64,
|
||||||
|
rd: tmp2,
|
||||||
|
rn: rm,
|
||||||
|
high_half: false,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Shift the high half components, into the high half.
|
||||||
|
// rd = |dg+ch << 32|be+af << 32|
|
||||||
|
c.emit(Inst::VecRRLong {
|
||||||
|
op: VecRRLongOp::Shll32,
|
||||||
|
rd,
|
||||||
|
rn: rd.to_reg(),
|
||||||
|
high_half: false,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Multiply the low components together, and accumulate with the high
|
||||||
|
// half.
|
||||||
|
// rd = |rd[1] + cg|rd[0] + ae|
|
||||||
|
c.emit(Inst::VecRRRLong {
|
||||||
|
alu_op: VecRRRLongOp::Umlal32,
|
||||||
|
rd,
|
||||||
|
rn: tmp2.to_reg(),
|
||||||
|
rm: tmp1.to_reg(),
|
||||||
|
high_half: false,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
/// Specifies what [lower_icmp] should do when lowering
|
/// Specifies what [lower_icmp] should do when lowering
|
||||||
#[derive(Debug, Clone, PartialEq)]
|
#[derive(Debug, Clone, PartialEq)]
|
||||||
pub(crate) enum IcmpOutput {
|
pub(crate) enum IcmpOutput {
|
||||||
|
|||||||
@@ -244,17 +244,11 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
}
|
}
|
||||||
|
|
||||||
Opcode::Imul => {
|
Opcode::Imul => {
|
||||||
|
let ty = ty.unwrap();
|
||||||
|
if ty == I128 {
|
||||||
let lhs = put_input_in_regs(ctx, inputs[0]);
|
let lhs = put_input_in_regs(ctx, inputs[0]);
|
||||||
let rhs = put_input_in_regs(ctx, inputs[1]);
|
let rhs = put_input_in_regs(ctx, inputs[1]);
|
||||||
let dst = get_output_reg(ctx, outputs[0]);
|
let dst = get_output_reg(ctx, outputs[0]);
|
||||||
|
|
||||||
let rd = dst.regs()[0];
|
|
||||||
let rn = lhs.regs()[0];
|
|
||||||
let rm = rhs.regs()[0];
|
|
||||||
|
|
||||||
let ty = ty.unwrap();
|
|
||||||
match ty {
|
|
||||||
I128 => {
|
|
||||||
assert_eq!(lhs.len(), 2);
|
assert_eq!(lhs.len(), 2);
|
||||||
assert_eq!(rhs.len(), 2);
|
assert_eq!(rhs.len(), 2);
|
||||||
assert_eq!(dst.len(), 2);
|
assert_eq!(dst.len(), 2);
|
||||||
@@ -296,122 +290,33 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
rm: rhs.regs()[0],
|
rm: rhs.regs()[0],
|
||||||
ra: zero_reg(),
|
ra: zero_reg(),
|
||||||
});
|
});
|
||||||
}
|
} else if ty.is_vector() {
|
||||||
ty if !ty.is_vector() => {
|
for ext_op in &[
|
||||||
let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
|
Opcode::SwidenLow,
|
||||||
ctx.emit(Inst::AluRRRR {
|
Opcode::SwidenHigh,
|
||||||
|
Opcode::UwidenLow,
|
||||||
|
Opcode::UwidenHigh,
|
||||||
|
] {
|
||||||
|
if let Some((alu_op, rn, rm, high_half)) =
|
||||||
|
match_vec_long_mul(ctx, insn, *ext_op)
|
||||||
|
{
|
||||||
|
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||||
|
ctx.emit(Inst::VecRRRLong {
|
||||||
alu_op,
|
alu_op,
|
||||||
rd,
|
rd,
|
||||||
rn,
|
rn,
|
||||||
rm,
|
rm,
|
||||||
ra: zero_reg(),
|
high_half,
|
||||||
});
|
});
|
||||||
|
return Ok(());
|
||||||
}
|
}
|
||||||
I64X2 => {
|
|
||||||
let tmp1 = ctx.alloc_tmp(I64X2).only_reg().unwrap();
|
|
||||||
let tmp2 = ctx.alloc_tmp(I64X2).only_reg().unwrap();
|
|
||||||
|
|
||||||
// This I64X2 multiplication is performed with several 32-bit
|
|
||||||
// operations.
|
|
||||||
|
|
||||||
// 64-bit numbers x and y, can be represented as:
|
|
||||||
// x = a + 2^32(b)
|
|
||||||
// y = c + 2^32(d)
|
|
||||||
|
|
||||||
// A 64-bit multiplication is:
|
|
||||||
// x * y = ac + 2^32(ad + bc) + 2^64(bd)
|
|
||||||
// note: `2^64(bd)` can be ignored, the value is too large to fit in
|
|
||||||
// 64 bits.
|
|
||||||
|
|
||||||
// This sequence implements a I64X2 multiply, where the registers
|
|
||||||
// `rn` and `rm` are split up into 32-bit components:
|
|
||||||
// rn = |d|c|b|a|
|
|
||||||
// rm = |h|g|f|e|
|
|
||||||
//
|
|
||||||
// rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
|
|
||||||
//
|
|
||||||
// The sequence is:
|
|
||||||
// rev64 rd.4s, rm.4s
|
|
||||||
// mul rd.4s, rd.4s, rn.4s
|
|
||||||
// xtn tmp1.2s, rn.2d
|
|
||||||
// addp rd.4s, rd.4s, rd.4s
|
|
||||||
// xtn tmp2.2s, rm.2d
|
|
||||||
// shll rd.2d, rd.2s, #32
|
|
||||||
// umlal rd.2d, tmp2.2s, tmp1.2s
|
|
||||||
|
|
||||||
// Reverse the 32-bit elements in the 64-bit words.
|
|
||||||
// rd = |g|h|e|f|
|
|
||||||
ctx.emit(Inst::VecMisc {
|
|
||||||
op: VecMisc2::Rev64,
|
|
||||||
rd,
|
|
||||||
rn: rm,
|
|
||||||
size: VectorSize::Size32x4,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Calculate the high half components.
|
|
||||||
// rd = |dg|ch|be|af|
|
|
||||||
//
|
|
||||||
// Note that this 32-bit multiply of the high half
|
|
||||||
// discards the bits that would overflow, same as
|
|
||||||
// if 64-bit operations were used. Also the Shll
|
|
||||||
// below would shift out the overflow bits anyway.
|
|
||||||
ctx.emit(Inst::VecRRR {
|
|
||||||
alu_op: VecALUOp::Mul,
|
|
||||||
rd,
|
|
||||||
rn: rd.to_reg(),
|
|
||||||
rm: rn,
|
|
||||||
size: VectorSize::Size32x4,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Extract the low half components of rn.
|
|
||||||
// tmp1 = |c|a|
|
|
||||||
ctx.emit(Inst::VecRRNarrow {
|
|
||||||
op: VecRRNarrowOp::Xtn64,
|
|
||||||
rd: tmp1,
|
|
||||||
rn,
|
|
||||||
high_half: false,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Sum the respective high half components.
|
|
||||||
// rd = |dg+ch|be+af||dg+ch|be+af|
|
|
||||||
ctx.emit(Inst::VecRRR {
|
|
||||||
alu_op: VecALUOp::Addp,
|
|
||||||
rd: rd,
|
|
||||||
rn: rd.to_reg(),
|
|
||||||
rm: rd.to_reg(),
|
|
||||||
size: VectorSize::Size32x4,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Extract the low half components of rm.
|
|
||||||
// tmp2 = |g|e|
|
|
||||||
ctx.emit(Inst::VecRRNarrow {
|
|
||||||
op: VecRRNarrowOp::Xtn64,
|
|
||||||
rd: tmp2,
|
|
||||||
rn: rm,
|
|
||||||
high_half: false,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Shift the high half components, into the high half.
|
|
||||||
// rd = |dg+ch << 32|be+af << 32|
|
|
||||||
ctx.emit(Inst::VecRRLong {
|
|
||||||
op: VecRRLongOp::Shll32,
|
|
||||||
rd,
|
|
||||||
rn: rd.to_reg(),
|
|
||||||
high_half: false,
|
|
||||||
});
|
|
||||||
|
|
||||||
// Multiply the low components together, and accumulate with the high
|
|
||||||
// half.
|
|
||||||
// rd = |rd[1] + cg|rd[0] + ae|
|
|
||||||
ctx.emit(Inst::VecRRR {
|
|
||||||
alu_op: VecALUOp::Umlal,
|
|
||||||
rd,
|
|
||||||
rn: tmp2.to_reg(),
|
|
||||||
rm: tmp1.to_reg(),
|
|
||||||
size: VectorSize::Size32x2,
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
ty if ty.is_vector() => {
|
if ty == I64X2 {
|
||||||
|
lower_i64x2_mul(ctx, insn);
|
||||||
|
} else {
|
||||||
|
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
|
||||||
|
let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
|
||||||
|
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||||
ctx.emit(Inst::VecRRR {
|
ctx.emit(Inst::VecRRR {
|
||||||
alu_op: VecALUOp::Mul,
|
alu_op: VecALUOp::Mul,
|
||||||
rd,
|
rd,
|
||||||
@@ -420,7 +325,18 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
size: VectorSize::from_ty(ty),
|
size: VectorSize::from_ty(ty),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
_ => panic!("Unable to emit mul for {}", ty),
|
} else {
|
||||||
|
let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
|
||||||
|
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
|
||||||
|
let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
|
||||||
|
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||||
|
ctx.emit(Inst::AluRRRR {
|
||||||
|
alu_op,
|
||||||
|
rd,
|
||||||
|
rn,
|
||||||
|
rm,
|
||||||
|
ra: zero_reg(),
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2740,19 +2656,19 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
// => smull tmp, a, b
|
// => smull tmp, a, b
|
||||||
// smull2 y, a, b
|
// smull2 y, a, b
|
||||||
// addp y, tmp, y
|
// addp y, tmp, y
|
||||||
ctx.emit(Inst::VecRRR {
|
ctx.emit(Inst::VecRRRLong {
|
||||||
alu_op: VecALUOp::Smull,
|
alu_op: VecRRRLongOp::Smull16,
|
||||||
rd: tmp,
|
rd: tmp,
|
||||||
rn: r_a,
|
rn: r_a,
|
||||||
rm: r_b,
|
rm: r_b,
|
||||||
size: VectorSize::Size16x8,
|
high_half: false,
|
||||||
});
|
});
|
||||||
ctx.emit(Inst::VecRRR {
|
ctx.emit(Inst::VecRRRLong {
|
||||||
alu_op: VecALUOp::Smull2,
|
alu_op: VecRRRLongOp::Smull16,
|
||||||
rd: r_y,
|
rd: r_y,
|
||||||
rn: r_a,
|
rn: r_a,
|
||||||
rm: r_b,
|
rm: r_b,
|
||||||
size: VectorSize::Size16x8,
|
high_half: true,
|
||||||
});
|
});
|
||||||
ctx.emit(Inst::VecRRR {
|
ctx.emit(Inst::VecRRR {
|
||||||
alu_op: VecALUOp::Addp,
|
alu_op: VecALUOp::Addp,
|
||||||
|
|||||||
159
cranelift/filetests/filetests/isa/aarch64/simd-extmul.clif
Normal file
159
cranelift/filetests/filetests/isa/aarch64/simd-extmul.clif
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
test compile
|
||||||
|
set unwind_info=false
|
||||||
|
target aarch64
|
||||||
|
|
||||||
|
function %fn1(i8x16, i8x16) -> i16x8 {
|
||||||
|
block0(v0: i8x16, v1: i8x16):
|
||||||
|
v2 = swiden_low v0
|
||||||
|
v3 = swiden_low v1
|
||||||
|
v4 = imul v2, v3
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
|
||||||
|
; check-not: sxtl
|
||||||
|
; check: smull v0.8h, v0.8b, v1.8b
|
||||||
|
; nextln: ldp fp, lr, [sp], #16
|
||||||
|
; nextln: ret
|
||||||
|
|
||||||
|
function %fn2(i8x16, i8x16) -> i16x8 {
|
||||||
|
block0(v0: i8x16, v1: i8x16):
|
||||||
|
v2 = swiden_high v0
|
||||||
|
v3 = swiden_high v1
|
||||||
|
v4 = imul v2, v3
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
|
||||||
|
; check-not: sxtl
|
||||||
|
; check: smull2 v0.8h, v0.16b, v1.16b
|
||||||
|
; nextln: ldp fp, lr, [sp], #16
|
||||||
|
; nextln: ret
|
||||||
|
|
||||||
|
function %fn3(i16x8, i16x8) -> i32x4 {
|
||||||
|
block0(v0: i16x8, v1: i16x8):
|
||||||
|
v2 = swiden_low v0
|
||||||
|
v3 = swiden_low v1
|
||||||
|
v4 = imul v2, v3
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
|
||||||
|
; check-not: sxtl
|
||||||
|
; check: smull v0.4s, v0.4h, v1.4h
|
||||||
|
; nextln: ldp fp, lr, [sp], #16
|
||||||
|
; nextln: ret
|
||||||
|
|
||||||
|
function %fn4(i16x8, i16x8) -> i32x4 {
|
||||||
|
block0(v0: i16x8, v1: i16x8):
|
||||||
|
v2 = swiden_high v0
|
||||||
|
v3 = swiden_high v1
|
||||||
|
v4 = imul v2, v3
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
|
||||||
|
; check-not: sxtl
|
||||||
|
; check: smull2 v0.4s, v0.8h, v1.8h
|
||||||
|
; nextln: ldp fp, lr, [sp], #16
|
||||||
|
; nextln: ret
|
||||||
|
|
||||||
|
function %fn5(i32x4, i32x4) -> i64x2 {
|
||||||
|
block0(v0: i32x4, v1: i32x4):
|
||||||
|
v2 = swiden_low v0
|
||||||
|
v3 = swiden_low v1
|
||||||
|
v4 = imul v2, v3
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
|
||||||
|
; check-not: sxtl
|
||||||
|
; check: smull v0.2d, v0.2s, v1.2s
|
||||||
|
; nextln: ldp fp, lr, [sp], #16
|
||||||
|
; nextln: ret
|
||||||
|
|
||||||
|
function %fn6(i32x4, i32x4) -> i64x2 {
|
||||||
|
block0(v0: i32x4, v1: i32x4):
|
||||||
|
v2 = swiden_high v0
|
||||||
|
v3 = swiden_high v1
|
||||||
|
v4 = imul v2, v3
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
|
||||||
|
; check-not: sxtl
|
||||||
|
; check: smull2 v0.2d, v0.4s, v1.4s
|
||||||
|
; nextln: ldp fp, lr, [sp], #16
|
||||||
|
; nextln: ret
|
||||||
|
|
||||||
|
function %fn7(i8x16, i8x16) -> i16x8 {
|
||||||
|
block0(v0: i8x16, v1: i8x16):
|
||||||
|
v2 = uwiden_low v0
|
||||||
|
v3 = uwiden_low v1
|
||||||
|
v4 = imul v2, v3
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
|
||||||
|
; check-not: uxtl
|
||||||
|
; check: umull v0.8h, v0.8b, v1.8b
|
||||||
|
; nextln: ldp fp, lr, [sp], #16
|
||||||
|
; nextln: ret
|
||||||
|
|
||||||
|
function %fn8(i8x16, i8x16) -> i16x8 {
|
||||||
|
block0(v0: i8x16, v1: i8x16):
|
||||||
|
v2 = uwiden_high v0
|
||||||
|
v3 = uwiden_high v1
|
||||||
|
v4 = imul v2, v3
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
|
||||||
|
; check-not: uxtl
|
||||||
|
; check: umull2 v0.8h, v0.16b, v1.16b
|
||||||
|
; nextln: ldp fp, lr, [sp], #16
|
||||||
|
; nextln: ret
|
||||||
|
|
||||||
|
function %fn9(i16x8, i16x8) -> i32x4 {
|
||||||
|
block0(v0: i16x8, v1: i16x8):
|
||||||
|
v2 = uwiden_low v0
|
||||||
|
v3 = uwiden_low v1
|
||||||
|
v4 = imul v2, v3
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
|
||||||
|
; check-not: uxtl
|
||||||
|
; check: umull v0.4s, v0.4h, v1.4h
|
||||||
|
; nextln: ldp fp, lr, [sp], #16
|
||||||
|
; nextln: ret
|
||||||
|
|
||||||
|
function %fn10(i16x8, i16x8) -> i32x4 {
|
||||||
|
block0(v0: i16x8, v1: i16x8):
|
||||||
|
v2 = uwiden_high v0
|
||||||
|
v3 = uwiden_high v1
|
||||||
|
v4 = imul v2, v3
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
|
||||||
|
; check-not: uxtl
|
||||||
|
; check: umull2 v0.4s, v0.8h, v1.8h
|
||||||
|
; nextln: ldp fp, lr, [sp], #16
|
||||||
|
; nextln: ret
|
||||||
|
|
||||||
|
function %fn11(i32x4, i32x4) -> i64x2 {
|
||||||
|
block0(v0: i32x4, v1: i32x4):
|
||||||
|
v2 = uwiden_low v0
|
||||||
|
v3 = uwiden_low v1
|
||||||
|
v4 = imul v2, v3
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
|
||||||
|
; check-not: uxtl
|
||||||
|
; check: umull v0.2d, v0.2s, v1.2s
|
||||||
|
; nextln: ldp fp, lr, [sp], #16
|
||||||
|
; nextln: ret
|
||||||
|
|
||||||
|
function %fn12(i32x4, i32x4) -> i64x2 {
|
||||||
|
block0(v0: i32x4, v1: i32x4):
|
||||||
|
v2 = uwiden_high v0
|
||||||
|
v3 = uwiden_high v1
|
||||||
|
v4 = imul v2, v3
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
|
||||||
|
; check-not: uxtl2
|
||||||
|
; check: umull2 v0.2d, v0.4s, v1.4s
|
||||||
|
; nextln: ldp fp, lr, [sp], #16
|
||||||
|
; nextln: ret
|
||||||
@@ -1908,7 +1908,6 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
|
|||||||
}
|
}
|
||||||
Operator::I16x8Q15MulrSatS => {
|
Operator::I16x8Q15MulrSatS => {
|
||||||
let (a, b) = pop2_with_bitcast(state, I16X8, builder);
|
let (a, b) = pop2_with_bitcast(state, I16X8, builder);
|
||||||
|
|
||||||
state.push1(builder.ins().sqmul_round_sat(a, b))
|
state.push1(builder.ins().sqmul_round_sat(a, b))
|
||||||
}
|
}
|
||||||
Operator::I16x8ExtMulLowI8x16S => {
|
Operator::I16x8ExtMulLowI8x16S => {
|
||||||
|
|||||||
Reference in New Issue
Block a user