Enable simd_X_extadd_pairwise_X for AArch64
Lower to [u|s]addlp for AArch64. Copyright (c) 2021, Arm Limited.
This commit is contained in:
11
build.rs
11
build.rs
@@ -202,13 +202,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
|
||||
("simd", _) if cfg!(feature = "old-x86-backend") => return true,
|
||||
// No simd support yet for s390x.
|
||||
("simd", _) if platform_is_s390x() => return true,
|
||||
// These are new instructions that are only known to be supported for x64.
|
||||
("simd", "simd_i16x8_extadd_pairwise_i8x16")
|
||||
| ("simd", "simd_i32x4_extadd_pairwise_i16x8")
|
||||
if !platform_is_x64() =>
|
||||
{
|
||||
return true
|
||||
}
|
||||
_ => {}
|
||||
},
|
||||
_ => panic!("unrecognized strategy"),
|
||||
@@ -217,10 +210,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// Reports whether the build is targeting the `x86_64` architecture.
///
/// The answer comes from the `CARGO_CFG_TARGET_ARCH` environment variable.
///
/// # Panics
///
/// Panics if `CARGO_CFG_TARGET_ARCH` is unset or is not valid Unicode.
fn platform_is_x64() -> bool {
    // A bare `unwrap()` would panic without naming the variable that is
    // missing; spell out the invariant so a failure is self-explanatory.
    let target_arch = env::var("CARGO_CFG_TARGET_ARCH")
        .expect("CARGO_CFG_TARGET_ARCH must be set and valid Unicode");
    target_arch == "x86_64"
}
|
||||
|
||||
/// Reports whether the build is targeting the `s390x` architecture.
///
/// The answer comes from the `CARGO_CFG_TARGET_ARCH` environment variable.
///
/// # Panics
///
/// Panics if `CARGO_CFG_TARGET_ARCH` is unset or is not valid Unicode.
fn platform_is_s390x() -> bool {
    // A bare `unwrap()` would panic without naming the variable that is
    // missing; spell out the invariant so a failure is self-explanatory.
    let target_arch = env::var("CARGO_CFG_TARGET_ARCH")
        .expect("CARGO_CFG_TARGET_ARCH must be set and valid Unicode");
    target_arch == "s390x"
}
|
||||
|
||||
@@ -459,6 +459,17 @@ fn enc_vec_rr_pair(bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
|
||||
| machreg_to_vec(rd.to_reg())
|
||||
}
|
||||
|
||||
fn enc_vec_rr_pair_long(u: u32, enc_size: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
|
||||
debug_assert_eq!(u & 0b1, u);
|
||||
debug_assert_eq!(enc_size & 0b1, enc_size);
|
||||
|
||||
0b0_1_0_01110_00_10000_00_0_10_10_00000_00000
|
||||
| u << 29
|
||||
| enc_size << 22
|
||||
| machreg_to_vec(rn) << 5
|
||||
| machreg_to_vec(rd.to_reg())
|
||||
}
|
||||
|
||||
fn enc_vec_lanes(q: u32, u: u32, size: u32, opcode: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
|
||||
debug_assert_eq!(q & 0b1, q);
|
||||
debug_assert_eq!(u & 0b1, u);
|
||||
@@ -2225,6 +2236,16 @@ impl MachInstEmit for Inst {
|
||||
rd,
|
||||
));
|
||||
}
|
||||
&Inst::VecRRPairLong { op, rd, rn } => {
|
||||
let (u, size) = match op {
|
||||
VecRRPairLongOp::Saddlp8 => (0b0, 0b0),
|
||||
VecRRPairLongOp::Uaddlp8 => (0b1, 0b0),
|
||||
VecRRPairLongOp::Saddlp16 => (0b0, 0b1),
|
||||
VecRRPairLongOp::Uaddlp16 => (0b1, 0b1),
|
||||
};
|
||||
|
||||
sink.put4(enc_vec_rr_pair_long(u, size, rd, rn));
|
||||
}
|
||||
&Inst::VecRRR {
|
||||
rd,
|
||||
rn,
|
||||
|
||||
@@ -2643,6 +2643,46 @@ fn test_aarch64_binemit() {
|
||||
"addp d0, v30.2d",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecRRPairLong {
|
||||
op: VecRRPairLongOp::Uaddlp8,
|
||||
rd: writable_vreg(0),
|
||||
rn: vreg(1),
|
||||
},
|
||||
"2028206E",
|
||||
"uaddlp v0.8h, v1.16b",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecRRPairLong {
|
||||
op: VecRRPairLongOp::Saddlp8,
|
||||
rd: writable_vreg(3),
|
||||
rn: vreg(11),
|
||||
},
|
||||
"6329204E",
|
||||
"saddlp v3.8h, v11.16b",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecRRPairLong {
|
||||
op: VecRRPairLongOp::Uaddlp16,
|
||||
rd: writable_vreg(14),
|
||||
rn: vreg(23),
|
||||
},
|
||||
"EE2A606E",
|
||||
"uaddlp v14.4s, v23.8h",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecRRPairLong {
|
||||
op: VecRRPairLongOp::Saddlp16,
|
||||
rd: writable_vreg(29),
|
||||
rn: vreg(0),
|
||||
},
|
||||
"1D28604E",
|
||||
"saddlp v29.4s, v0.8h",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecRRR {
|
||||
alu_op: VecALUOp::Sqadd,
|
||||
|
||||
@@ -419,6 +419,18 @@ pub enum VecPairOp {
|
||||
Addp,
|
||||
}
|
||||
|
||||
/// Operations for a single-source vector instruction that widens the elements
/// of its input register and combines them pairwise.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum VecRRPairLongOp {
    /// Sign extend and add pairs of elements: 8-bit input lanes, 16-bit
    /// output lanes.
    Saddlp8,
    /// Sign extend and add pairs of elements: 16-bit input lanes, 32-bit
    /// output lanes.
    Saddlp16,
    /// Unsigned extend and add pairs of elements: 8-bit input lanes, 16-bit
    /// output lanes.
    Uaddlp8,
    /// Unsigned extend and add pairs of elements: 16-bit input lanes, 32-bit
    /// output lanes.
    Uaddlp16,
}
|
||||
|
||||
/// An operation across the lanes of vectors.
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
|
||||
pub enum VecLanesOp {
|
||||
@@ -1107,6 +1119,15 @@ pub enum Inst {
|
||||
high_half: bool,
|
||||
},
|
||||
|
||||
/// 1-operand vector instruction that extends elements of the input
|
||||
/// register and operates on a pair of elements. The output lane width
|
||||
/// is double that of the input.
|
||||
VecRRPairLong {
|
||||
op: VecRRPairLongOp,
|
||||
rd: Writable<Reg>,
|
||||
rn: Reg,
|
||||
},
|
||||
|
||||
/// A vector ALU op.
|
||||
VecRRR {
|
||||
alu_op: VecALUOp,
|
||||
@@ -2166,6 +2187,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
|
||||
collector.add_use(rn);
|
||||
collector.add_use(rm);
|
||||
}
|
||||
&Inst::VecRRPairLong { rd, rn, .. } => {
|
||||
collector.add_def(rd);
|
||||
collector.add_use(rn);
|
||||
}
|
||||
&Inst::VecRRR {
|
||||
alu_op, rd, rn, rm, ..
|
||||
} => {
|
||||
@@ -2992,6 +3017,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
|
||||
map_use(mapper, rn);
|
||||
map_use(mapper, rm);
|
||||
}
|
||||
&mut Inst::VecRRPairLong {
|
||||
ref mut rd,
|
||||
ref mut rn,
|
||||
..
|
||||
} => {
|
||||
map_def(mapper, rd);
|
||||
map_use(mapper, rn);
|
||||
}
|
||||
&mut Inst::VecRRR {
|
||||
alu_op,
|
||||
ref mut rd,
|
||||
@@ -4152,6 +4185,26 @@ impl Inst {
|
||||
|
||||
format!("{} {}, {}", op, rd, rn)
|
||||
}
|
||||
&Inst::VecRRPairLong { op, rd, rn } => {
|
||||
let (op, dest, src) = match op {
|
||||
VecRRPairLongOp::Saddlp8 => {
|
||||
("saddlp", VectorSize::Size16x8, VectorSize::Size8x16)
|
||||
}
|
||||
VecRRPairLongOp::Saddlp16 => {
|
||||
("saddlp", VectorSize::Size32x4, VectorSize::Size16x8)
|
||||
}
|
||||
VecRRPairLongOp::Uaddlp8 => {
|
||||
("uaddlp", VectorSize::Size16x8, VectorSize::Size8x16)
|
||||
}
|
||||
VecRRPairLongOp::Uaddlp16 => {
|
||||
("uaddlp", VectorSize::Size32x4, VectorSize::Size16x8)
|
||||
}
|
||||
};
|
||||
let rd = show_vreg_vector(rd.to_reg(), mb_rru, dest);
|
||||
let rn = show_vreg_vector(rn, mb_rru, src);
|
||||
|
||||
format!("{} {}, {}", op, rd, rn)
|
||||
}
|
||||
&Inst::VecRRR {
|
||||
rd,
|
||||
rn,
|
||||
|
||||
@@ -2644,6 +2644,58 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
});
|
||||
}
|
||||
|
||||
Opcode::IaddPairwise => {
|
||||
let ty = ty.unwrap();
|
||||
let lane_type = ty.lane_type();
|
||||
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||
|
||||
let mut match_long_pair =
|
||||
|ext_low_op, ext_high_op| -> Option<(VecRRPairLongOp, regalloc::Reg)> {
|
||||
if let Some(lhs) = maybe_input_insn(ctx, inputs[0], ext_low_op) {
|
||||
if let Some(rhs) = maybe_input_insn(ctx, inputs[1], ext_high_op) {
|
||||
let lhs_inputs = insn_inputs(ctx, lhs);
|
||||
let rhs_inputs = insn_inputs(ctx, rhs);
|
||||
let low = put_input_in_reg(ctx, lhs_inputs[0], NarrowValueMode::None);
|
||||
let high = put_input_in_reg(ctx, rhs_inputs[0], NarrowValueMode::None);
|
||||
if low == high {
|
||||
match (lane_type, ext_low_op) {
|
||||
(I16, Opcode::SwidenLow) => {
|
||||
return Some((VecRRPairLongOp::Saddlp8, low))
|
||||
}
|
||||
(I32, Opcode::SwidenLow) => {
|
||||
return Some((VecRRPairLongOp::Saddlp16, low))
|
||||
}
|
||||
(I16, Opcode::UwidenLow) => {
|
||||
return Some((VecRRPairLongOp::Uaddlp8, low))
|
||||
}
|
||||
(I32, Opcode::UwidenLow) => {
|
||||
return Some((VecRRPairLongOp::Uaddlp16, low))
|
||||
}
|
||||
_ => (),
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
};
|
||||
|
||||
if let Some((op, rn)) = match_long_pair(Opcode::SwidenLow, Opcode::SwidenHigh) {
|
||||
ctx.emit(Inst::VecRRPairLong { op, rd, rn });
|
||||
} else if let Some((op, rn)) = match_long_pair(Opcode::UwidenLow, Opcode::UwidenHigh) {
|
||||
ctx.emit(Inst::VecRRPairLong { op, rd, rn });
|
||||
} else {
|
||||
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
|
||||
let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
|
||||
ctx.emit(Inst::VecRRR {
|
||||
alu_op: VecALUOp::Addp,
|
||||
rd: rd,
|
||||
rn: rn,
|
||||
rm: rm,
|
||||
size: VectorSize::from_ty(ty),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Opcode::WideningPairwiseDotProductS => {
|
||||
let r_y = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||
let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
|
||||
@@ -3519,7 +3571,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
});
|
||||
}
|
||||
|
||||
Opcode::IaddPairwise | Opcode::ConstAddr | Opcode::Vconcat | Opcode::Vsplit => {
|
||||
Opcode::ConstAddr | Opcode::Vconcat | Opcode::Vsplit => {
|
||||
unimplemented!("lowering {}", op)
|
||||
}
|
||||
}
|
||||
|
||||
124
cranelift/filetests/filetests/isa/aarch64/simd-pairwise-add.clif
Normal file
124
cranelift/filetests/filetests/isa/aarch64/simd-pairwise-add.clif
Normal file
@@ -0,0 +1,124 @@
|
||||
test compile
|
||||
set unwind_info=false
|
||||
target aarch64
|
||||
|
||||
|
||||
; Signed widening of both halves of one i8x16 value feeding iadd_pairwise; the expected output after this function is a single saddlp.
function %fn1(i8x16) -> i16x8 {
|
||||
block0(v0: i8x16):
|
||||
v1 = swiden_low v0
|
||||
v2 = swiden_high v0
|
||||
v3 = iadd_pairwise v1, v2
|
||||
return v3
|
||||
}
|
||||
|
||||
; check: stp fp
|
||||
; nextln: mov fp, sp
|
||||
; nextln: saddlp v0.8h, v0.16b
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
; Unsigned widening of both halves of one i8x16 value feeding iadd_pairwise; the expected output after this function is a single uaddlp.
function %fn2(i8x16) -> i16x8 {
|
||||
block0(v0: i8x16):
|
||||
v1 = uwiden_low v0
|
||||
v2 = uwiden_high v0
|
||||
v3 = iadd_pairwise v1, v2
|
||||
return v3
|
||||
}
|
||||
|
||||
; check: stp fp
|
||||
; nextln: mov fp, sp
|
||||
; nextln: uaddlp v0.8h, v0.16b
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
; Signed widening of both halves of one i16x8 value feeding iadd_pairwise; the expected output after this function is a single saddlp producing 32-bit lanes.
function %fn3(i16x8) -> i32x4 {
|
||||
block0(v0: i16x8):
|
||||
v1 = swiden_low v0
|
||||
v2 = swiden_high v0
|
||||
v3 = iadd_pairwise v1, v2
|
||||
return v3
|
||||
}
|
||||
|
||||
; check: stp fp
|
||||
; nextln: mov fp, sp
|
||||
; nextln: saddlp v0.4s, v0.8h
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
; Unsigned widening of both halves of one i16x8 value feeding iadd_pairwise; the expected output after this function is a single uaddlp producing 32-bit lanes.
function %fn4(i16x8) -> i32x4 {
|
||||
block0(v0: i16x8):
|
||||
v1 = uwiden_low v0
|
||||
v2 = uwiden_high v0
|
||||
v3 = iadd_pairwise v1, v2
|
||||
return v3
|
||||
}
|
||||
|
||||
; check: stp fp
|
||||
; nextln: mov fp, sp
|
||||
; nextln: uaddlp v0.4s, v0.8h
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
; The low and high halves are sign-extended from two different values (v0 and v1); the expected output after this function keeps separate sxtl/sxtl2 and addp instructions.
function %fn5(i8x16, i8x16) -> i16x8 {
|
||||
block0(v0: i8x16, v1: i8x16):
|
||||
v2 = swiden_low v0
|
||||
v3 = swiden_high v1
|
||||
v4 = iadd_pairwise v2, v3
|
||||
return v4
|
||||
}
|
||||
|
||||
; check: stp fp
|
||||
; nextln: mov fp, sp
|
||||
; nextln: sxtl v0.8h, v0.8b
|
||||
; nextln: sxtl2 v1.8h, v1.16b
|
||||
; nextln: addp v0.8h, v0.8h, v1.8h
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
; The low and high halves are zero-extended from two different values (v0 and v1); the expected output after this function keeps separate uxtl/uxtl2 and addp instructions.
function %fn6(i8x16, i8x16) -> i16x8 {
|
||||
block0(v0: i8x16, v1: i8x16):
|
||||
v2 = uwiden_low v0
|
||||
v3 = uwiden_high v1
|
||||
v4 = iadd_pairwise v2, v3
|
||||
return v4
|
||||
}
|
||||
|
||||
; check: stp fp
|
||||
; nextln: mov fp, sp
|
||||
; nextln: uxtl v0.8h, v0.8b
|
||||
; nextln: uxtl2 v1.8h, v1.16b
|
||||
; nextln: addp v0.8h, v0.8h, v1.8h
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
; Same source value but mixed signedness (unsigned low half, signed high half); the expected output after this function keeps separate uxtl/sxtl2 and addp instructions.
function %fn7(i8x16) -> i16x8 {
|
||||
block0(v0: i8x16):
|
||||
v1 = uwiden_low v0
|
||||
v2 = swiden_high v0
|
||||
v3 = iadd_pairwise v1, v2
|
||||
return v3
|
||||
}
|
||||
|
||||
; check: stp fp
|
||||
; nextln: mov fp, sp
|
||||
; nextln: uxtl v1.8h, v0.8b
|
||||
; nextln: sxtl2 v0.8h, v0.16b
|
||||
; nextln: addp v0.8h, v1.8h, v0.8h
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
; Same source value but mixed signedness (signed low half, unsigned high half); the expected output after this function keeps separate sxtl/uxtl2 and addp instructions.
function %fn8(i8x16) -> i16x8 {
|
||||
block0(v0: i8x16):
|
||||
v1 = swiden_low v0
|
||||
v2 = uwiden_high v0
|
||||
v3 = iadd_pairwise v1, v2
|
||||
return v3
|
||||
}
|
||||
|
||||
; check: stp fp
|
||||
; nextln: mov fp, sp
|
||||
; nextln: sxtl v1.8h, v0.8b
|
||||
; nextln: uxtl2 v0.8h, v0.16b
|
||||
; nextln: addp v0.8h, v1.8h, v0.8h
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
Reference in New Issue
Block a user