Merge pull request #1890 from jgouly/simd-bool
arm64: Implement AllTrue and AnyTrue
This commit is contained in:
1
build.rs
1
build.rs
@@ -182,6 +182,7 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
|
||||
"Cranelift" => match (testsuite, testname) {
|
||||
("simd", "simd_address") => return false,
|
||||
("simd", "simd_bitwise") => return false,
|
||||
("simd", "simd_boolean") => return false,
|
||||
("simd", "simd_i8x16_cmp") => return false,
|
||||
("simd", "simd_i16x8_cmp") => return false,
|
||||
("simd", "simd_i32x4_cmp") => return false,
|
||||
|
||||
@@ -361,6 +361,20 @@ fn enc_vec_rr_misc(bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
|
||||
bits | bits_12_16 << 12 | machreg_to_vec(rn) << 5 | machreg_to_vec(rd.to_reg())
|
||||
}
|
||||
|
||||
/// Assemble the 32-bit encoding of a vector across-lanes instruction
/// (called when emitting `Inst::VecLanes`).
///
/// The fields are OR-ed into a fixed base bit pattern: `q` is shifted to
/// bit 30, `u` to bit 29, `size` to bit 22, `opcode` to bit 12, the encoded
/// `rn` to bit 5, and the encoded `rd` occupies the low bits unshifted.
/// In debug builds, `q` and `u` are asserted to fit in 1 bit, `size` in
/// 2 bits and `opcode` in 5 bits.
fn enc_vec_lanes(q: u32, u: u32, size: u32, opcode: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
|
||||
debug_assert_eq!(q & 0b1, q);
|
||||
debug_assert_eq!(u & 0b1, u);
|
||||
debug_assert_eq!(size & 0b11, size);
|
||||
debug_assert_eq!(opcode & 0b11111, opcode);
|
||||
0b0_0_0_01110_00_11000_0_0000_10_00000_00000
|
||||
| q << 30
|
||||
| u << 29
|
||||
| size << 22
|
||||
| opcode << 12
|
||||
| machreg_to_vec(rn) << 5
|
||||
| machreg_to_vec(rd.to_reg())
|
||||
}
|
||||
|
||||
/// State carried between emissions of a sequence of instructions.
|
||||
#[derive(Default, Clone, Debug)]
|
||||
pub struct EmitState {
|
||||
@@ -1061,6 +1075,18 @@ impl MachInstEmit for Inst {
|
||||
};
|
||||
sink.put4(enc_vec_rr_misc(bits_12_16, rd, rn));
|
||||
}
|
||||
&Inst::VecLanes { op, rd, rn, ty } => {
|
||||
let (q, size) = match ty {
|
||||
I8X16 => (0b1, 0b00),
|
||||
I16X8 => (0b1, 0b01),
|
||||
I32X4 => (0b1, 0b10),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
let (u, opcode) = match op {
|
||||
VecLanesOp::Uminv => (0b1, 0b11010),
|
||||
};
|
||||
sink.put4(enc_vec_lanes(q, u, size, opcode, rd, rn));
|
||||
}
|
||||
&Inst::FpuCmp32 { rn, rm } => {
|
||||
sink.put4(enc_fcmp(InstSize::Size32, rn, rm));
|
||||
}
|
||||
@@ -1247,7 +1273,7 @@ impl MachInstEmit for Inst {
|
||||
alu_op,
|
||||
ty,
|
||||
} => {
|
||||
let enc_size_for_cmp = match ty {
|
||||
let enc_size = match ty {
|
||||
I8X16 => 0b00,
|
||||
I16X8 => 0b01,
|
||||
I32X4 => 0b10,
|
||||
@@ -1271,12 +1297,12 @@ impl MachInstEmit for Inst {
|
||||
debug_assert_eq!(I64, ty);
|
||||
(0b011_11110_11_1, 0b001011)
|
||||
}
|
||||
VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b100011),
|
||||
VecALUOp::Cmge => (0b010_01110_00_1 | enc_size_for_cmp << 1, 0b001111),
|
||||
VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size_for_cmp << 1, 0b001101),
|
||||
VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b001101),
|
||||
VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b001111),
|
||||
// The following instructions operate on bytes, so are not encoded differently
|
||||
VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011),
|
||||
VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111),
|
||||
VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101),
|
||||
VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size << 1, 0b001101),
|
||||
VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size << 1, 0b001111),
|
||||
// The following logical instructions operate on bytes, so are not encoded differently
|
||||
// for the different vector types.
|
||||
VecALUOp::And => {
|
||||
debug_assert_eq!(128, ty_bits(ty));
|
||||
@@ -1298,6 +1324,7 @@ impl MachInstEmit for Inst {
|
||||
debug_assert_eq!(128, ty_bits(ty));
|
||||
(0b011_01110_01_1, 0b000111)
|
||||
}
|
||||
VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001),
|
||||
};
|
||||
sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
|
||||
}
|
||||
|
||||
@@ -2269,6 +2269,42 @@ fn test_aarch64_binemit() {
|
||||
"bsl v8.16b, v9.16b, v1.16b",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecRRR {
|
||||
alu_op: VecALUOp::Umaxp,
|
||||
rd: writable_vreg(8),
|
||||
rn: vreg(12),
|
||||
rm: vreg(1),
|
||||
ty: I8X16,
|
||||
},
|
||||
"88A5216E",
|
||||
"umaxp v8.16b, v12.16b, v1.16b",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecRRR {
|
||||
alu_op: VecALUOp::Umaxp,
|
||||
rd: writable_vreg(1),
|
||||
rn: vreg(6),
|
||||
rm: vreg(1),
|
||||
ty: I16X8,
|
||||
},
|
||||
"C1A4616E",
|
||||
"umaxp v1.8h, v6.8h, v1.8h",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecRRR {
|
||||
alu_op: VecALUOp::Umaxp,
|
||||
rd: writable_vreg(1),
|
||||
rn: vreg(20),
|
||||
rm: vreg(16),
|
||||
ty: I32X4,
|
||||
},
|
||||
"81A6B06E",
|
||||
"umaxp v1.4s, v20.4s, v16.4s",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecMisc {
|
||||
op: VecMisc2::Not,
|
||||
@@ -2280,6 +2316,39 @@ fn test_aarch64_binemit() {
|
||||
"mvn v2.16b, v1.16b",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecLanes {
|
||||
op: VecLanesOp::Uminv,
|
||||
rd: writable_vreg(2),
|
||||
rn: vreg(1),
|
||||
ty: I8X16,
|
||||
},
|
||||
"22A8316E",
|
||||
"uminv b2, v1.16b",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecLanes {
|
||||
op: VecLanesOp::Uminv,
|
||||
rd: writable_vreg(3),
|
||||
rn: vreg(11),
|
||||
ty: I16X8,
|
||||
},
|
||||
"63A9716E",
|
||||
"uminv h3, v11.8h",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecLanes {
|
||||
op: VecLanesOp::Uminv,
|
||||
rd: writable_vreg(18),
|
||||
rn: vreg(4),
|
||||
ty: I32X4,
|
||||
},
|
||||
"92A8B16E",
|
||||
"uminv s18, v4.4s",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::Extend {
|
||||
rd: writable_xreg(1),
|
||||
|
||||
@@ -304,6 +304,14 @@ impl Imm12 {
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a zero immediate of this format.
///
/// The result has `bits` set to 0 and `shift12` set to `false`.
|
||||
pub fn zero() -> Self {
|
||||
Imm12 {
|
||||
bits: 0,
|
||||
|
||||
shift12: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Bits for 2-bit "shift" field in e.g. AddI.
|
||||
pub fn shift_bits(&self) -> u32 {
|
||||
if self.shift12 {
|
||||
|
||||
@@ -235,6 +235,8 @@ pub enum VecALUOp {
|
||||
Eor,
|
||||
/// Bitwise select
|
||||
Bsl,
|
||||
/// Unsigned maximum pairwise
|
||||
Umaxp,
|
||||
}
|
||||
|
||||
/// A Vector miscellaneous operation with two registers.
|
||||
@@ -244,6 +246,13 @@ pub enum VecMisc2 {
|
||||
Not,
|
||||
}
|
||||
|
||||
/// An operation across the lanes of vectors.
///
/// Carried in the `op` field of `Inst::VecLanes`.
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
|
||||
pub enum VecLanesOp {
|
||||
/// Unsigned minimum across a vector
|
||||
/// (pretty-printed with the mnemonic `uminv`).
Uminv,
|
||||
}
|
||||
|
||||
/// An operation on the bits of a register. This can be paired with several instruction formats
|
||||
/// below (see `Inst`) in any combination.
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
|
||||
@@ -743,6 +752,14 @@ pub enum Inst {
|
||||
ty: Type,
|
||||
},
|
||||
|
||||
/// Vector instruction across lanes.
|
||||
VecLanes {
|
||||
op: VecLanesOp,
|
||||
rd: Writable<Reg>,
|
||||
rn: Reg,
|
||||
ty: Type,
|
||||
},
|
||||
|
||||
/// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
|
||||
MovToNZCV {
|
||||
rn: Reg,
|
||||
@@ -1214,6 +1231,11 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
|
||||
collector.add_def(rd);
|
||||
collector.add_use(rn);
|
||||
}
|
||||
|
||||
&Inst::VecLanes { rd, rn, .. } => {
|
||||
collector.add_def(rd);
|
||||
collector.add_use(rn);
|
||||
}
|
||||
&Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => {
|
||||
collector.add_use(rn);
|
||||
collector.add_use(rm);
|
||||
@@ -1708,6 +1730,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
|
||||
map_def(mapper, rd);
|
||||
map_use(mapper, rn);
|
||||
}
|
||||
&mut Inst::VecLanes {
|
||||
ref mut rd,
|
||||
ref mut rn,
|
||||
..
|
||||
} => {
|
||||
map_def(mapper, rd);
|
||||
map_use(mapper, rn);
|
||||
}
|
||||
&mut Inst::FpuCmp32 {
|
||||
ref mut rn,
|
||||
ref mut rm,
|
||||
@@ -2482,7 +2512,7 @@ impl ShowWithRRU for Inst {
|
||||
let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>) -> String = if vector {
|
||||
|reg, mb_rru| show_vreg_vector(reg, mb_rru, F32X2)
|
||||
} else {
|
||||
show_vreg_scalar
|
||||
|reg, mb_rru| show_vreg_scalar(reg, mb_rru, F64)
|
||||
};
|
||||
let rd = show_vreg_fn(rd.to_reg(), mb_rru);
|
||||
let rn = show_vreg_fn(rn, mb_rru);
|
||||
@@ -2695,12 +2725,13 @@ impl ShowWithRRU for Inst {
|
||||
VecALUOp::Orr => ("orr", true, I8X16),
|
||||
VecALUOp::Eor => ("eor", true, I8X16),
|
||||
VecALUOp::Bsl => ("bsl", true, I8X16),
|
||||
VecALUOp::Umaxp => ("umaxp", true, ty),
|
||||
};
|
||||
|
||||
let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>, Type) -> String = if vector {
|
||||
|reg, mb_rru, ty| show_vreg_vector(reg, mb_rru, ty)
|
||||
} else {
|
||||
|reg, mb_rru, _ty| show_vreg_scalar(reg, mb_rru)
|
||||
|reg, mb_rru, _ty| show_vreg_scalar(reg, mb_rru, I64)
|
||||
};
|
||||
|
||||
let rd = show_vreg_fn(rd.to_reg(), mb_rru, ty);
|
||||
@@ -2722,6 +2753,15 @@ impl ShowWithRRU for Inst {
|
||||
let rn = show_vreg_vector(rn, mb_rru, ty);
|
||||
format!("{} {}, {}", op, rd, rn)
|
||||
}
|
||||
&Inst::VecLanes { op, rd, rn, ty } => {
|
||||
let op = match op {
|
||||
VecLanesOp::Uminv => "uminv",
|
||||
};
|
||||
|
||||
let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ty);
|
||||
let rn = show_vreg_vector(rn, mb_rru, ty);
|
||||
format!("{} {}, {}", op, rd, rn)
|
||||
}
|
||||
&Inst::MovToNZCV { rn } => {
|
||||
let rn = rn.show_rru(mb_rru);
|
||||
format!("msr nzcv, {}", rn)
|
||||
|
||||
@@ -292,7 +292,7 @@ pub fn show_freg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: InstSiz
|
||||
}
|
||||
|
||||
/// Show a vector register used in a scalar context.
|
||||
pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>) -> String {
|
||||
pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>, ty: Type) -> String {
|
||||
let mut s = reg.show_rru(mb_rru);
|
||||
if reg.get_class() != RegClass::V128 {
|
||||
// We can't do any better.
|
||||
@@ -302,7 +302,14 @@ pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>) -> String {
|
||||
if reg.is_real() {
|
||||
// Change (eg) "v0" into "d0", "b0", "h0" or "s0", depending on `ty`.
|
||||
if reg.get_class() == RegClass::V128 && s.starts_with("v") {
|
||||
s.replace_range(0..1, "d");
|
||||
let replacement = match ty {
|
||||
I64 | F64 => "d",
|
||||
I8X16 => "b",
|
||||
I16X8 => "h",
|
||||
I32X4 => "s",
|
||||
_ => unimplemented!(),
|
||||
};
|
||||
s.replace_range(0..1, replacement);
|
||||
}
|
||||
} else {
|
||||
// Add a "d" suffix to RegClass::V128 vregs.
|
||||
|
||||
@@ -1540,12 +1540,58 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
ctx.emit(inst);
|
||||
}
|
||||
|
||||
Opcode::VanyTrue | Opcode::VallTrue => {
|
||||
let rd = output_to_reg(ctx, outputs[0]);
|
||||
let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
|
||||
let tmp = ctx.alloc_tmp(RegClass::V128, ty.unwrap());
|
||||
|
||||
// This operation is implemented by using umaxp or uminv to
|
||||
// create a scalar value, which is then compared against zero.
|
||||
//
|
||||
// umaxp vn.16b, vm.16b, vm.16b / uminv bn, vm.16b
|
||||
// mov xm, vn.d[0]
|
||||
// cmp xm, #0
|
||||
// cset xm, ne
|
||||
|
||||
let input_ty = ctx.input_ty(insn, 0);
|
||||
if op == Opcode::VanyTrue {
|
||||
ctx.emit(Inst::VecRRR {
|
||||
alu_op: VecALUOp::Umaxp,
|
||||
rd: tmp,
|
||||
rn: rm,
|
||||
rm: rm,
|
||||
ty: input_ty,
|
||||
});
|
||||
} else {
|
||||
ctx.emit(Inst::VecLanes {
|
||||
op: VecLanesOp::Uminv,
|
||||
rd: tmp,
|
||||
rn: rm,
|
||||
ty: input_ty,
|
||||
});
|
||||
};
|
||||
|
||||
ctx.emit(Inst::MovFromVec {
|
||||
rd,
|
||||
rn: tmp.to_reg(),
|
||||
idx: 0,
|
||||
ty: I64,
|
||||
});
|
||||
|
||||
ctx.emit(Inst::AluRRImm12 {
|
||||
alu_op: ALUOp::SubS64,
|
||||
rd: writable_zero_reg(),
|
||||
rn: rd.to_reg(),
|
||||
imm12: Imm12::zero(),
|
||||
});
|
||||
|
||||
ctx.emit(Inst::CSet { rd, cond: Cond::Ne });
|
||||
}
|
||||
|
||||
Opcode::Shuffle
|
||||
| Opcode::Vsplit
|
||||
| Opcode::Vconcat
|
||||
| Opcode::Vselect
|
||||
| Opcode::VanyTrue
|
||||
| Opcode::VallTrue
|
||||
| Opcode::Insertlane
|
||||
| Opcode::ScalarToVector
|
||||
| Opcode::Swizzle
|
||||
|
||||
Reference in New Issue
Block a user