Merge pull request #1890 from jgouly/simd-bool

arm64: Implement AllTrue and AnyTrue
Authored by Chris Fallin on 2020-06-17 09:38:10 -07:00; committed by GitHub.
7 changed files with 211 additions and 13 deletions
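The lowering reduces the input vector to a scalar, using umaxp (unsigned pairwise maximum) for any_true and uminv (unsigned minimum across lanes) for all_true, then compares that scalar against zero and materializes the result with cset. As a scalar model of the intended semantics (an informal sketch; these helper names are ours, not part of the change):

fn vany_true(lanes: &[u64]) -> bool {
    // Mirrors umaxp + compare: the unsigned maximum is nonzero iff some lane is.
    lanes.iter().copied().max().map_or(false, |m| m != 0)
}

fn vall_true(lanes: &[u64]) -> bool {
    // Mirrors uminv + compare: the unsigned minimum is nonzero iff every lane is.
    lanes.iter().copied().min().map_or(false, |m| m != 0)
}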

View File

@@ -182,6 +182,7 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
"Cranelift" => match (testsuite, testname) {
("simd", "simd_address") => return false,
("simd", "simd_bitwise") => return false,
("simd", "simd_boolean") => return false,
("simd", "simd_i8x16_cmp") => return false,
("simd", "simd_i16x8_cmp") => return false,
("simd", "simd_i32x4_cmp") => return false,

View File

@@ -361,6 +361,20 @@ fn enc_vec_rr_misc(bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
bits | bits_12_16 << 12 | machreg_to_vec(rn) << 5 | machreg_to_vec(rd.to_reg())
}
fn enc_vec_lanes(q: u32, u: u32, size: u32, opcode: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
debug_assert_eq!(q & 0b1, q);
debug_assert_eq!(u & 0b1, u);
debug_assert_eq!(size & 0b11, size);
debug_assert_eq!(opcode & 0b11111, opcode);
0b0_0_0_01110_00_11000_0_0000_10_00000_00000
| q << 30
| u << 29
| size << 22
| opcode << 12
| machreg_to_vec(rn) << 5
| machreg_to_vec(rd.to_reg())
}
/// State carried between emissions of a sequence of instructions.
#[derive(Default, Clone, Debug)]
pub struct EmitState {
@@ -1061,6 +1075,18 @@ impl MachInstEmit for Inst {
};
sink.put4(enc_vec_rr_misc(bits_12_16, rd, rn));
}
&Inst::VecLanes { op, rd, rn, ty } => {
let (q, size) = match ty {
I8X16 => (0b1, 0b00),
I16X8 => (0b1, 0b01),
I32X4 => (0b1, 0b10),
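// uminv has no 64-bit (.2d) across-lanes form, so no other vector type reaches here.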
_ => unreachable!(),
};
let (u, opcode) = match op {
VecLanesOp::Uminv => (0b1, 0b11010),
};
sink.put4(enc_vec_lanes(q, u, size, opcode, rd, rn));
}
&Inst::FpuCmp32 { rn, rm } => {
sink.put4(enc_fcmp(InstSize::Size32, rn, rm));
}
@@ -1247,7 +1273,7 @@ impl MachInstEmit for Inst {
alu_op,
ty,
} => {
-let enc_size_for_cmp = match ty {
+let enc_size = match ty {
I8X16 => 0b00,
I16X8 => 0b01,
I32X4 => 0b10,
@@ -1271,12 +1297,12 @@ impl MachInstEmit for Inst {
debug_assert_eq!(I64, ty);
(0b011_11110_11_1, 0b001011)
}
-VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b100011),
-VecALUOp::Cmge => (0b010_01110_00_1 | enc_size_for_cmp << 1, 0b001111),
-VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size_for_cmp << 1, 0b001101),
-VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b001101),
-VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b001111),
-// The following instructions operate on bytes, so are not encoded differently
+VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011),
+VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111),
+VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101),
+VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size << 1, 0b001101),
+VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size << 1, 0b001111),
+// The following logical instructions operate on bytes, so are not encoded differently
// for the different vector types.
VecALUOp::And => {
debug_assert_eq!(128, ty_bits(ty));
@@ -1298,6 +1324,7 @@ impl MachInstEmit for Inst {
debug_assert_eq!(128, ty_bits(ty));
(0b011_01110_01_1, 0b000111)
}
VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001),
};
sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
}
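As a hand-worked cross-check of enc_vec_lanes against the emit tests below (illustrative only, not part of the change), `uminv b2, v1.16b` sets q=1, u=1, size=0b00, opcode=0b11010, rn=v1, rd=v2:

let word: u32 = 0b0_0_0_01110_00_11000_0_0000_10_00000_00000
    | 1 << 30       // q: full 128-bit vector
    | 1 << 29       // u: unsigned
    | 0b00 << 22    // size: byte lanes
    | 0b11010 << 12 // opcode: UMINV
    | 1 << 5        // rn = v1
    | 2;            // rd = v2
assert_eq!(word, 0x6E31A822);
assert_eq!(word.to_le_bytes(), [0x22, 0xA8, 0x31, 0x6E]); // the test string "22A8316E"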

View File

@@ -2269,6 +2269,42 @@ fn test_aarch64_binemit() {
"bsl v8.16b, v9.16b, v1.16b",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Umaxp,
rd: writable_vreg(8),
rn: vreg(12),
rm: vreg(1),
ty: I8X16,
},
"88A5216E",
"umaxp v8.16b, v12.16b, v1.16b",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Umaxp,
rd: writable_vreg(1),
rn: vreg(6),
rm: vreg(1),
ty: I16X8,
},
"C1A4616E",
"umaxp v1.8h, v6.8h, v1.8h",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Umaxp,
rd: writable_vreg(1),
rn: vreg(20),
rm: vreg(16),
ty: I32X4,
},
"81A6B06E",
"umaxp v1.4s, v20.4s, v16.4s",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Not,
@@ -2280,6 +2316,39 @@ fn test_aarch64_binemit() {
"mvn v2.16b, v1.16b",
));
insns.push((
Inst::VecLanes {
op: VecLanesOp::Uminv,
rd: writable_vreg(2),
rn: vreg(1),
ty: I8X16,
},
"22A8316E",
"uminv b2, v1.16b",
));
insns.push((
Inst::VecLanes {
op: VecLanesOp::Uminv,
rd: writable_vreg(3),
rn: vreg(11),
ty: I16X8,
},
"63A9716E",
"uminv h3, v11.8h",
));
insns.push((
Inst::VecLanes {
op: VecLanesOp::Uminv,
rd: writable_vreg(18),
rn: vreg(4),
ty: I32X4,
},
"92A8B16E",
"uminv s18, v4.4s",
));
insns.push((
Inst::Extend {
rd: writable_xreg(1),

View File

@@ -304,6 +304,14 @@ impl Imm12 {
}
}
/// Create a zero immediate of this format.
pub fn zero() -> Self {
Imm12 {
bits: 0,
shift12: false,
}
}
/// Bits for 2-bit "shift" field in e.g. AddI.
pub fn shift_bits(&self) -> u32 {
if self.shift12 {

View File

@@ -235,6 +235,8 @@ pub enum VecALUOp {
Eor,
/// Bitwise select
Bsl,
/// Unsigned maximum pairwise
Umaxp,
}
/// A Vector miscellaneous operation with two registers.
@@ -244,6 +246,13 @@ pub enum VecMisc2 {
Not,
}
/// An operation across the lanes of vectors.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum VecLanesOp {
/// Unsigned minimum across a vector
Uminv,
}
/// An operation on the bits of a register. This can be paired with several instruction formats
/// below (see `Inst`) in any combination.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
@@ -743,6 +752,14 @@ pub enum Inst {
ty: Type,
},
/// Vector instruction across lanes.
VecLanes {
op: VecLanesOp,
rd: Writable<Reg>,
rn: Reg,
ty: Type,
},
/// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
MovToNZCV {
rn: Reg,
@@ -1214,6 +1231,11 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_def(rd);
collector.add_use(rn);
}
&Inst::VecLanes { rd, rn, .. } => {
collector.add_def(rd);
collector.add_use(rn);
}
&Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => {
collector.add_use(rn);
collector.add_use(rm);
@@ -1708,6 +1730,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
map_def(mapper, rd);
map_use(mapper, rn);
}
&mut Inst::VecLanes {
ref mut rd,
ref mut rn,
..
} => {
map_def(mapper, rd);
map_use(mapper, rn);
}
&mut Inst::FpuCmp32 {
ref mut rn,
ref mut rm,
@@ -2482,7 +2512,7 @@ impl ShowWithRRU for Inst {
let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>) -> String = if vector {
|reg, mb_rru| show_vreg_vector(reg, mb_rru, F32X2)
} else {
-show_vreg_scalar
+|reg, mb_rru| show_vreg_scalar(reg, mb_rru, F64)
};
let rd = show_vreg_fn(rd.to_reg(), mb_rru);
let rn = show_vreg_fn(rn, mb_rru);
@@ -2695,12 +2725,13 @@ impl ShowWithRRU for Inst {
VecALUOp::Orr => ("orr", true, I8X16),
VecALUOp::Eor => ("eor", true, I8X16),
VecALUOp::Bsl => ("bsl", true, I8X16),
VecALUOp::Umaxp => ("umaxp", true, ty),
};
let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>, Type) -> String = if vector {
|reg, mb_rru, ty| show_vreg_vector(reg, mb_rru, ty)
} else {
-|reg, mb_rru, _ty| show_vreg_scalar(reg, mb_rru)
+|reg, mb_rru, _ty| show_vreg_scalar(reg, mb_rru, I64)
};
let rd = show_vreg_fn(rd.to_reg(), mb_rru, ty);
@@ -2722,6 +2753,15 @@ impl ShowWithRRU for Inst {
let rn = show_vreg_vector(rn, mb_rru, ty);
format!("{} {}, {}", op, rd, rn)
}
&Inst::VecLanes { op, rd, rn, ty } => {
let op = match op {
VecLanesOp::Uminv => "uminv",
};
let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ty);
let rn = show_vreg_vector(rn, mb_rru, ty);
format!("{} {}, {}", op, rd, rn)
}
&Inst::MovToNZCV { rn } => {
let rn = rn.show_rru(mb_rru);
format!("msr nzcv, {}", rn)

View File

@@ -292,7 +292,7 @@ pub fn show_freg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: InstSiz
}
/// Show a vector register used in a scalar context.
-pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>) -> String {
+pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>, ty: Type) -> String {
let mut s = reg.show_rru(mb_rru);
if reg.get_class() != RegClass::V128 {
// We can't do any better.
@@ -302,7 +302,14 @@ pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>) -> String {
if reg.is_real() {
// Change (eg) "v0" into "d0".
if reg.get_class() == RegClass::V128 && s.starts_with("v") {
-s.replace_range(0..1, "d");
+let replacement = match ty {
+I64 | F64 => "d",
+I8X16 => "b",
+I16X8 => "h",
+I32X4 => "s",
+_ => unimplemented!(),
+};
+s.replace_range(0..1, replacement);
}
} else {
// Add a "d" suffix to RegClass::V128 vregs.
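With the lane type threaded through, the scalar view of a V128 register now matches the width of a reduction's result: per the emit tests above, uminv prints its destination as b2 for I8X16, h3 for I16X8, and s18 for I32X4, while I64/F64 keep the existing d prefix.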

View File

@@ -1540,12 +1540,58 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(inst);
}
Opcode::VanyTrue | Opcode::VallTrue => {
let rd = output_to_reg(ctx, outputs[0]);
let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
let tmp = ctx.alloc_tmp(RegClass::V128, ty.unwrap());
// This operation is implemented by using umaxp or uminv to
// create a scalar value, which is then compared against zero.
//
// umaxp vn.16b, vm.16b, vm.16b / uminv bn, vm.16b
// mov xm, vn.d[0]
// cmp xm, #0
// cset xm, ne
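//
// `umaxp vn, vm, vm` takes pairwise maxima over the concatenation vm:vm, so
// the low 64 bits of vn (read back by the `mov`) cover every lane of vm and
// are nonzero iff some lane is nonzero; `uminv` reduces to the minimum lane,
// which is nonzero iff every lane is nonzero.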
let input_ty = ctx.input_ty(insn, 0);
if op == Opcode::VanyTrue {
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::Umaxp,
rd: tmp,
rn: rm,
rm,
ty: input_ty,
});
} else {
ctx.emit(Inst::VecLanes {
op: VecLanesOp::Uminv,
rd: tmp,
rn: rm,
ty: input_ty,
});
};
ctx.emit(Inst::MovFromVec {
rd,
rn: tmp.to_reg(),
idx: 0,
ty: I64,
});
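// `cmp xm, #0` is encoded as `subs xzr, xm, #0`: set the flags and discard
// the difference so that `cset` can materialize the comparison as 0 or 1.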
ctx.emit(Inst::AluRRImm12 {
alu_op: ALUOp::SubS64,
rd: writable_zero_reg(),
rn: rd.to_reg(),
imm12: Imm12::zero(),
});
ctx.emit(Inst::CSet { rd, cond: Cond::Ne });
}
Opcode::Shuffle
| Opcode::Vsplit
| Opcode::Vconcat
| Opcode::Vselect
-| Opcode::VanyTrue
-| Opcode::VallTrue
| Opcode::Insertlane
| Opcode::ScalarToVector
| Opcode::Swizzle