From 0f462330e0562b78c7565fbc1df9650ff1342a15 Mon Sep 17 00:00:00 2001
From: Joey Gouly
Date: Wed, 17 Jun 2020 15:40:51 +0100
Subject: [PATCH] arm64: Implement AllTrue and AnyTrue

This enables the simd_boolean WASM SIMD spec test.

Copyright (c) 2020, Arm Limited.
---
 build.rs                                       |  1 +
 .../codegen/src/isa/aarch64/inst/emit.rs       | 41 +++++++++--
 .../src/isa/aarch64/inst/emit_tests.rs         | 69 +++++++++++++++++++
 .../codegen/src/isa/aarch64/inst/imms.rs       |  8 +++
 cranelift/codegen/src/isa/aarch64/inst/mod.rs  | 44 +++++++++++-
 .../codegen/src/isa/aarch64/inst/regs.rs       | 11 ++-
 .../codegen/src/isa/aarch64/lower_inst.rs      | 50 +++++++++++++-
 7 files changed, 211 insertions(+), 13 deletions(-)

diff --git a/build.rs b/build.rs
index 40b4385b5e..fdf21f0e12 100644
--- a/build.rs
+++ b/build.rs
@@ -182,6 +182,7 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
         "Cranelift" => match (testsuite, testname) {
             ("simd", "simd_address") => return false,
             ("simd", "simd_bitwise") => return false,
+            ("simd", "simd_boolean") => return false,
             ("simd", "simd_i8x16_cmp") => return false,
             ("simd", "simd_i16x8_cmp") => return false,
             ("simd", "simd_i32x4_cmp") => return false,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 263241835f..7668465d62 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -361,6 +361,20 @@ fn enc_vec_rr_misc(bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
     bits | bits_12_16 << 12 | machreg_to_vec(rn) << 5 | machreg_to_vec(rd.to_reg())
 }
 
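+/// Encode an instruction in the "Advanced SIMD across lanes" class (e.g.
+/// uminv); the fields are laid out as 0|Q|U|01110|size|11000|opcode|10|Rn|Rd.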
+fn enc_vec_lanes(q: u32, u: u32, size: u32, opcode: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+    debug_assert_eq!(q & 0b1, q);
+    debug_assert_eq!(u & 0b1, u);
+    debug_assert_eq!(size & 0b11, size);
+    debug_assert_eq!(opcode & 0b11111, opcode);
+    0b0_0_0_01110_00_11000_0_0000_10_00000_00000
+        | q << 30
+        | u << 29
+        | size << 22
+        | opcode << 12
+        | machreg_to_vec(rn) << 5
+        | machreg_to_vec(rd.to_reg())
+}
+
 /// State carried between emissions of a sequence of instructions.
 #[derive(Default, Clone, Debug)]
 pub struct EmitState {
@@ -1061,6 +1075,18 @@ impl MachInstEmit for Inst {
                 };
                 sink.put4(enc_vec_rr_misc(bits_12_16, rd, rn));
             }
+            &Inst::VecLanes { op, rd, rn, ty } => {
+                let (q, size) = match ty {
+                    I8X16 => (0b1, 0b00),
+                    I16X8 => (0b1, 0b01),
+                    I32X4 => (0b1, 0b10),
+                    _ => unreachable!(),
+                };
+                let (u, opcode) = match op {
+                    VecLanesOp::Uminv => (0b1, 0b11010),
+                };
+                sink.put4(enc_vec_lanes(q, u, size, opcode, rd, rn));
+            }
             &Inst::FpuCmp32 { rn, rm } => {
                 sink.put4(enc_fcmp(InstSize::Size32, rn, rm));
             }
@@ -1247,7 +1273,7 @@ impl MachInstEmit for Inst {
                 alu_op,
                 ty,
             } => {
-                let enc_size_for_cmp = match ty {
+                let enc_size = match ty {
                     I8X16 => 0b00,
                     I16X8 => 0b01,
                     I32X4 => 0b10,
@@ -1271,12 +1297,12 @@ impl MachInstEmit for Inst {
                         debug_assert_eq!(I64, ty);
                         (0b011_11110_11_1, 0b001011)
                     }
-                    VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b100011),
-                    VecALUOp::Cmge => (0b010_01110_00_1 | enc_size_for_cmp << 1, 0b001111),
-                    VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size_for_cmp << 1, 0b001101),
-                    VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b001101),
-                    VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b001111),
-                    // The following instructions operate on bytes, so are not encoded differently
+                    VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011),
+                    VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111),
+                    VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101),
+                    VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size << 1, 0b001101),
+                    VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size << 1, 0b001111),
+                    // The following logical instructions operate on bytes, so are not encoded differently
                     // for the different vector types.
                     VecALUOp::And => {
                         debug_assert_eq!(128, ty_bits(ty));
@@ -1298,6 +1324,7 @@ impl MachInstEmit for Inst {
                         debug_assert_eq!(128, ty_bits(ty));
                         (0b011_01110_01_1, 0b000111)
                     }
+                    VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001),
                 };
                 sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
             }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 7b2c095035..05dce50151 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2269,6 +2269,42 @@ fn test_aarch64_binemit() {
         "bsl v8.16b, v9.16b, v1.16b",
     ));
 
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Umaxp,
+            rd: writable_vreg(8),
+            rn: vreg(12),
+            rm: vreg(1),
+            ty: I8X16,
+        },
+        "88A5216E",
+        "umaxp v8.16b, v12.16b, v1.16b",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Umaxp,
+            rd: writable_vreg(1),
+            rn: vreg(6),
+            rm: vreg(1),
+            ty: I16X8,
+        },
+        "C1A4616E",
+        "umaxp v1.8h, v6.8h, v1.8h",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Umaxp,
+            rd: writable_vreg(1),
+            rn: vreg(20),
+            rm: vreg(16),
+            ty: I32X4,
+        },
+        "81A6B06E",
+        "umaxp v1.4s, v20.4s, v16.4s",
+    ));
+
     insns.push((
         Inst::VecMisc {
             op: VecMisc2::Not,
@@ -2280,6 +2316,39 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(2),
             rn: vreg(1),
             ty: I8X16,
         },
         "6258606E",
         "mvn v2.16b, v1.16b",
     ));
 
+    insns.push((
+        Inst::VecLanes {
+            op: VecLanesOp::Uminv,
+            rd: writable_vreg(2),
+            rn: vreg(1),
+            ty: I8X16,
+        },
+        "22A8316E",
+        "uminv b2, v1.16b",
+    ));
+
+    insns.push((
+        Inst::VecLanes {
+            op: VecLanesOp::Uminv,
+            rd: writable_vreg(3),
+            rn: vreg(11),
+            ty: I16X8,
+        },
+        "63A9716E",
+        "uminv h3, v11.8h",
+    ));
+
+    insns.push((
+        Inst::VecLanes {
+            op: VecLanesOp::Uminv,
+            rd: writable_vreg(18),
+            rn: vreg(4),
+            ty: I32X4,
+        },
+        "92A8B16E",
+        "uminv s18, v4.4s",
+    ));
+
     insns.push((
         Inst::Extend {
             rd: writable_xreg(1),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/imms.rs b/cranelift/codegen/src/isa/aarch64/inst/imms.rs
index 6fea5efb5c..961559cc9f 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/imms.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/imms.rs
@@ -304,6 +304,14 @@ impl Imm12 {
         }
     }
 
+    /// Create a zero immediate of this format.
+    pub fn zero() -> Self {
+        Imm12 {
+            bits: 0,
+            shift12: false,
+        }
+    }
+
     /// Bits for 2-bit "shift" field in e.g. AddI.
     pub fn shift_bits(&self) -> u32 {
         if self.shift12 {
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 6c5eb4d995..6d14d53448 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -235,6 +235,8 @@ pub enum VecALUOp {
     Eor,
     /// Bitwise select
     Bsl,
+    /// Unsigned maximum pairwise
+    Umaxp,
 }
 
 /// A Vector miscellaneous operation with two registers.
@@ -244,6 +246,13 @@
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub enum VecMisc2 {
     Not,
 }
 
+/// An operation across the lanes of vectors.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum VecLanesOp {
+    /// Unsigned minimum across a vector
+    Uminv,
+}
+
 /// An operation on the bits of a register. This can be paired with several instruction formats
 /// below (see `Inst`) in any combination.
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
@@ -743,6 +752,14 @@ pub enum Inst {
         ty: Type,
     },
 
+    /// Vector instruction across lanes.
+    VecLanes {
+        op: VecLanesOp,
+        rd: Writable<Reg>,
+        rn: Reg,
+        ty: Type,
+    },
+
     /// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
     MovToNZCV {
         rn: Reg,
@@ -1214,6 +1231,11 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             collector.add_def(rd);
             collector.add_use(rn);
         }
+
+        &Inst::VecLanes { rd, rn, .. } => {
+            collector.add_def(rd);
+            collector.add_use(rn);
+        }
         &Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => {
             collector.add_use(rn);
             collector.add_use(rm);
         }
@@ -1708,6 +1730,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
             map_def(mapper, rd);
             map_use(mapper, rn);
         }
+        &mut Inst::VecLanes {
+            ref mut rd,
+            ref mut rn,
+            ..
+        } => {
+            map_def(mapper, rd);
+            map_use(mapper, rn);
+        }
         &mut Inst::FpuCmp32 {
             ref mut rn,
             ref mut rm,
@@ -2482,7 +2512,7 @@ impl ShowWithRRU for Inst {
                 let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>) -> String = if vector {
                     |reg, mb_rru| show_vreg_vector(reg, mb_rru, F32X2)
                 } else {
-                    show_vreg_scalar
+                    |reg, mb_rru| show_vreg_scalar(reg, mb_rru, F64)
                 };
                 let rd = show_vreg_fn(rd.to_reg(), mb_rru);
                 let rn = show_vreg_fn(rn, mb_rru);
@@ -2695,12 +2725,13 @@ impl ShowWithRRU for Inst {
                     VecALUOp::Orr => ("orr", true, I8X16),
                     VecALUOp::Eor => ("eor", true, I8X16),
                     VecALUOp::Bsl => ("bsl", true, I8X16),
+                    VecALUOp::Umaxp => ("umaxp", true, ty),
                 };
 
                 let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>, Type) -> String = if vector {
                     |reg, mb_rru, ty| show_vreg_vector(reg, mb_rru, ty)
                 } else {
-                    |reg, mb_rru, _ty| show_vreg_scalar(reg, mb_rru)
+                    |reg, mb_rru, _ty| show_vreg_scalar(reg, mb_rru, I64)
                 };
 
                 let rd = show_vreg_fn(rd.to_reg(), mb_rru, ty);
@@ -2722,6 +2753,15 @@ impl ShowWithRRU for Inst {
                 let rn = show_vreg_vector(rn, mb_rru, ty);
                 format!("{} {}, {}", op, rd, rn)
             }
+            &Inst::VecLanes { op, rd, rn, ty } => {
+                let op = match op {
+                    VecLanesOp::Uminv => "uminv",
+                };
+
+                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ty);
+                let rn = show_vreg_vector(rn, mb_rru, ty);
+                format!("{} {}, {}", op, rd, rn)
+            }
             &Inst::MovToNZCV { rn } => {
                 let rn = rn.show_rru(mb_rru);
                 format!("msr nzcv, {}", rn)
diff --git a/cranelift/codegen/src/isa/aarch64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
index 9d74661256..b92b0b70c9 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
@@ -292,7 +292,7 @@ pub fn show_freg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: InstSiz
 }
 
 /// Show a vector register used in a scalar context.
-pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>) -> String {
+pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>, ty: Type) -> String {
     let mut s = reg.show_rru(mb_rru);
     if reg.get_class() != RegClass::V128 {
         // We can't do any better.
@@ -302,7 +302,14 @@ pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>) -> String {
     if reg.is_real() {
         // Change (eg) "v0" into "d0".
         if reg.get_class() == RegClass::V128 && s.starts_with("v") {
-            s.replace_range(0..1, "d");
+            let replacement = match ty {
+                I64 | F64 => "d",
+                I8X16 => "b",
+                I16X8 => "h",
+                I32X4 => "s",
+                _ => unimplemented!(),
+            };
+            s.replace_range(0..1, replacement);
         }
     } else {
         // Add a "d" suffix to RegClass::V128 vregs.
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index a97eab76e7..e77c641630 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1540,12 +1540,58 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             ctx.emit(inst);
         }
 
+        Opcode::VanyTrue | Opcode::VallTrue => {
+            let rd = output_to_reg(ctx, outputs[0]);
+            let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+            let tmp = ctx.alloc_tmp(RegClass::V128, ty.unwrap());
+
+            // This operation is implemented by using umaxp (for AnyTrue) or uminv
+            // (for AllTrue) to create a scalar value, which is then compared against zero.
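+            //
+            // umaxp packs the pairwise maxima of all input lanes into the low
+            // 64 bits of its result, and uminv leaves the across-lanes minimum
+            // in lane 0, so in both cases d[0] is zero exactly when the
+            // reduction is zero.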
+            //
+            // umaxp vn.16b, vm.16b, vm.16b / uminv bn, vm.16b
+            // mov xm, vn.d[0]
+            // cmp xm, #0
+            // cset xm, ne
+
+            let input_ty = ctx.input_ty(insn, 0);
+            if op == Opcode::VanyTrue {
+                ctx.emit(Inst::VecRRR {
+                    alu_op: VecALUOp::Umaxp,
+                    rd: tmp,
+                    rn: rm,
+                    rm,
+                    ty: input_ty,
+                });
+            } else {
+                ctx.emit(Inst::VecLanes {
+                    op: VecLanesOp::Uminv,
+                    rd: tmp,
+                    rn: rm,
+                    ty: input_ty,
+                });
+            }
+
+            ctx.emit(Inst::MovFromVec {
+                rd,
+                rn: tmp.to_reg(),
+                idx: 0,
+                ty: I64,
+            });
+
+            ctx.emit(Inst::AluRRImm12 {
+                alu_op: ALUOp::SubS64,
+                rd: writable_zero_reg(),
+                rn: rd.to_reg(),
+                imm12: Imm12::zero(),
+            });
+
+            ctx.emit(Inst::CSet { rd, cond: Cond::Ne });
+        }
+
         Opcode::Shuffle
         | Opcode::Vsplit
         | Opcode::Vconcat
         | Opcode::Vselect
-        | Opcode::VanyTrue
-        | Opcode::VallTrue
         | Opcode::Insertlane
         | Opcode::ScalarToVector
         | Opcode::Swizzle