From 62e7b7f8382505d4b1d0a000c2ad89b3e7d76a02 Mon Sep 17 00:00:00 2001
From: Joey Gouly
Date: Thu, 2 Jul 2020 13:17:33 +0100
Subject: [PATCH] arm64: Implement basic SIMD arithmetic

Copyright (c) 2020, Arm Limited.
---
 build.rs                                             |   3 +
 .../codegen/src/isa/aarch64/inst/emit.rs             |  29 ++-
 .../src/isa/aarch64/inst/emit_tests.rs               | 176 ++++++++++++++++++
 cranelift/codegen/src/isa/aarch64/inst/mod.rs        |  21 ++-
 .../codegen/src/isa/aarch64/lower_inst.rs            |  78 ++++++--
 5 files changed, 277 insertions(+), 30 deletions(-)

diff --git a/build.rs b/build.rs
index fa89812ed9..6e3a93502f 100644
--- a/build.rs
+++ b/build.rs
@@ -186,8 +186,11 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
         ("simd", "simd_boolean") => return false,
         ("simd", "simd_f32x4_cmp") => return false,
         ("simd", "simd_f64x2_cmp") => return false,
+        ("simd", "simd_i8x16_arith") => return false,
         ("simd", "simd_i8x16_cmp") => return false,
+        ("simd", "simd_i16x8_arith") => return false,
         ("simd", "simd_i16x8_cmp") => return false,
+        ("simd", "simd_i32x4_arith") => return false,
         ("simd", "simd_i32x4_cmp") => return false,
         ("simd", "simd_load_extend") => return false,
         ("simd", "simd_load_splat") => return false,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index abb9aa0045..a075401555 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -355,10 +355,11 @@ fn enc_fround(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
     (top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg())
 }
 
-fn enc_vec_rr_misc(bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+fn enc_vec_rr_misc(size: u32, bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+    debug_assert_eq!(size & 0b11, size);
     debug_assert_eq!(bits_12_16 & 0b11111, bits_12_16);
     let bits = 0b0_1_1_01110_00_10000_00000_10_00000_00000;
-    bits | bits_12_16 << 12 | machreg_to_vec(rn) << 5 | machreg_to_vec(rd.to_reg())
+    bits | size << 22 | bits_12_16 << 12 | machreg_to_vec(rn) << 5 | machreg_to_vec(rd.to_reg())
 }
 
 fn enc_vec_lanes(q: u32, u: u32, size: u32, opcode: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
@@ -1067,13 +1068,24 @@ impl MachInstEmit for Inst {
                 sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
             }
            &Inst::VecMisc { op, rd, rn, ty } => {
-                let bits_12_16 = match op {
+                let enc_size = match ty {
+                    I8X16 => 0b00,
+                    I16X8 => 0b01,
+                    I32X4 => 0b10,
+                    I64X2 => 0b11,
+                    _ => 0,
+                };
+                let (bits_12_16, size) = match op {
                     VecMisc2::Not => {
                         debug_assert_eq!(128, ty_bits(ty));
-                        0b00101
+                        (0b00101, 0b00)
+                    }
+                    VecMisc2::Neg => {
+                        debug_assert_eq!(128, ty_bits(ty));
+                        (0b01011, enc_size)
                     }
                 };
-                sink.put4(enc_vec_rr_misc(bits_12_16, rd, rn));
+                sink.put4(enc_vec_rr_misc(size, bits_12_16, rd, rn));
             }
             &Inst::VecLanes { op, rd, rn, ty } => {
                 let (q, size) = match ty {
@@ -1277,6 +1289,7 @@ impl MachInstEmit for Inst {
                     I8X16 => 0b00,
                     I16X8 => 0b01,
                     I32X4 => 0b10,
+                    I64X2 => 0b11,
                     _ => 0,
                 };
                 let enc_size_for_fcmp = match ty {
@@ -1333,6 +1346,12 @@ impl MachInstEmit for Inst {
                         (0b011_01110_01_1, 0b000111)
                     }
                     VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001),
+                    VecALUOp::Add => (0b010_01110_00_1 | enc_size << 1, 0b100001),
+                    VecALUOp::Sub => (0b011_01110_00_1 | enc_size << 1, 0b100001),
+                    VecALUOp::Mul => {
+                        debug_assert_ne!(I64X2, ty);
+                        (0b010_01110_00_1 | enc_size << 1, 0b100111)
+                    }
                 };
                 sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
             }
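Note: the hunks above reuse the three-register vector encoding (enc_vec_rrr, not shown in this diff) for the new ALU ops. As a sanity check, that layout can be re-derived standalone. The helper below is an illustrative sketch of the assumed bit layout (top11 | Rm | bits 15..10 | Rn | Rd), not Cranelift's actual enc_vec_rrr:

    // Illustrative sketch only: re-derive the three-register vector encoding
    // with the layout assumed above: top11 | Rm | bits 15..10 | Rn | Rd.
    fn enc_vec_rrr_sketch(top11: u32, rm: u32, bits_15_10: u32, rn: u32, rd: u32) -> u32 {
        (top11 << 21) | (rm << 16) | (bits_15_10 << 10) | (rn << 5) | rd
    }

    fn main() {
        // ADD Vd.16B, Vn.16B, Vm.16B: Q=1, U=0, size=00 -> top11 = 0b010_01110_00_1,
        // bits 15..10 = 0b100001. With Rd=5, Rn=1, Rm=1 this is the instruction word
        // behind the "2584214E" test bytes added in emit_tests.rs below.
        assert_eq!(0x4E21_8425, enc_vec_rrr_sketch(0b010_01110_00_1, 1, 0b100001, 1, 5));
    }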
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index aaf4cfbae3..01786f13af 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2341,6 +2341,138 @@ fn test_aarch64_binemit() {
         "umaxp v1.4s, v20.4s, v16.4s",
     ));
 
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Add,
+            rd: writable_vreg(5),
+            rn: vreg(1),
+            rm: vreg(1),
+            ty: I8X16,
+        },
+        "2584214E",
+        "add v5.16b, v1.16b, v1.16b",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Add,
+            rd: writable_vreg(7),
+            rn: vreg(13),
+            rm: vreg(2),
+            ty: I16X8,
+        },
+        "A785624E",
+        "add v7.8h, v13.8h, v2.8h",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Add,
+            rd: writable_vreg(18),
+            rn: vreg(9),
+            rm: vreg(6),
+            ty: I32X4,
+        },
+        "3285A64E",
+        "add v18.4s, v9.4s, v6.4s",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Add,
+            rd: writable_vreg(1),
+            rn: vreg(3),
+            rm: vreg(2),
+            ty: I64X2,
+        },
+        "6184E24E",
+        "add v1.2d, v3.2d, v2.2d",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sub,
+            rd: writable_vreg(5),
+            rn: vreg(1),
+            rm: vreg(1),
+            ty: I8X16,
+        },
+        "2584216E",
+        "sub v5.16b, v1.16b, v1.16b",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sub,
+            rd: writable_vreg(7),
+            rn: vreg(13),
+            rm: vreg(2),
+            ty: I16X8,
+        },
+        "A785626E",
+        "sub v7.8h, v13.8h, v2.8h",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sub,
+            rd: writable_vreg(18),
+            rn: vreg(9),
+            rm: vreg(6),
+            ty: I32X4,
+        },
+        "3285A66E",
+        "sub v18.4s, v9.4s, v6.4s",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sub,
+            rd: writable_vreg(18),
+            rn: vreg(0),
+            rm: vreg(8),
+            ty: I64X2,
+        },
+        "1284E86E",
+        "sub v18.2d, v0.2d, v8.2d",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Mul,
+            rd: writable_vreg(25),
+            rn: vreg(9),
+            rm: vreg(8),
+            ty: I8X16,
+        },
+        "399D284E",
+        "mul v25.16b, v9.16b, v8.16b",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Mul,
+            rd: writable_vreg(30),
+            rn: vreg(30),
+            rm: vreg(12),
+            ty: I16X8,
+        },
+        "DE9F6C4E",
+        "mul v30.8h, v30.8h, v12.8h",
+    ));
+
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Mul,
+            rd: writable_vreg(18),
+            rn: vreg(18),
+            rm: vreg(18),
+            ty: I32X4,
+        },
+        "529EB24E",
+        "mul v18.4s, v18.4s, v18.4s",
+    ));
+
     insns.push((
         Inst::VecMisc {
             op: VecMisc2::Not,
@@ -2352,6 +2484,50 @@ fn test_aarch64_binemit() {
         "mvn v2.16b, v1.16b",
     ));
 
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Neg,
+            rd: writable_vreg(8),
+            rn: vreg(12),
+            ty: I8X16,
+        },
+        "88B9206E",
+        "neg v8.16b, v12.16b",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Neg,
+            rd: writable_vreg(0),
+            rn: vreg(31),
+            ty: I16X8,
+        },
+        "E0BB606E",
+        "neg v0.8h, v31.8h",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Neg,
+            rd: writable_vreg(2),
+            rn: vreg(3),
+            ty: I32X4,
+        },
+        "62B8A06E",
+        "neg v2.4s, v3.4s",
+    ));
+
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Neg,
+            rd: writable_vreg(10),
+            rn: vreg(8),
+            ty: I64X2,
+        },
+        "0AB9E06E",
+        "neg v10.2d, v8.2d",
+    ));
+
     insns.push((
         Inst::VecLanes {
             op: VecLanesOp::Uminv,
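Note: each test triple above pairs an Inst with the exact bytes it must emit and the expected disassembly. The hex strings are the emitted bytes in memory order, and AArch64 stores instructions little-endian, so they read byte-reversed relative to the 32-bit word. A small sketch of that relationship, with a hypothetical helper name not found in the patch:

    // Illustrative helper: turn the byte-order hex strings used by these emit
    // tests back into the 32-bit words the CPU actually decodes.
    fn word_from_test_hex(hex: &str) -> u32 {
        assert_eq!(8, hex.len());
        let b: Vec<u8> = (0..8)
            .step_by(2)
            .map(|i| u8::from_str_radix(&hex[i..i + 2], 16).unwrap())
            .collect();
        u32::from_le_bytes([b[0], b[1], b[2], b[3]])
    }

    fn main() {
        // The "add v5.16b, v1.16b, v1.16b" test case above.
        assert_eq!(0x4E21_8425, word_from_test_hex("2584214E"));
    }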
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 1a5563d62a..9d229b8df8 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -243,13 +243,21 @@ pub enum VecALUOp {
     Bsl,
     /// Unsigned maximum pairwise
     Umaxp,
+    /// Add
+    Add,
+    /// Subtract
+    Sub,
+    /// Multiply
+    Mul,
 }
 
 /// A Vector miscellaneous operation with two registers.
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub enum VecMisc2 {
-    /// Bitwise NOT.
+    /// Bitwise NOT
     Not,
+    /// Negate
+    Neg,
 }
 
 /// An operation across the lanes of vectors.
@@ -2737,6 +2745,9 @@ impl ShowWithRRU for Inst {
                     VecALUOp::Eor => ("eor", true, I8X16),
                     VecALUOp::Bsl => ("bsl", true, I8X16),
                     VecALUOp::Umaxp => ("umaxp", true, ty),
+                    VecALUOp::Add => ("add", true, ty),
+                    VecALUOp::Sub => ("sub", true, ty),
+                    VecALUOp::Mul => ("mul", true, ty),
                 };
 
                 let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>, Type) -> String = if vector {
@@ -2750,14 +2761,10 @@ impl ShowWithRRU for Inst {
                 let rm = show_vreg_fn(rm, mb_rru, ty);
                 format!("{} {}, {}, {}", op, rd, rn, rm)
             }
-            &Inst::VecMisc {
-                op,
-                rd,
-                rn,
-                ty: _ty,
-            } => {
+            &Inst::VecMisc { op, rd, rn, ty } => {
                 let (op, ty) = match op {
                     VecMisc2::Not => ("mvn", I8X16),
+                    VecMisc2::Neg => ("neg", ty),
                 };
 
                 let rd = show_vreg_vector(rd.to_reg(), mb_rru, ty);
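Note: the new VecALUOp::Add/Sub/Mul and VecMisc2::Neg variants carry the wasm SIMD spec's lane-wise, wrapping integer arithmetic. A plain-Rust model of those semantics, illustrative only, with arrays standing in for vector registers:

    // Model of lane-wise wrapping addition, as wasm's i32x4.add requires.
    fn i32x4_add(a: [i32; 4], b: [i32; 4]) -> [i32; 4] {
        let mut out = [0i32; 4];
        for i in 0..4 {
            // Each lane wraps independently on overflow; lanes never interact.
            out[i] = a[i].wrapping_add(b[i]);
        }
        out
    }

    fn main() {
        // i32::MAX + 1 wraps to i32::MIN in lane 0; the other lanes are unaffected.
        assert_eq!(
            [i32::MIN, 4, 6, 8],
            i32x4_add([i32::MAX, 2, 3, 4], [1, 2, 3, 4])
        );
    }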
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 82eb35f13f..5c77a6a52d 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -58,18 +58,40 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         Opcode::Iadd => {
             let rd = get_output_reg(ctx, outputs[0]);
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
             let ty = ty.unwrap();
-            let alu_op = choose_32_64(ty, ALUOp::Add32, ALUOp::Add64);
-            ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            if ty_bits(ty) < 128 {
+                let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
+                let alu_op = choose_32_64(ty, ALUOp::Add32, ALUOp::Add64);
+                ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            } else {
+                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+                ctx.emit(Inst::VecRRR {
+                    rd,
+                    rn,
+                    rm,
+                    alu_op: VecALUOp::Add,
+                    ty,
+                });
+            }
         }
         Opcode::Isub => {
             let rd = get_output_reg(ctx, outputs[0]);
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
             let ty = ty.unwrap();
-            let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
-            ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            if ty_bits(ty) < 128 {
+                let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
+                let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
+                ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            } else {
+                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+                ctx.emit(Inst::VecRRR {
+                    rd,
+                    rn,
+                    rm,
+                    alu_op: VecALUOp::Sub,
+                    ty,
+                });
+            }
         }
         Opcode::UaddSat | Opcode::SaddSat => {
             // We use the vector instruction set's saturating adds (UQADD /
@@ -143,11 +165,21 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
         Opcode::Ineg => {
             let rd = get_output_reg(ctx, outputs[0]);
-            let rn = zero_reg();
-            let rm = put_input_in_rse_imm12(ctx, inputs[0], NarrowValueMode::None);
             let ty = ty.unwrap();
-            let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
-            ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            if ty_bits(ty) < 128 {
+                let rn = zero_reg();
+                let rm = put_input_in_rse_imm12(ctx, inputs[0], NarrowValueMode::None);
+                let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
+                ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            } else {
+                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                ctx.emit(Inst::VecMisc {
+                    op: VecMisc2::Neg,
+                    rd,
+                    rn,
+                    ty,
+                });
+            }
         }
 
         Opcode::Imul => {
@@ -155,14 +187,24 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let ty = ty.unwrap();
-            let alu_op = choose_32_64(ty, ALUOp::MAdd32, ALUOp::MAdd64);
-            ctx.emit(Inst::AluRRRR {
-                alu_op,
-                rd,
-                rn,
-                rm,
-                ra: zero_reg(),
-            });
+            if ty_bits(ty) < 128 {
+                let alu_op = choose_32_64(ty, ALUOp::MAdd32, ALUOp::MAdd64);
+                ctx.emit(Inst::AluRRRR {
+                    alu_op,
+                    rd,
+                    rn,
+                    rm,
+                    ra: zero_reg(),
+                });
+            } else {
+                ctx.emit(Inst::VecRRR {
+                    alu_op: VecALUOp::Mul,
+                    rd,
+                    rn,
+                    rm,
+                    ty,
+                });
+            }
         }
 
         Opcode::Umulhi | Opcode::Smulhi => {
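Note: all four lowerings share one dispatch shape. Types narrower than 128 bits keep the existing scalar ALU route, where the second operand may still fold into an imm12 or shifted form; 128-bit vector types force both operands into registers and emit the new VecRRR/VecMisc instructions. Mul additionally asserts ty != I64X2 because base AArch64 NEON has no MUL with 64-bit lanes, which is also why build.rs enables simd_i8x16_arith, simd_i16x8_arith, and simd_i32x4_arith but no i64x2 equivalent. Reduced to plain Rust, with stand-in types rather than Cranelift's real API:

    // Sketch of the dispatch this patch introduces; names here are stand-ins.
    #[derive(Debug, PartialEq)]
    enum Path {
        ScalarAlu,  // existing ALUOp/AluRRRR route, imm12 operands still allowed
        VectorSimd, // new VecRRR/VecMisc route, both operands in registers
    }

    fn pick_path(ty_bits: u32) -> Path {
        if ty_bits < 128 {
            Path::ScalarAlu
        } else {
            Path::VectorSimd
        }
    }

    fn main() {
        assert_eq!(Path::ScalarAlu, pick_path(64));   // e.g. scalar i64 mul -> madd
        assert_eq!(Path::VectorSimd, pick_path(128)); // e.g. i32x4 mul -> mul .4s
    }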