diff --git a/build.rs b/build.rs index 3841b85303..27ab619076 100644 --- a/build.rs +++ b/build.rs @@ -180,6 +180,7 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { }, "Cranelift" => match (testsuite, testname) { ("simd", "simd_store") => return false, + ("simd", "simd_i8x16_cmp") => return false, // Most simd tests are known to fail on aarch64 for now, it's going // to be a big chunk of work to implement them all there! ("simd", _) if target.contains("aarch64") => return true, diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index c0cbdd1f25..4d257aee4f 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -342,6 +342,12 @@ fn enc_fround(top22: u32, rd: Writable, rn: Reg) -> u32 { (top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg()) } +fn enc_vec_rr_misc(bits_12_16: u32, rd: Writable, rn: Reg) -> u32 { + debug_assert_eq!(bits_12_16 & 0b11111, bits_12_16); + let bits = 0b0_1_1_01110_00_10000_00000_10_00000_00000; + bits | bits_12_16 << 12 | machreg_to_vec(rn) << 5 | machreg_to_vec(rd.to_reg()) +} + /// State carried between emissions of a sequence of instructions. 
#[derive(Default, Clone, Debug)] pub struct EmitState { @@ -1002,6 +1008,15 @@ impl MachInstEmit for Inst { }; sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra)); } + &Inst::VecMisc { op, rd, rn, ty } => { + let bits_12_16 = match op { + VecMisc2::Not => { + debug_assert_eq!(I8X16, ty); + 0b00101 + } + }; + sink.put4(enc_vec_rr_misc(bits_12_16, rd, rn)); + } &Inst::FpuCmp32 { rn, rm } => { sink.put4(enc_fcmp(InstSize::Size32, rn, rm)); } @@ -1125,12 +1140,40 @@ impl MachInstEmit for Inst { | machreg_to_gpr(rd.to_reg()), ); } - &Inst::VecRRR { rd, rn, rm, alu_op } => { + &Inst::VecRRR { + rd, + rn, + rm, + alu_op, + ty, + } => { + let enc_size_for_cmp = match ty { + I8X16 => 0b00, + _ => 0, + }; + let (top11, bit15_10) = match alu_op { - VecALUOp::SQAddScalar => (0b010_11110_11_1, 0b000011), - VecALUOp::SQSubScalar => (0b010_11110_11_1, 0b001011), - VecALUOp::UQAddScalar => (0b011_11110_11_1, 0b000011), - VecALUOp::UQSubScalar => (0b011_11110_11_1, 0b001011), + VecALUOp::SQAddScalar => { + debug_assert_eq!(I64, ty); + (0b010_11110_11_1, 0b000011) + } + VecALUOp::SQSubScalar => { + debug_assert_eq!(I64, ty); + (0b010_11110_11_1, 0b001011) + } + VecALUOp::UQAddScalar => { + debug_assert_eq!(I64, ty); + (0b011_11110_11_1, 0b000011) + } + VecALUOp::UQSubScalar => { + debug_assert_eq!(I64, ty); + (0b011_11110_11_1, 0b001011) + } + VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b100011), + VecALUOp::Cmge => (0b010_01110_00_1 | enc_size_for_cmp << 1, 0b001111), + VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size_for_cmp << 1, 0b001101), + VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b001101), + VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b001111), }; sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd)); } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index 1dd6be20eb..8507100401 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ 
b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -1802,6 +1802,7 @@ fn test_aarch64_binemit() { rn: vreg(22), rm: vreg(23), alu_op: VecALUOp::UQAddScalar, + ty: I64, }, "D50EF77E", "uqadd d21, d22, d23", @@ -1812,6 +1813,7 @@ fn test_aarch64_binemit() { rn: vreg(22), rm: vreg(23), alu_op: VecALUOp::SQAddScalar, + ty: I64, }, "D50EF75E", "sqadd d21, d22, d23", @@ -1822,6 +1824,7 @@ fn test_aarch64_binemit() { rn: vreg(22), rm: vreg(23), alu_op: VecALUOp::UQSubScalar, + ty: I64, }, "D52EF77E", "uqsub d21, d22, d23", @@ -1832,10 +1835,83 @@ fn test_aarch64_binemit() { rn: vreg(22), rm: vreg(23), alu_op: VecALUOp::SQSubScalar, + ty: I64, }, "D52EF75E", "sqsub d21, d22, d23", )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmeq, + rd: writable_vreg(3), + rn: vreg(23), + rm: vreg(24), + ty: I8X16, + }, + "E38E386E", + "cmeq v3.16b, v23.16b, v24.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmgt, + rd: writable_vreg(3), + rn: vreg(23), + rm: vreg(24), + ty: I8X16, + }, + "E336384E", + "cmgt v3.16b, v23.16b, v24.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmge, + rd: writable_vreg(23), + rn: vreg(9), + rm: vreg(12), + ty: I8X16, + }, + "373D2C4E", + "cmge v23.16b, v9.16b, v12.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmhi, + rd: writable_vreg(5), + rn: vreg(1), + rm: vreg(1), + ty: I8X16, + }, + "2534216E", + "cmhi v5.16b, v1.16b, v1.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmhs, + rd: writable_vreg(8), + rn: vreg(2), + rm: vreg(15), + ty: I8X16, + }, + "483C2F6E", + "cmhs v8.16b, v2.16b, v15.16b", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Not, + rd: writable_vreg(2), + rn: vreg(1), + ty: I8X16, + }, + "2258206E", + "mvn v2.16b, v1.16b", + )); + insns.push(( Inst::Extend { rd: writable_xreg(1), diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 1cf307d1d0..7818092565 100644 --- 
a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -5,7 +5,7 @@ use crate::binemit::CodeOffset; use crate::ir::types::{ - B1, B16, B32, B64, B8, F32, F32X2, F64, FFLAGS, I128, I16, I32, I64, I8, I8X16, IFLAGS, + B1, B16, B32, B64, B8, B8X16, F32, F32X2, F64, FFLAGS, I128, I16, I32, I64, I8, I8X16, IFLAGS, }; use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode, Type}; use crate::machinst::*; @@ -197,6 +197,23 @@ pub enum VecALUOp { SQSubScalar, /// Unsigned saturating subtract UQSubScalar, + /// Compare bitwise equal + Cmeq, + /// Compare signed greater than or equal + Cmge, + /// Compare signed greater than + Cmgt, + /// Compare unsigned higher or same + Cmhs, + /// Compare unsigned higher + Cmhi, +} + +/// A Vector miscellaneous operation with two registers. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum VecMisc2 { + /// Bitwise NOT. + Not, } /// An operation on the bits of a register. This can be paired with several instruction formats @@ -626,6 +643,15 @@ pub enum Inst { rd: Writable<Reg>, rn: Reg, rm: Reg, + ty: Type, + }, + + /// Vector two register miscellaneous instruction. + VecMisc { + op: VecMisc2, + rd: Writable<Reg>, + rn: Reg, + ty: Type, }, /// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn). @@ -1096,6 +1122,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_use(rm); collector.add_use(ra); } + &Inst::VecMisc { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } &Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => { collector.add_use(rn); collector.add_use(rm); } @@ -1567,6 +1597,14 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RUM) { map_use(mapper, rm); map_use(mapper, ra); } + &mut Inst::VecMisc { + ref mut rd, + ref mut rn, + ..
+ } => { + map_def(mapper, rd); + map_use(mapper, rn); + } &mut Inst::FpuCmp32 { ref mut rn, ref mut rm, @@ -1909,6 +1947,7 @@ impl MachInst for Inst { F32 | F64 => Ok(RegClass::V128), IFLAGS | FFLAGS => Ok(RegClass::I64), I8X16 => Ok(RegClass::V128), + B8X16 => Ok(RegClass::V128), _ => Err(CodegenError::Unsupported(format!( "Unexpected SSA-value type: {}", ty @@ -2482,18 +2521,45 @@ impl ShowWithRRU for Inst { let rn = rn.show_rru(mb_rru); format!("mov {}, {}.d[0]", rd, rn) } - &Inst::VecRRR { rd, rn, rm, alu_op } => { - let op = match alu_op { - VecALUOp::SQAddScalar => "sqadd", - VecALUOp::UQAddScalar => "uqadd", - VecALUOp::SQSubScalar => "sqsub", - VecALUOp::UQSubScalar => "uqsub", + &Inst::VecRRR { + rd, + rn, + rm, + alu_op, + ty, + } => { + let (op, vector) = match alu_op { + VecALUOp::SQAddScalar => ("sqadd", false), + VecALUOp::UQAddScalar => ("uqadd", false), + VecALUOp::SQSubScalar => ("sqsub", false), + VecALUOp::UQSubScalar => ("uqsub", false), + VecALUOp::Cmeq => ("cmeq", true), + VecALUOp::Cmge => ("cmge", true), + VecALUOp::Cmgt => ("cmgt", true), + VecALUOp::Cmhs => ("cmhs", true), + VecALUOp::Cmhi => ("cmhi", true), }; - let rd = show_vreg_scalar(rd.to_reg(), mb_rru); - let rn = show_vreg_scalar(rn, mb_rru); - let rm = show_vreg_scalar(rm, mb_rru); + + let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>, Type) -> String = if vector { + |reg, mb_rru, ty| show_vreg_vector(reg, mb_rru, ty) + } else { + |reg, mb_rru, _ty| show_vreg_scalar(reg, mb_rru) + }; + + let rd = show_vreg_fn(rd.to_reg(), mb_rru, ty); + let rn = show_vreg_fn(rn, mb_rru, ty); + let rm = show_vreg_fn(rm, mb_rru, ty); format!("{} {}, {}, {}", op, rd, rn, rm) } + &Inst::VecMisc { op, rd, rn, ty } => { + let op = match op { + VecMisc2::Not => "mvn", + }; + + let rd = show_vreg_vector(rd.to_reg(), mb_rru, ty); + let rn = show_vreg_vector(rn, mb_rru, ty); + format!("{} {}, {}", op, rd, rn) + } &Inst::MovToNZCV { rn } => { let rn = rn.show_rru(mb_rru); format!("msr nzcv, {}", rn) diff 
--git a/cranelift/codegen/src/isa/aarch64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs index 7e13e33ac8..cebcf6ec30 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs @@ -319,6 +319,7 @@ pub fn show_vreg_vector(reg: Reg, mb_rru: Option<&RealRegUniverse>, ty: Type) -> let mut s = reg.show_rru(mb_rru); match ty { + I8X16 => s.push_str(".16b"), F32X2 => s.push_str(".2s"), _ => unimplemented!(), } diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index 68ad4017e1..10db3b1f07 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -277,6 +277,7 @@ pub(crate) fn input_to_reg>( tmp.to_reg() } (_, 64) => in_reg, + (_, 128) => in_reg, _ => panic!( "Unsupported input width: input ty {} bits {} mode {:?}", @@ -712,7 +713,7 @@ pub fn ty_bits(ty: Type) -> usize { B64 | I64 | F64 => 64, B128 | I128 => 128, IFLAGS | FFLAGS => 32, - I8X16 => 128, + I8X16 | B8X16 => 128, _ => panic!("ty_bits() on unknown type: {:?}", ty), } } diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 2faa66941f..95bf050958 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -7,7 +7,7 @@ use crate::ir::Inst as IRInst; use crate::ir::{InstructionData, Opcode, TrapCode}; use crate::machinst::lower::*; use crate::machinst::*; -use crate::CodegenResult; +use crate::{CodegenError, CodegenResult}; use crate::isa::aarch64::abi::*; use crate::isa::aarch64::inst::*; @@ -96,6 +96,7 @@ pub(crate) fn lower_insn_to_regs>( rn: va.to_reg(), rm: vb.to_reg(), alu_op, + ty: I64, }); ctx.emit(Inst::MovFromVec64 { rd, @@ -127,6 +128,7 @@ pub(crate) fn lower_insn_to_regs>( rn: va.to_reg(), rm: vb.to_reg(), alu_op, + ty: I64, }); ctx.emit(Inst::MovFromVec64 { rd, @@ -1152,12 +1154,66 @@ pub(crate) fn lower_insn_to_regs>( 
(false, true) => NarrowValueMode::SignExtend64, (false, false) => NarrowValueMode::ZeroExtend64, }; - let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64); - let rn = input_to_reg(ctx, inputs[0], narrow_mode); - let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode); - let rd = output_to_reg(ctx, outputs[0]); - ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm)); - ctx.emit(Inst::CondSet { cond, rd }); + + if ty_bits(ty) < 128 { + let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64); + let rn = input_to_reg(ctx, inputs[0], narrow_mode); + let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm)); + ctx.emit(Inst::CondSet { cond, rd }); + } else { + if ty != I8X16 { + return Err(CodegenError::Unsupported(format!( + "unsupported simd type: {:?}", + ty + ))); + } + + let mut rn = input_to_reg(ctx, inputs[0], narrow_mode); + let mut rm = input_to_reg(ctx, inputs[1], narrow_mode); + let rd = output_to_reg(ctx, outputs[0]); + + // 'Less than' operations are implemented by swapping + // the order of operands and using the 'greater than' + // instructions. + // 'Not equal' is implemented with 'equal' and inverting + // the result. 
+ let (alu_op, swap) = match cond { + Cond::Eq => (VecALUOp::Cmeq, false), + Cond::Ne => (VecALUOp::Cmeq, false), + Cond::Ge => (VecALUOp::Cmge, false), + Cond::Gt => (VecALUOp::Cmgt, false), + Cond::Le => (VecALUOp::Cmge, true), + Cond::Lt => (VecALUOp::Cmgt, true), + Cond::Hs => (VecALUOp::Cmhs, false), + Cond::Hi => (VecALUOp::Cmhi, false), + Cond::Ls => (VecALUOp::Cmhs, true), + Cond::Lo => (VecALUOp::Cmhi, true), + _ => unreachable!(), + }; + + if swap { + std::mem::swap(&mut rn, &mut rm); + } + + ctx.emit(Inst::VecRRR { + alu_op, + rd, + rn, + rm, + ty, + }); + + if cond == Cond::Ne { + ctx.emit(Inst::VecMisc { + op: VecMisc2::Not, + rd, + rn: rd.to_reg(), + ty: I8X16, + }); + } + } } Opcode::Fcmp => { @@ -1350,6 +1406,13 @@ pub(crate) fn lower_insn_to_regs>( lower_constant_f128(ctx, rd, value); } + Opcode::RawBitcast => { + let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + let ty = ctx.input_ty(insn, 0); + ctx.emit(Inst::gen_move(rd, rm, ty)); + } + Opcode::Shuffle | Opcode::Vsplit | Opcode::Vconcat @@ -1359,7 +1422,6 @@ pub(crate) fn lower_insn_to_regs>( | Opcode::Splat | Opcode::Insertlane | Opcode::Extractlane - | Opcode::RawBitcast | Opcode::ScalarToVector | Opcode::Swizzle | Opcode::Uload8x8