diff --git a/build.rs b/build.rs
index a6e2b7d8bc..28b1ca3636 100644
--- a/build.rs
+++ b/build.rs
@@ -179,7 +179,8 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
             _ => (),
         },
         "Cranelift" => match (testsuite, testname) {
-            // All simd tests are known to fail on aarch64 for now, it's going
+            ("simd", "simd_store") => return false,
+            // Most simd tests are known to fail on aarch64 for now, it's going
             // to be a big chunk of work to implement them all there!
             ("simd", _) if target.contains("aarch64") => return true,
 
diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs
index 8f388665b5..23b824fdb7 100644
--- a/cranelift/codegen/src/isa/aarch64/abi.rs
+++ b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -280,7 +280,7 @@ fn in_int_reg(ty: ir::Type) -> bool {
 
 fn in_vec_reg(ty: ir::Type) -> bool {
     match ty {
-        types::F32 | types::F64 => true,
+        types::F32 | types::F64 | types::I8X16 => true,
         _ => false,
     }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs
index 4b8142fbe5..3648eba2d0 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -480,11 +480,14 @@ impl ShowWithRRU for BranchTarget {
 }
 
 /// Type used to communicate the operand size of a machine instruction, as AArch64 has 32- and
-/// 64-bit variants of many instructions (and integer registers).
+/// 64-bit variants of many instructions (and integer and floating-point registers) and 128-bit
+/// variants of vector instructions.
+/// TODO: Create a separate type for SIMD & floating-point operands.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum InstSize {
     Size32,
     Size64,
+    Size128,
 }
 
 impl InstSize {
@@ -507,11 +510,13 @@ impl InstSize {
     /// Convert from a needed width to the smallest size that fits.
     pub fn from_bits<I: Into<usize>>(bits: I) -> InstSize {
         let bits: usize = bits.into();
-        assert!(bits <= 64);
+        assert!(bits <= 128);
         if bits <= 32 {
             InstSize::Size32
-        } else {
+        } else if bits <= 64 {
             InstSize::Size64
+        } else {
+            InstSize::Size128
         }
     }
 
@@ -520,11 +525,12 @@ impl InstSize {
         Self::from_bits(ty_bits(ty))
     }
 
-    /// Convert to I32 or I64.
+    /// Convert to I32, I64, or I128.
     pub fn to_ty(self) -> Type {
         match self {
             InstSize::Size32 => I32,
             InstSize::Size64 => I64,
+            InstSize::Size128 => I128,
         }
     }
 
@@ -532,6 +538,9 @@ impl InstSize {
         match self {
             InstSize::Size32 => 0,
             InstSize::Size64 => 1,
+            _ => {
+                panic!("Unexpected size");
+            }
         }
     }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 1bf59814ba..c0cbdd1f25 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -295,8 +295,8 @@ fn enc_ccmp_imm(size: InstSize, rn: Reg, imm: UImm5, nzcv: NZCV, cond: Cond) ->
 }
 
 fn enc_vecmov(is_16b: bool, rd: Writable<Reg>, rn: Reg) -> u32 {
-    debug_assert!(!is_16b); // to be supported later.
     0b00001110_101_00000_00011_1_00000_00000
+        | ((is_16b as u32) << 30)
         | machreg_to_vec(rd.to_reg())
         | (machreg_to_vec(rn) << 16)
         | (machreg_to_vec(rn) << 5)
@@ -918,6 +918,9 @@ impl MachInstEmit for Inst {
             &Inst::FpuMove64 { rd, rn } => {
                 sink.put4(enc_vecmov(/* 16b = */ false, rd, rn));
             }
+            &Inst::FpuMove128 { rd, rn } => {
+                sink.put4(enc_vecmov(/* 16b = */ true, rd, rn));
+            }
             &Inst::FpuRR { fpu_op, rd, rn } => {
                 let top22 = match fpu_op {
                     FPUOp1::Abs32 => 0b000_11110_00_1_000001_10000,
@@ -1073,6 +1076,22 @@ impl MachInstEmit for Inst {
                 inst.emit(sink, flags, state);
                 sink.put8(const_data.to_bits());
             }
+            &Inst::LoadFpuConst128 { rd, const_data } => {
+                let inst = Inst::FpuLoad128 {
+                    rd,
+                    mem: MemArg::Label(MemLabel::PCRel(8)),
+                    srcloc: None,
+                };
+                inst.emit(sink, flags, state);
+                let inst = Inst::Jump {
+                    dest: BranchTarget::ResolvedOffset(20),
+                };
+                inst.emit(sink, flags, state);
+
+                for i in const_data.to_le_bytes().iter() {
+                    sink.put1(*i);
+                }
+            }
             &Inst::FpuCSel32 { rd, rn, rm, cond } => {
                 sink.put4(enc_fcsel(rd, rn, rm, cond, InstSize::Size32));
             }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index b948f4fd8c..1dd6be20eb 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2164,6 +2164,15 @@ fn test_aarch64_binemit() {
         "mov v8.8b, v4.8b",
     ));
 
+    insns.push((
+        Inst::FpuMove128 {
+            rd: writable_vreg(17),
+            rn: vreg(26),
+        },
+        "511FBA4E",
+        "mov v17.16b, v26.16b",
+    ));
+
     insns.push((
         Inst::FpuRR {
             fpu_op: FPUOp1::Abs32,
@@ -2726,6 +2735,15 @@ fn test_aarch64_binemit() {
         "ldr d16, pc+8 ; b 12 ; data.f64 1",
     ));
 
+    insns.push((
+        Inst::LoadFpuConst128 {
+            rd: writable_vreg(5),
+            const_data: 0x0f0e0d0c0b0a09080706050403020100,
+        },
+        "4500009C05000014000102030405060708090A0B0C0D0E0F",
+        "ldr q5, pc+8 ; b 20 ; data.f128 0x0f0e0d0c0b0a09080706050403020100",
+    ));
+
     insns.push((
         Inst::FpuCSel32 {
             rd: writable_vreg(1),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 989ce96f6c..1cf307d1d0 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -4,7 +4,9 @@
 #![allow(dead_code)]
 
 use crate::binemit::CodeOffset;
-use crate::ir::types::{B1, B16, B32, B64, B8, F32, F32X2, F64, FFLAGS, I16, I32, I64, I8, IFLAGS};
+use crate::ir::types::{
+    B1, B16, B32, B64, B8, F32, F32X2, F64, FFLAGS, I128, I16, I32, I64, I8, I8X16, IFLAGS,
+};
 use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode, Type};
 use crate::machinst::*;
 use crate::{settings, CodegenError, CodegenResult};
@@ -470,6 +472,12 @@ pub enum Inst {
         rn: Reg,
     },
 
+    /// Vector register move.
+    FpuMove128 {
+        rd: Writable<Reg>,
+        rn: Reg,
+    },
+
     /// 1-op FPU instruction.
     FpuRR {
         fpu_op: FPUOp1,
@@ -559,6 +567,11 @@ pub enum Inst {
         const_data: f64,
     },
 
+    LoadFpuConst128 {
+        rd: Writable<Reg>,
+        const_data: u128,
+    },
+
     /// Conversion: FP -> integer.
     FpuToInt {
         op: FpuToIntOp,
@@ -816,6 +829,11 @@ impl Inst {
                 rd: to_reg,
                 rm: from_reg,
             }
+        } else if from_reg.get_class() == RegClass::V128 {
+            Inst::FpuMove128 {
+                rd: to_reg,
+                rn: from_reg,
+            }
         } else {
             Inst::FpuMove64 {
                 rd: to_reg,
@@ -905,6 +923,14 @@ impl Inst {
             const_data: value,
         }
     }
+
+    /// Create an instruction that loads a 128-bit vector constant.
+    pub fn load_fp_constant128(rd: Writable<Reg>, value: u128) -> Inst {
+        Inst::LoadFpuConst128 {
+            rd,
+            const_data: value,
+        }
+    }
 }
 
 //=============================================================================
@@ -1044,6 +1070,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             collector.add_def(rd);
             collector.add_use(rn);
         }
+        &Inst::FpuMove128 { rd, rn } => {
+            collector.add_def(rd);
+            collector.add_use(rn);
+        }
         &Inst::FpuRR { rd, rn, .. } => {
             collector.add_def(rd);
             collector.add_use(rn);
@@ -1094,7 +1124,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             collector.add_use(rd);
             memarg_regs(mem, collector);
         }
-        &Inst::LoadFpuConst32 { rd, .. } | &Inst::LoadFpuConst64 { rd, .. } => {
+        &Inst::LoadFpuConst32 { rd, .. }
+        | &Inst::LoadFpuConst64 { rd, .. }
+        | &Inst::LoadFpuConst128 { rd, .. } => {
             collector.add_def(rd);
         }
         &Inst::FpuToInt { rd, rn, .. } => {
@@ -1490,6 +1522,13 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
             map_def(mapper, rd);
             map_use(mapper, rn);
         }
+        &mut Inst::FpuMove128 {
+            ref mut rd,
+            ref mut rn,
+        } => {
+            map_def(mapper, rd);
+            map_use(mapper, rn);
+        }
         &mut Inst::FpuRR {
             ref mut rd,
             ref mut rn,
@@ -1596,6 +1635,9 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
         &mut Inst::LoadFpuConst64 { ref mut rd, .. } => {
             map_def(mapper, rd);
         }
+        &mut Inst::LoadFpuConst128 { ref mut rd, .. } => {
+            map_def(mapper, rd);
+        }
         &mut Inst::FpuToInt {
             ref mut rd,
             ref mut rn,
@@ -1780,6 +1822,7 @@ impl MachInst for Inst {
         match self {
             &Inst::Mov { rd, rm } => Some((rd, rm)),
             &Inst::FpuMove64 { rd, rn } => Some((rd, rn)),
+            &Inst::FpuMove128 { rd, rn } => Some((rd, rn)),
             _ => None,
         }
     }
@@ -1813,7 +1856,7 @@ impl MachInst for Inst {
     }
 
     fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Inst {
-        assert!(ty.bits() <= 64); // no vector support yet!
+        assert!(ty.bits() <= 128);
         Inst::mov(to_reg, from_reg)
     }
 
@@ -1865,6 +1908,7 @@ impl MachInst for Inst {
             I8 | I16 | I32 | I64 | B1 | B8 | B16 | B32 | B64 => Ok(RegClass::I64),
             F32 | F64 => Ok(RegClass::V128),
             IFLAGS | FFLAGS => Ok(RegClass::I64),
+            I8X16 => Ok(RegClass::V128),
             _ => Err(CodegenError::Unsupported(format!(
                 "Unexpected SSA-value type: {}",
                 ty
@@ -2235,6 +2279,11 @@ impl ShowWithRRU for Inst {
                 let rn = rn.show_rru(mb_rru);
                 format!("mov {}.8b, {}.8b", rd, rn)
            }
+            &Inst::FpuMove128 { rd, rn } => {
+                let rd = rd.to_reg().show_rru(mb_rru);
+                let rn = rn.show_rru(mb_rru);
+                format!("mov {}.16b, {}.16b", rd, rn)
+            }
             &Inst::FpuRR { fpu_op, rd, rn } => {
                 let (op, sizesrc, sizedest) = match fpu_op {
                     FPUOp1::Abs32 => ("fabs", InstSize::Size32, InstSize::Size32),
@@ -2360,6 +2409,10 @@ impl ShowWithRRU for Inst {
                 let rd = show_freg_sized(rd.to_reg(), mb_rru, InstSize::Size64);
                 format!("ldr {}, pc+8 ; b 12 ; data.f64 {}", rd, const_data)
             }
+            &Inst::LoadFpuConst128 { rd, const_data } => {
+                let rd = show_freg_sized(rd.to_reg(), mb_rru, InstSize::Size128);
+                format!("ldr {}, pc+8 ; b 20 ; data.f128 0x{:032x}", rd, const_data)
+            }
             &Inst::FpuToInt { op, rd, rn } => {
                 let (op, sizesrc, sizedest) = match op {
                     FpuToIntOp::F32ToI32 => ("fcvtzs", InstSize::Size32, InstSize::Size32),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
index 242fb66fc9..7e13e33ac8 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
@@ -276,13 +276,17 @@ pub fn show_ireg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: InstSiz
     s
 }
 
-/// Show a vector register when its use as a 32-bit or 64-bit float is known.
+/// Show a vector register.
 pub fn show_freg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: InstSize) -> String {
     let mut s = reg.show_rru(mb_rru);
     if reg.get_class() != RegClass::V128 {
         return s;
     }
-    let prefix = if size.is32() { "s" } else { "d" };
+    let prefix = match size {
+        InstSize::Size32 => "s",
+        InstSize::Size64 => "d",
+        InstSize::Size128 => "q",
+    };
     s.replace_range(0..1, prefix);
     s
 }
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index eb4aafd551..68ad4017e1 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -142,6 +142,31 @@ pub(crate) fn input_to_shiftimm<C: LowerCtx<I = Inst>>(
     input_to_const(ctx, input).and_then(ShiftOpShiftImm::maybe_from_shift)
 }
 
+pub(crate) fn output_to_const_f128<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    out: InsnOutput,
+) -> Option<u128> {
+    if out.output > 0 {
+        None
+    } else {
+        let inst_data = ctx.data(out.insn);
+
+        match inst_data {
+            &InstructionData::UnaryConst {
+                opcode: _,
+                constant_handle,
+            } => {
+                let mut bytes = [0u8; 16];
+                let c = ctx.get_constant_data(constant_handle).clone().into_vec();
+                assert_eq!(c.len(), 16);
+                bytes.copy_from_slice(&c);
+                Some(u128::from_le_bytes(bytes))
+            }
+            _ => None,
+        }
+    }
+}
+
 /// How to handle narrow values loaded into registers; see note on `narrow_mode`
 /// parameter to `input_to_*` below.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -588,6 +613,14 @@ pub(crate) fn lower_constant_f64<C: LowerCtx<I = Inst>>(
     ctx.emit(Inst::load_fp_constant64(rd, value));
 }
 
+pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    rd: Writable<Reg>,
+    value: u128,
+) {
+    ctx.emit(Inst::load_fp_constant128(rd, value));
+}
+
 pub(crate) fn lower_condcode(cc: IntCC) -> Cond {
     match cc {
         IntCC::Equal => Cond::Eq,
@@ -679,6 +712,7 @@ pub fn ty_bits(ty: Type) -> usize {
         B64 | I64 | F64 => 64,
         B128 | I128 => 128,
         IFLAGS | FFLAGS => 32,
+        I8X16 => 128,
         _ => panic!("ty_bits() on unknown type: {:?}", ty),
     }
 }
@@ -686,7 +720,7 @@ pub fn ty_bits(ty: Type) -> usize {
 pub(crate) fn ty_is_int(ty: Type) -> bool {
     match ty {
         B1 | B8 | I8 | B16 | I16 | B32 | I32 | B64 | I64 => true,
-        F32 | F64 | B128 | I128 => false,
+        F32 | F64 | B128 | I128 | I8X16 => false,
         IFLAGS | FFLAGS => panic!("Unexpected flags type"),
         _ => panic!("ty_is_int() on unknown type: {:?}", ty),
     }
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 2946e16471..2faa66941f 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -875,6 +875,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 (32, _, true) => Inst::FpuLoad32 { rd, mem, srcloc },
                 (64, _, false) => Inst::ULoad64 { rd, mem, srcloc },
                 (64, _, true) => Inst::FpuLoad64 { rd, mem, srcloc },
+                (128, _, _) => Inst::FpuLoad128 { rd, mem, srcloc },
                 _ => panic!("Unsupported size in load"),
             });
         }
@@ -914,6 +915,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 (32, true) => Inst::FpuStore32 { rd, mem, srcloc },
                 (64, false) => Inst::Store64 { rd, mem, srcloc },
                 (64, true) => Inst::FpuStore64 { rd, mem, srcloc },
+                (128, _) => Inst::FpuStore128 { rd, mem, srcloc },
                 _ => panic!("Unsupported size in store"),
             });
         }
@@ -1342,8 +1344,13 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             panic!("Branch opcode reached non-branch lowering logic!");
         }
 
-        Opcode::Vconst
-        | Opcode::Shuffle
+        Opcode::Vconst => {
+            let value = output_to_const_f128(ctx, outputs[0]).unwrap();
+            let rd = output_to_reg(ctx, outputs[0]);
+            lower_constant_f128(ctx, rd, value);
+        }
+
+        Opcode::Shuffle
         | Opcode::Vsplit
         | Opcode::Vconcat
         | Opcode::Vselect
diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs
index 76663450ba..fabfdecc6a 100644
--- a/cranelift/codegen/src/machinst/lower.rs
+++ b/cranelift/codegen/src/machinst/lower.rs
@@ -7,8 +7,8 @@ use crate::fx::{FxHashMap, FxHashSet};
 use crate::inst_predicates::{has_side_effect_or_load, is_constant_64bit};
 use crate::ir::instructions::BranchInfo;
 use crate::ir::{
-    ArgumentExtension, Block, ExternalName, Function, GlobalValueData, Inst, InstructionData,
-    MemFlags, Opcode, Signature, SourceLoc, Type, Value, ValueDef,
+    ArgumentExtension, Block, Constant, ConstantData, ExternalName, Function, GlobalValueData,
+    Inst, InstructionData, MemFlags, Opcode, Signature, SourceLoc, Type, Value, ValueDef,
 };
 use crate::machinst::{
     ABIBody, BlockIndex, BlockLoweringOrder, LoweredBlock, MachLabel, VCode, VCodeBuilder,
@@ -145,6 +145,8 @@ pub trait LowerCtx {
     /// `get_input()`. Codegen may not happen otherwise for the producing
     /// instruction if it has no side effects and no uses.
     fn use_input_reg(&mut self, input: LowerInput);
+    /// Retrieve constant data given a handle.
+    fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData;
 }
 
 /// A representation of all of the ways in which an instruction input is
@@ -913,6 +915,10 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> {
         debug!("use_input_reg: vreg {:?} is needed", input.reg);
         self.vreg_needed[input.reg.get_index()] = true;
     }
+
+    fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData {
+        self.f.dfg.constants.get(constant_handle)
+    }
 }
 
 /// Visit all successors of a block with a given visitor closure.
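
Illustration (not part of the patch): a self-contained Rust sketch of how the widened InstSize::from_bits() conversion in cranelift/codegen/src/isa/aarch64/inst/args.rs above is expected to behave once Size128 exists. The enum here is a simplified stand-in for the real type, which also carries register-naming and encoding helpers.

// Standalone sketch (not the real Cranelift type): mirrors the InstSize
// variants and the widened from_bits() logic from the patch above.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum InstSize {
    Size32,
    Size64,
    Size128,
}

impl InstSize {
    /// Convert from a needed width to the smallest size that fits.
    fn from_bits<I: Into<usize>>(bits: I) -> InstSize {
        let bits: usize = bits.into();
        assert!(bits <= 128);
        if bits <= 32 {
            InstSize::Size32
        } else if bits <= 64 {
            InstSize::Size64
        } else {
            InstSize::Size128
        }
    }
}

fn main() {
    // 1..=32 bits pick the 32-bit operand size, 33..=64 the 64-bit one, and
    // anything wider (e.g. an I8X16 vector at 128 bits) now maps to Size128.
    assert_eq!(InstSize::from_bits(1usize), InstSize::Size32);
    assert_eq!(InstSize::from_bits(64usize), InstSize::Size64);
    assert_eq!(InstSize::from_bits(128usize), InstSize::Size128);
}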