diff --git a/cranelift/codegen/src/inst_predicates.rs b/cranelift/codegen/src/inst_predicates.rs
index 1aac4be2fd..2160c83e4f 100644
--- a/cranelift/codegen/src/inst_predicates.rs
+++ b/cranelift/codegen/src/inst_predicates.rs
@@ -1,6 +1,7 @@
 //! Instruction predicates/properties, shared by various analyses.
 
 use crate::ir::{DataFlowGraph, Function, Inst, InstructionData, Opcode};
+use crate::machinst::ty_bits;
 use cranelift_entity::EntityRef;
 
 /// Preserve instructions with used result values.
@@ -59,7 +60,21 @@ pub fn is_constant_64bit(func: &Function, inst: Inst) -> Option<u64> {
         &InstructionData::UnaryImm { imm, .. } => Some(imm.bits() as u64),
         &InstructionData::UnaryIeee32 { imm, .. } => Some(imm.bits() as u64),
         &InstructionData::UnaryIeee64 { imm, .. } => Some(imm.bits()),
-        &InstructionData::UnaryBool { imm, .. } => Some(if imm { 1 } else { 0 }),
+        &InstructionData::UnaryBool { imm, .. } => {
+            let imm = if imm {
+                let bits = ty_bits(func.dfg.value_type(func.dfg.inst_results(inst)[0]));
+
+                if bits < 64 {
+                    (1u64 << bits) - 1
+                } else {
+                    u64::MAX
+                }
+            } else {
+                0
+            };
+
+            Some(imm)
+        }
         _ => None,
     }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs
index 95bf4bb63f..6c13a0fd7d 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -609,10 +609,27 @@ pub enum VectorSize {
 }
 
 impl VectorSize {
+    /// Get the vector operand size with the given scalar size as lane size.
+    pub fn from_lane_size(size: ScalarSize, is_128bit: bool) -> VectorSize {
+        match (size, is_128bit) {
+            (ScalarSize::Size8, false) => VectorSize::Size8x8,
+            (ScalarSize::Size8, true) => VectorSize::Size8x16,
+            (ScalarSize::Size16, false) => VectorSize::Size16x4,
+            (ScalarSize::Size16, true) => VectorSize::Size16x8,
+            (ScalarSize::Size32, false) => VectorSize::Size32x2,
+            (ScalarSize::Size32, true) => VectorSize::Size32x4,
+            (ScalarSize::Size64, true) => VectorSize::Size64x2,
+            _ => panic!("Unexpected scalar FP operand size: {:?}", size),
+        }
+    }
+
     /// Convert from a type into a vector operand size.
     pub fn from_ty(ty: Type) -> VectorSize {
         match ty {
+            B8X16 => VectorSize::Size8x16,
+            B16X8 => VectorSize::Size16x8,
             B32X4 => VectorSize::Size32x4,
+            B64X2 => VectorSize::Size64x2,
             F32X2 => VectorSize::Size32x2,
             F32X4 => VectorSize::Size32x4,
             F64X2 => VectorSize::Size64x2,
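Editor's note: the `enc_asimd_mod_imm` helper added to emit.rs below packs the 8-bit payload of an ASIMD modified-immediate instruction (MOVI/MVNI) into the split `abc`/`defgh` fields of the instruction word. A minimal standalone sketch of that packing, checked against the two test vectors added in emit_tests.rs further down (`movi v31.16b, #255` is the little-endian bytes `FFE7074F`, i.e. the word 0x4F07E7FF):

```rust
// Standalone sketch of the field packing in `enc_asimd_mod_imm` below.
// `q_op` carries Q (bit 30) and op (bit 29); `rd` is the vector register
// number rather than the backend's Writable<Reg> type.
fn asimd_mod_imm_word(rd: u32, q_op: u32, cmode: u32, imm: u8) -> u32 {
    let abc = (imm >> 5) as u32; // top 3 bits of imm8 -> bits 18:16
    let defgh = (imm & 0b11111) as u32; // low 5 bits of imm8 -> bits 9:5
    0b0_0_0_0111100000_000_0000_01_00000_00000
        | (q_op << 29)
        | (abc << 16)
        | (cmode << 12)
        | (defgh << 5)
        | rd
}

fn main() {
    // movi v31.16b, #255: Q=1, op=0, cmode=0b1110, imm8=255.
    assert_eq!(asimd_mod_imm_word(31, 0b10, 0b1110, 255), 0x4F07E7FF);
    // mvni v0.4h, #0: Q=0, op=1, cmode=0b1000, imm8=0.
    assert_eq!(asimd_mod_imm_word(0, 0b01, 0b1000, 0), 0x2F008400);
}
```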
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index e76fb61c6a..9b8da0879a 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -437,6 +437,21 @@ fn enc_stxr(ty: Type, rs: Writable<Reg>, rt: Reg, rn: Reg) -> u32 {
         | machreg_to_gpr(rt)
 }
 
+fn enc_asimd_mod_imm(rd: Writable<Reg>, q_op: u32, cmode: u32, imm: u8) -> u32 {
+    let abc = (imm >> 5) as u32;
+    let defgh = (imm & 0b11111) as u32;
+
+    debug_assert_eq!(cmode & 0b1111, cmode);
+    debug_assert_eq!(q_op & 0b11, q_op);
+
+    0b0_0_0_0111100000_000_0000_01_00000_00000
+        | (q_op << 29)
+        | (abc << 16)
+        | (cmode << 12)
+        | (defgh << 5)
+        | machreg_to_vec(rd.to_reg())
+}
+
 /// State carried between emissions of a sequence of instructions.
 #[derive(Default, Clone, Debug)]
 pub struct EmitState {
@@ -1588,19 +1603,6 @@ impl MachInstEmit for Inst {
                 };
                 sink.put4(enc_inttofpu(top16, rd, rn));
            }
-            &Inst::LoadFpuConst32 { rd, const_data } => {
-                let inst = Inst::FpuLoad32 {
-                    rd,
-                    mem: AMode::Label(MemLabel::PCRel(8)),
-                    srcloc: None,
-                };
-                inst.emit(sink, emit_info, state);
-                let inst = Inst::Jump {
-                    dest: BranchTarget::ResolvedOffset(8),
-                };
-                inst.emit(sink, emit_info, state);
-                sink.put4(const_data.to_bits());
-            }
            &Inst::LoadFpuConst64 { rd, const_data } => {
                let inst = Inst::FpuLoad64 {
                    rd,
@@ -1612,7 +1614,7 @@ impl MachInstEmit for Inst {
                    dest: BranchTarget::ResolvedOffset(12),
                };
                inst.emit(sink, emit_info, state);
-                sink.put8(const_data.to_bits());
+                sink.put8(const_data);
            }
            &Inst::LoadFpuConst128 { rd, const_data } => {
                let inst = Inst::FpuLoad128 {
@@ -1751,6 +1753,53 @@ impl MachInstEmit for Inst {
                        | machreg_to_vec(rd.to_reg()),
                );
            }
+            &Inst::VecDupImm {
+                rd,
+                imm,
+                invert,
+                size,
+            } => {
+                let (imm, shift, shift_ones) = imm.value();
+                let (op, cmode) = match size.lane_size() {
+                    ScalarSize::Size8 => {
+                        assert!(!invert);
+                        assert_eq!(shift, 0);
+
+                        (0, 0b1110)
+                    }
+                    ScalarSize::Size16 => {
+                        let s = shift & 8;
+
+                        assert!(!shift_ones);
+                        assert_eq!(s, shift);
+
+                        (invert as u32, 0b1000 | (s >> 2))
+                    }
+                    ScalarSize::Size32 => {
+                        if shift_ones {
+                            assert!(shift == 8 || shift == 16);
+
+                            (invert as u32, 0b1100 | (shift >> 4))
+                        } else {
+                            let s = shift & 24;
+
+                            assert_eq!(s, shift);
+
+                            (invert as u32, 0b0000 | (s >> 2))
+                        }
+                    }
+                    ScalarSize::Size64 => {
+                        assert!(!invert);
+                        assert_eq!(shift, 0);
+
+                        (1, 0b1110)
+                    }
+                    _ => unreachable!(),
+                };
+                let q_op = op | ((size.is_128bits() as u32) << 1);
+
+                sink.put4(enc_asimd_mod_imm(rd, q_op, cmode, imm));
+            }
            &Inst::VecExtend {
                t,
                rd,
@@ -1803,8 +1852,8 @@ impl MachInstEmit for Inst {
            &Inst::VecMovElement {
                rd,
                rn,
-                idx1,
-                idx2,
+                dest_idx,
+                src_idx,
                size,
            } => {
                let (imm5, shift) = match size.lane_size() {
@@ -1815,10 +1864,10 @@ impl MachInstEmit for Inst {
                    _ => unreachable!(),
                };
                let mask = 0b11111 >> shift;
-                debug_assert_eq!(idx1 & mask, idx1);
-                debug_assert_eq!(idx2 & mask, idx2);
-                let imm4 = (idx2 as u32) << (shift - 1);
-                let imm5 = imm5 | ((idx1 as u32) << shift);
+                debug_assert_eq!(dest_idx & mask, dest_idx);
+                debug_assert_eq!(src_idx & mask, src_idx);
+                let imm4 = (src_idx as u32) << (shift - 1);
+                let imm5 = imm5 | ((dest_idx as u32) << shift);
                sink.put4(
                    0b011_01110000_00000_0_0000_1_00000_00000
                        | (imm5 << 16)
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index b9cf76e71c..c25ee43316 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2034,6 +2034,26 @@ fn test_aarch64_binemit() {
         "5205084E",
         "dup v18.2d, v10.d[0]",
     ));
+    insns.push((
+        Inst::VecDupImm {
+            rd: writable_vreg(31),
+            imm: ASIMDMovModImm::maybe_from_u64(255, ScalarSize::Size8).unwrap(),
+            invert: false,
+            size: VectorSize::Size8x16,
+        },
+        "FFE7074F",
+        "movi v31.16b, #255",
+    ));
+    insns.push((
+        Inst::VecDupImm {
+            rd: writable_vreg(0),
+            imm: ASIMDMovModImm::zero(),
+            invert: true,
+            size: VectorSize::Size16x4,
+        },
+        "0084002F",
+        "mvni v0.4h, #0",
+    ));
     insns.push((
         Inst::VecExtend {
             t: VecExtendOp::Sxtl8,
@@ -2099,8 +2119,8 @@ fn test_aarch64_binemit() {
         Inst::VecMovElement {
             rd: writable_vreg(0),
             rn: vreg(31),
-            idx1: 7,
-            idx2: 7,
+            dest_idx: 7,
+            src_idx: 7,
             size: VectorSize::Size16x8,
         },
         "E0771E6E",
@@ -2111,8 +2131,8 @@ fn test_aarch64_binemit() {
         Inst::VecMovElement {
             rd: writable_vreg(31),
             rn: vreg(16),
-            idx1: 1,
-            idx2: 0,
+            dest_idx: 1,
+            src_idx: 0,
             size: VectorSize::Size32x2,
         },
         "1F060C6E",
@@ -4781,19 +4801,10 @@ fn test_aarch64_binemit() {
         "str q16, [x8, x9, LSL #4]",
     ));
 
-    insns.push((
-        Inst::LoadFpuConst32 {
-            rd: writable_vreg(16),
-            const_data: 1.0,
-        },
-        "5000001C020000140000803F",
-        "ldr s16, pc+8 ; b 8 ; data.f32 1",
-    ));
-
     insns.push((
         Inst::LoadFpuConst64 {
             rd: writable_vreg(16),
-            const_data: 1.0,
+            const_data: 1.0_f64.to_bits(),
         },
         "5000005C03000014000000000000F03F",
         "ldr d16, pc+8 ; b 12 ; data.f64 1",
diff --git a/cranelift/codegen/src/isa/aarch64/inst/imms.rs b/cranelift/codegen/src/isa/aarch64/inst/imms.rs
index d8dd45afca..b6da0402bc 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/imms.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/imms.rs
@@ -4,7 +4,7 @@
 #[allow(dead_code)]
 
 use crate::ir::types::*;
 use crate::ir::Type;
-use crate::isa::aarch64::inst::OperandSize;
+use crate::isa::aarch64::inst::{OperandSize, ScalarSize};
 
 use regalloc::{PrettyPrint, RealRegUniverse};
@@ -667,6 +667,42 @@ impl MoveWideConst {
     }
 }
 
+/// Advanced SIMD modified immediate as used by MOVI/MVNI.
+#[derive(Clone, Copy, Debug)]
+pub struct ASIMDMovModImm {
+    imm: u8,
+    shift: u8,
+    shift_ones: bool,
+}
+
+impl ASIMDMovModImm {
+    /// Construct an `ASIMDMovModImm` from an arbitrary 64-bit constant, if possible.
+    pub fn maybe_from_u64(value: u64, size: ScalarSize) -> Option<ASIMDMovModImm> {
+        match size {
+            ScalarSize::Size8 => Some(ASIMDMovModImm {
+                imm: value as u8,
+                shift: 0,
+                shift_ones: false,
+            }),
+            _ => None,
+        }
+    }
+
+    /// Create a zero immediate of this format.
+    pub fn zero() -> Self {
+        ASIMDMovModImm {
+            imm: 0,
+            shift: 0,
+            shift_ones: false,
+        }
+    }
+
+    /// Returns the encoding parameters as a `(imm, shift, shift_ones)` tuple.
+    pub fn value(&self) -> (u8, u32, bool) {
+        (self.imm, self.shift as u32, self.shift_ones)
+    }
+}
+
 impl PrettyPrint for NZCV {
     fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
         let fmt = |c: char, v| if v { c.to_ascii_uppercase() } else { c };
@@ -746,6 +782,17 @@ impl PrettyPrint for MoveWideConst {
     }
 }
 
+impl PrettyPrint for ASIMDMovModImm {
+    fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+        if self.shift == 0 {
+            format!("#{}", self.imm)
+        } else {
+            let shift_type = if self.shift_ones { "MSL" } else { "LSL" };
+            format!("#{}, {} #{}", self.imm, shift_type, self.shift)
+        }
+    }
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
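Editor's note: a quick sketch of how the new `ASIMDMovModImm` API is expected to behave, based purely on the definitions above (assumes crate-internal access; only 8-bit lanes are representable by `maybe_from_u64` at this point):

```rust
// Sketch; ASIMDMovModImm, ScalarSize and PrettyPrint come from the crate.
let imm = ASIMDMovModImm::maybe_from_u64(255, ScalarSize::Size8).unwrap();
assert_eq!(imm.value(), (255u8, 0u32, false));
assert_eq!(imm.show_rru(None), "#255"); // shift == 0 => plain "#imm"

// Wider lanes are not yet handled and fall back to None.
assert!(ASIMDMovModImm::maybe_from_u64(0xFF00, ScalarSize::Size16).is_none());

// zero() prints as "#0"; combined with `invert` it yields MVNI, which is how
// the "mvni v0.4h, #0" emit test above materializes an all-ones pattern.
assert_eq!(ASIMDMovModImm::zero().value(), (0u8, 0u32, false));
```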
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 1670bbad36..d18b07c9fd 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -18,6 +18,7 @@ use regalloc::{RegUsageCollector, RegUsageMapper};
 
 use alloc::boxed::Box;
 use alloc::vec::Vec;
+use core::convert::TryFrom;
 use smallvec::{smallvec, SmallVec};
 use std::string::{String, ToString};
 
@@ -826,14 +827,9 @@ pub enum Inst {
         srcloc: Option<SourceLoc>,
     },
 
-    LoadFpuConst32 {
-        rd: Writable<Reg>,
-        const_data: f32,
-    },
-
     LoadFpuConst64 {
         rd: Writable<Reg>,
-        const_data: f64,
+        const_data: u64,
     },
 
     LoadFpuConst128 {
@@ -922,6 +918,14 @@ pub enum Inst {
         size: VectorSize,
     },
 
+    /// Duplicate immediate to vector.
+    VecDupImm {
+        rd: Writable<Reg>,
+        imm: ASIMDMovModImm,
+        invert: bool,
+        size: VectorSize,
+    },
+
     /// Vector extend.
     VecExtend {
         t: VecExtendOp,
@@ -934,8 +938,8 @@ pub enum Inst {
     VecMovElement {
         rd: Writable<Reg>,
         rn: Reg,
-        idx1: u8,
-        idx2: u8,
+        dest_idx: u8,
+        src_idx: u8,
         size: VectorSize,
     },
 
@@ -1297,29 +1301,146 @@ impl Inst {
         }
     }
 
-    /// Create an instruction that loads a 32-bit floating-point constant.
-    pub fn load_fp_constant32(rd: Writable<Reg>, value: f32) -> Inst {
-        // TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent bits.
-        Inst::LoadFpuConst32 {
-            rd,
-            const_data: value,
+    /// Create instructions that load a 32-bit floating-point constant.
+    pub fn load_fp_constant32<F: FnMut(RegClass, Type) -> Writable<Reg>>(
+        rd: Writable<Reg>,
+        value: u32,
+        mut alloc_tmp: F,
+    ) -> SmallVec<[Inst; 4]> {
+        if value == 0 {
+            smallvec![Inst::VecDupImm {
+                rd,
+                imm: ASIMDMovModImm::zero(),
+                invert: false,
+                size: VectorSize::Size8x8
+            }]
+        } else {
+            // TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent bits.
+            let tmp = alloc_tmp(RegClass::I64, I32);
+            let mut insts = Inst::load_constant(tmp, value as u64);
+
+            insts.push(Inst::MovToFpu {
+                rd,
+                rn: tmp.to_reg(),
+            });
+
+            insts
         }
     }
 
-    /// Create an instruction that loads a 64-bit floating-point constant.
-    pub fn load_fp_constant64(rd: Writable<Reg>, value: f64) -> Inst {
-        // TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent bits.
-        Inst::LoadFpuConst64 {
-            rd,
-            const_data: value,
+    /// Create instructions that load a 64-bit floating-point constant.
+    pub fn load_fp_constant64<F: FnMut(RegClass, Type) -> Writable<Reg>>(
+        rd: Writable<Reg>,
+        const_data: u64,
+        mut alloc_tmp: F,
+    ) -> SmallVec<[Inst; 4]> {
+        if let Ok(const_data) = u32::try_from(const_data) {
+            Inst::load_fp_constant32(rd, const_data, alloc_tmp)
+        // TODO: use FMOV immediate form when `const_data` has sufficiently few mantissa/exponent bits.
+        // Also, treat it as half of a 128-bit vector and consider replicated patterns. Scalar MOVI
+        // might also be an option.
+        } else if const_data & (u32::MAX as u64) == 0 {
+            let tmp = alloc_tmp(RegClass::I64, I64);
+            let mut insts = Inst::load_constant(tmp, const_data);
+
+            insts.push(Inst::MovToFpu {
+                rd,
+                rn: tmp.to_reg(),
+            });
+
+            insts
+        } else {
+            smallvec![Inst::LoadFpuConst64 { rd, const_data }]
         }
     }
 
-    /// Create an instruction that loads a 128-bit vector constant.
-    pub fn load_fp_constant128(rd: Writable<Reg>, value: u128) -> Inst {
-        Inst::LoadFpuConst128 {
-            rd,
-            const_data: value,
+    /// Create instructions that load a 128-bit vector constant.
+    pub fn load_fp_constant128<F: FnMut(RegClass, Type) -> Writable<Reg>>(
+        rd: Writable<Reg>,
+        const_data: u128,
+        alloc_tmp: F,
+    ) -> SmallVec<[Inst; 5]> {
+        if let Ok(const_data) = u64::try_from(const_data) {
+            SmallVec::from(&Inst::load_fp_constant64(rd, const_data, alloc_tmp)[..])
+        } else if let Some((pattern, size)) =
+            Inst::get_replicated_vector_pattern(const_data, ScalarSize::Size64)
+        {
+            Inst::load_replicated_vector_pattern(
+                rd,
+                pattern,
+                VectorSize::from_lane_size(size, true),
+                alloc_tmp,
+            )
+        } else {
+            smallvec![Inst::LoadFpuConst128 { rd, const_data }]
+        }
+    }
+
+    /// Determine whether a 128-bit constant represents a vector consisting of elements with
+    /// the same value.
+    pub fn get_replicated_vector_pattern(
+        value: u128,
+        size: ScalarSize,
+    ) -> Option<(u64, ScalarSize)> {
+        let (mask, shift, next_size) = match size {
+            ScalarSize::Size8 => (u8::MAX as u128, 8, ScalarSize::Size128),
+            ScalarSize::Size16 => (u16::MAX as u128, 16, ScalarSize::Size8),
+            ScalarSize::Size32 => (u32::MAX as u128, 32, ScalarSize::Size16),
+            ScalarSize::Size64 => (u64::MAX as u128, 64, ScalarSize::Size32),
+            _ => return None,
+        };
+        let mut r = None;
+        let v = value & mask;
+
+        if (value >> shift) & mask == v {
+            r = Inst::get_replicated_vector_pattern(v, next_size);
+
+            if r.is_none() {
+                r = Some((v as u64, size));
+            }
+        }
+
+        r
+    }
+
+    /// Create instructions that load a 128-bit vector constant consisting of elements with
+    /// the same value.
+    pub fn load_replicated_vector_pattern<F: FnMut(RegClass, Type) -> Writable<Reg>>(
+        rd: Writable<Reg>,
+        pattern: u64,
+        size: VectorSize,
+        mut alloc_tmp: F,
+    ) -> SmallVec<[Inst; 5]> {
+        let lane_size = size.lane_size();
+
+        if let Some(imm) = ASIMDMovModImm::maybe_from_u64(pattern, lane_size) {
+            smallvec![Inst::VecDupImm {
+                rd,
+                imm,
+                invert: false,
+                size
+            }]
+        } else if let Some(imm) = ASIMDMovModImm::maybe_from_u64(!pattern, lane_size) {
+            debug_assert_ne!(lane_size, ScalarSize::Size8);
+            debug_assert_ne!(lane_size, ScalarSize::Size64);
+
+            smallvec![Inst::VecDupImm {
+                rd,
+                imm,
+                invert: true,
+                size
+            }]
+        } else {
+            let tmp = alloc_tmp(RegClass::I64, I64);
+            let mut insts = SmallVec::from(&Inst::load_constant(tmp, pattern)[..]);
+
+            insts.push(Inst::VecDup {
+                rd,
+                rn: tmp.to_reg(),
+                size,
+            });
+
+            insts
         }
     }
 
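Editor's note: the constant loaders above pick between a single MOVI, a GPR materialization plus FMOV, and a PC-relative literal load. A rough decision table for `load_fp_constant64`, as a sketch (the helper name `classify_f64_constant` is ours, not the patch's):

```rust
use core::convert::TryFrom;

// Sketch only: mirrors the branch structure of load_fp_constant64 above.
fn classify_f64_constant(bits: u64) -> &'static str {
    if u32::try_from(bits).is_ok() {
        "delegates to load_fp_constant32 (MOVI for zero, else MOVZ/MOVK + FMOV)"
    } else if bits & (u32::MAX as u64) == 0 {
        "loads the pattern into a GPR (e.g. a single MOVZ ..., LSL #48) and FMOVs it"
    } else {
        "falls back to a PC-relative literal load (LoadFpuConst64)"
    }
}

fn main() {
    // 1.0f64 = 0x3FF0_0000_0000_0000: the low 32 bits are zero.
    assert!(classify_f64_constant(1.0f64.to_bits()).starts_with("loads the pattern"));
    // 0.0 fits in u32 and bottoms out in the MOVI-zero path.
    assert!(classify_f64_constant(0.0f64.to_bits()).starts_with("delegates"));
}
```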
@@ -1704,9 +1825,7 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
            collector.add_use(rd);
            memarg_regs(mem, collector);
        }
-        &Inst::LoadFpuConst32 { rd, .. }
-        | &Inst::LoadFpuConst64 { rd, .. }
-        | &Inst::LoadFpuConst128 { rd, .. } => {
+        &Inst::LoadFpuConst64 { rd, .. } | &Inst::LoadFpuConst128 { rd, .. } => {
            collector.add_def(rd);
        }
        &Inst::FpuToInt { rd, rn, .. } => {
@@ -1746,6 +1865,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
            collector.add_def(rd);
            collector.add_use(rn);
        }
+        &Inst::VecDupImm { rd, .. } => {
+            collector.add_def(rd);
+        }
        &Inst::VecExtend { rd, rn, .. } => {
            collector.add_def(rd);
            collector.add_use(rn);
@@ -2344,9 +2466,6 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
            map_use(mapper, rd);
            map_mem(mapper, mem);
        }
-        &mut Inst::LoadFpuConst32 { ref mut rd, .. } => {
-            map_def(mapper, rd);
-        }
        &mut Inst::LoadFpuConst64 { ref mut rd, .. } => {
            map_def(mapper, rd);
        }
@@ -2441,6 +2560,9 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
            map_def(mapper, rd);
            map_use(mapper, rn);
        }
+        &mut Inst::VecDupImm { ref mut rd, .. } => {
+            map_def(mapper, rd);
+        }
        &mut Inst::VecExtend {
            ref mut rd,
            ref mut rn,
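Editor's note: `get_replicated_vector_pattern` above recurses toward the narrowest repeating element, so a byte pattern replicated through all 128 bits is reported at 8-bit width. A sketch of the expected results (assumes crate-internal access to `Inst` and `ScalarSize`):

```rust
// Sketch; not part of the patch.
assert_eq!(
    Inst::get_replicated_vector_pattern(
        0x4242_4242_4242_4242_4242_4242_4242_4242,
        ScalarSize::Size64,
    ),
    Some((0x42, ScalarSize::Size8)) // narrowest repeating element wins
);
// Two distinct 64-bit halves have no replicated pattern at all:
assert_eq!(
    Inst::get_replicated_vector_pattern(1u128, ScalarSize::Size64),
    None
);
```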
@@ -2631,19 +2753,12 @@ impl MachInst for Inst {
        to_reg: Writable<Reg>,
        value: u64,
        ty: Type,
-        _alloc_tmp: F,
+        alloc_tmp: F,
    ) -> SmallVec<[Inst; 4]> {
        if ty == F64 {
-            let mut ret = SmallVec::new();
-            ret.push(Inst::load_fp_constant64(to_reg, f64::from_bits(value)));
-            ret
+            Inst::load_fp_constant64(to_reg, value, alloc_tmp)
        } else if ty == F32 {
-            let mut ret = SmallVec::new();
-            ret.push(Inst::load_fp_constant32(
-                to_reg,
-                f32::from_bits(value as u32),
-            ));
-            ret
+            Inst::load_fp_constant32(to_reg, value as u32, alloc_tmp)
        } else {
            // Must be an integer type.
            debug_assert!(
@@ -3216,13 +3331,9 @@ impl Inst {
                let mem = mem.show_rru(mb_rru);
                format!("{}str {}, {}", mem_str, rd, mem)
            }
-            &Inst::LoadFpuConst32 { rd, const_data } => {
-                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size32);
-                format!("ldr {}, pc+8 ; b 8 ; data.f32 {}", rd, const_data)
-            }
            &Inst::LoadFpuConst64 { rd, const_data } => {
                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64);
-                format!("ldr {}, pc+8 ; b 12 ; data.f64 {}", rd, const_data)
+                format!("ldr {}, pc+8 ; b 12 ; data.f64 {}", rd, f64::from_bits(const_data))
            }
            &Inst::LoadFpuConst128 { rd, const_data } => {
                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size128);
@@ -3330,6 +3441,17 @@ impl Inst {
                let rn = show_vreg_element(rn, mb_rru, 0, size);
                format!("dup {}, {}", rd, rn)
            }
+            &Inst::VecDupImm { rd, imm, invert, size } => {
+                let imm = imm.show_rru(mb_rru);
+                let op = if invert {
+                    "mvni"
+                } else {
+                    "movi"
+                };
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+
+                format!("{} {}, {}", op, rd, imm)
+            }
            &Inst::VecExtend { t, rd, rn, high_half } => {
                let (op, dest, src) = match (t, high_half) {
                    (VecExtendOp::Sxtl8, false) => ("sxtl", VectorSize::Size16x8, VectorSize::Size8x8),
@@ -3352,12 +3474,12 @@ impl Inst {
            &Inst::VecMovElement {
                rd,
                rn,
-                idx1,
-                idx2,
+                dest_idx,
+                src_idx,
                size,
            } => {
-                let rd = show_vreg_element(rd.to_reg(), mb_rru, idx1, size);
-                let rn = show_vreg_element(rn, mb_rru, idx2, size);
+                let rd = show_vreg_element(rd.to_reg(), mb_rru, dest_idx, size);
+                let rn = show_vreg_element(rn, mb_rru, src_idx, size);
                format!("mov {}, {}", rd, rn)
            }
            &Inst::VecMiscNarrow { op, rd, rn, size, high_half } => {
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index 111c221a25..9549ef700a 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -813,7 +813,11 @@ pub(crate) fn lower_constant_f32<C: LowerCtx<I = Inst>>(
    rd: Writable<Reg>,
    value: f32,
 ) {
-    ctx.emit(Inst::load_fp_constant32(rd, value));
+    let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
+
+    for inst in Inst::load_fp_constant32(rd, value.to_bits(), alloc_tmp) {
+        ctx.emit(inst);
+    }
 }
 
 pub(crate) fn lower_constant_f64<C: LowerCtx<I = Inst>>(
@@ -821,7 +825,11 @@ pub(crate) fn lower_constant_f64<C: LowerCtx<I = Inst>>(
    rd: Writable<Reg>,
    value: f64,
 ) {
-    ctx.emit(Inst::load_fp_constant64(rd, value));
+    let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
+
+    for inst in Inst::load_fp_constant64(rd, value.to_bits(), alloc_tmp) {
+        ctx.emit(inst);
+    }
 }
 
 pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
@@ -829,7 +837,38 @@ pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
    rd: Writable<Reg>,
    value: u128,
 ) {
-    ctx.emit(Inst::load_fp_constant128(rd, value));
+    let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
+
+    for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
+        ctx.emit(inst);
+    }
+}
+
+pub(crate) fn lower_splat_const<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    rd: Writable<Reg>,
+    value: u64,
+    size: VectorSize,
+) {
+    let (value, narrow_size) = match size.lane_size() {
+        ScalarSize::Size8 => (value as u8 as u64, ScalarSize::Size128),
+        ScalarSize::Size16 => (value as u16 as u64, ScalarSize::Size8),
+        ScalarSize::Size32 => (value as u32 as u64, ScalarSize::Size16),
+        ScalarSize::Size64 => (value, ScalarSize::Size32),
+        _ => unreachable!(),
+    };
+    let (value, size) = match Inst::get_replicated_vector_pattern(value as u128, narrow_size) {
+        Some((value, lane_size)) => (
+            value,
+            VectorSize::from_lane_size(lane_size, size.is_128bits()),
+        ),
+        None => (value, size),
+    };
+    let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
+
+    for inst in Inst::load_replicated_vector_pattern(rd, value, size, alloc_tmp) {
+        ctx.emit(inst);
+    }
 }
 
 pub(crate) fn lower_condcode(cc: IntCC) -> Cond {
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index aa6b6d4aab..b6617835d5 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2013,24 +2013,47 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                ctx.emit(Inst::VecMovElement {
                    rd,
                    rn,
-                    idx1: idx,
-                    idx2: 0,
+                    dest_idx: idx,
+                    src_idx: 0,
                    size,
                });
            }
        }
 
        Opcode::Splat => {
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]);
-            let input_ty = ctx.input_ty(insn, 0);
            let size = VectorSize::from_ty(ty.unwrap());
-            let inst = if ty_has_int_representation(input_ty) {
-                Inst::VecDup { rd, rn, size }
+
+            if let Some((_, insn)) = maybe_input_insn_multi(
+                ctx,
+                inputs[0],
+                &[
+                    Opcode::Bconst,
+                    Opcode::F32const,
+                    Opcode::F64const,
+                    Opcode::Iconst,
+                ],
+            ) {
+                lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
+            } else if let Some(insn) =
+                maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Iconst, Opcode::Ireduce)
+            {
+                lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
+            } else if let Some(insn) =
+                maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce)
+            {
+                lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
            } else {
-                Inst::VecDupFromFpu { rd, rn, size }
-            };
-            ctx.emit(inst);
+                let input_ty = ctx.input_ty(insn, 0);
+                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                let inst = if ty_has_int_representation(input_ty) {
+                    Inst::VecDup { rd, rn, size }
+                } else {
+                    Inst::VecDupFromFpu { rd, rn, size }
+                };
+
+                ctx.emit(inst);
+            }
        }
 
        Opcode::VanyTrue | Opcode::VallTrue => {
@@ -2820,15 +2843,9 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            let rtmp2 = ctx.alloc_tmp(RegClass::V128, in_ty);
 
            if in_bits == 32 {
-                ctx.emit(Inst::LoadFpuConst32 {
-                    rd: rtmp1,
-                    const_data: max as f32,
-                });
+                lower_constant_f32(ctx, rtmp1, max as f32);
            } else {
-                ctx.emit(Inst::LoadFpuConst64 {
-                    rd: rtmp1,
-                    const_data: max,
-                });
+                lower_constant_f64(ctx, rtmp1, max);
            }
            ctx.emit(Inst::FpuRRR {
                fpu_op: choose_32_64(in_ty, FPUOp2::Min32, FPUOp2::Min64),
@@ -2837,15 +2854,9 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                rm: rtmp1.to_reg(),
            });
            if in_bits == 32 {
-                ctx.emit(Inst::LoadFpuConst32 {
-                    rd: rtmp1,
-                    const_data: min as f32,
-                });
+                lower_constant_f32(ctx, rtmp1, min as f32);
            } else {
-                ctx.emit(Inst::LoadFpuConst64 {
-                    rd: rtmp1,
-                    const_data: min,
-                });
+                lower_constant_f64(ctx, rtmp1, min);
            }
            ctx.emit(Inst::FpuRRR {
                fpu_op: choose_32_64(in_ty, FPUOp2::Max32, FPUOp2::Max64),
@@ -2855,15 +2866,9 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            });
            if out_signed {
                if in_bits == 32 {
-                    ctx.emit(Inst::LoadFpuConst32 {
-                        rd: rtmp1,
-                        const_data: 0.0,
-                    });
+                    lower_constant_f32(ctx, rtmp1, 0.0);
                } else {
-                    ctx.emit(Inst::LoadFpuConst64 {
-                        rd: rtmp1,
-                        const_data: 0.0,
-                    });
+                    lower_constant_f64(ctx, rtmp1, 0.0);
                }
            }
            if in_bits == 32 {
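Editor's note: with the Splat changes above, a constant input is routed through `lower_splat_const`, which first masks the value to the lane width and then probes for a narrower repeating pattern. A sketch of the narrowing for the `splat.i16x8` case exercised by the new simd.clif test (%f2) below:

```rust
// Sketch of lower_splat_const's lane-width masking for splat.i16x8 of 42679.
let v = 42679u64 as u16 as u64;
assert_eq!(v, 0xA6B7);
// The two bytes differ, so no narrower (8-bit) repeating pattern exists:
assert_ne!(v >> 8, v & 0xFF);
// maybe_from_u64(0xA6B7, Size16) is also None in this patch, so the fallback
// loads the constant into a GPR and duplicates it:
//     movz x0, #42679 ; dup v0.8h, w0
// whereas an all-ones byte pattern (e.g. splat.b8x16 of `true`) hits MOVI:
//     movi v0.16b, #255
```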
diff --git a/cranelift/filetests/filetests/isa/aarch64/constants.clif b/cranelift/filetests/filetests/isa/aarch64/constants.clif
index 48fa386891..80dce9e349 100644
--- a/cranelift/filetests/filetests/isa/aarch64/constants.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/constants.clif
@@ -9,7 +9,7 @@ block0:
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: movz x0, #1
+; nextln: movz x0, #255
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
diff --git a/cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif b/cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif
index 518c80e17a..4a88430b53 100644
--- a/cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif
@@ -60,10 +60,12 @@ block0(v0: f32):
    v1 = fcvt_to_uint.i8 v0
    ; check: fcmp s0, s0
    ; check: b.vc 8 ; udf
-    ; check: ldr s1, pc+8 ; b 8 ; data.f32 -1
+    ; check: movz x0, #49024, LSL #16
+    ; check: fmov d1, x0
    ; check: fcmp s0, s1
    ; check: b.gt 8 ; udf
-    ; check: ldr s1, pc+8 ; b 8 ; data.f32 256
+    ; check: movz x0, #17280, LSL #16
+    ; check: fmov d1, x0
    ; check: fcmp s0, s1
    ; check: b.mi 8 ; udf
    ; check: fcvtzu w0, s0
@@ -80,10 +82,12 @@ block0(v0: f64):
    v1 = fcvt_to_uint.i8 v0
    ; check: fcmp d0, d0
    ; check: b.vc 8 ; udf
-    ; check: ldr d1, pc+8 ; b 12 ; data.f64 -1
+    ; check: movz x0, #49136, LSL #48
+    ; check: fmov d1, x0
    ; check: fcmp d0, d1
    ; check: b.gt 8 ; udf
-    ; check: ldr d1, pc+8 ; b 12 ; data.f64 256
+    ; check: movz x0, #16496, LSL #48
+    ; check: fmov d1, x0
    ; check: fcmp d0, d1
    ; check: b.mi 8 ; udf
    ; check: fcvtzu w0, d0
@@ -100,10 +104,12 @@ block0(v0: f32):
    v1 = fcvt_to_uint.i16 v0
    ; check: fcmp s0, s0
    ; check: b.vc 8 ; udf
-    ; check: ldr s1, pc+8 ; b 8 ; data.f32 -1
+    ; check: movz x0, #49024, LSL #16
+    ; check: fmov d1, x0
    ; check: fcmp s0, s1
    ; check: b.gt 8 ; udf
-    ; check: ldr s1, pc+8 ; b 8 ; data.f32 65536
+    ; check: movz x0, #18304, LSL #16
+    ; check: fmov d1, x0
    ; check: fcmp s0, s1
    ; check: b.mi 8 ; udf
    ; check: fcvtzu w0, s0
@@ -120,10 +126,12 @@ block0(v0: f64):
    v1 = fcvt_to_uint.i16 v0
    ; check: fcmp d0, d0
    ; check: b.vc 8 ; udf
-    ; check: ldr d1, pc+8 ; b 12 ; data.f64 -1
+    ; check: movz x0, #49136, LSL #48
+    ; check: fmov d1, x0
    ; check: fcmp d0, d1
    ; check: b.gt 8 ; udf
-    ; check: ldr d1, pc+8 ; b 12 ; data.f64 65536
+    ; check: movz x0, #16624, LSL #48
+    ; check: fmov d1, x0
    ; check: fcmp d0, d1
    ; check: b.mi 8 ; udf
    ; check: fcvtzu w0, d0
diff --git a/cranelift/filetests/filetests/isa/aarch64/floating-point.clif b/cranelift/filetests/filetests/isa/aarch64/floating-point.clif
index 8303450b23..25f53ff4b1 100644
--- a/cranelift/filetests/filetests/isa/aarch64/floating-point.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/floating-point.clif
@@ -427,10 +427,12 @@ block0(v0: f32):
 ; nextln: mov fp, sp
 ; nextln: fcmp s0, s0
 ; nextln: b.vc 8 ; udf
-; nextln: ldr s1, pc+8 ; b 8 ; data.f32 -1
+; nextln: movz x0, #49024, LSL #16
+; nextln: fmov d1, x0
 ; nextln: fcmp s0, s1
 ; nextln: b.gt 8 ; udf
-; nextln: ldr s1, pc+8 ; b 8 ; data.f32 4294967300
+; nextln: movz x0, #20352, LSL #16
+; nextln: fmov d1, x0
 ; nextln: fcmp s0, s1
 ; nextln: b.mi 8 ; udf
 ; nextln: fcvtzu w0, s0
@@ -448,10 +450,12 @@ block0(v0: f32):
 ; nextln: mov fp, sp
 ; nextln: fcmp s0, s0
 ; nextln: b.vc 8 ; udf
-; nextln: ldr s1, pc+8 ; b 8 ; data.f32 -2147483600
+; nextln: movz x0, #52992, LSL #16
+; nextln: fmov d1, x0
 ; nextln: fcmp s0, s1
 ; nextln: b.ge 8 ; udf
-; nextln: ldr s1, pc+8 ; b 8 ; data.f32 2147483600
+; nextln: movz x0, #20224, LSL #16
+; nextln: fmov d1, x0
 ; nextln: fcmp s0, s1
 ; nextln: b.mi 8 ; udf
 ; nextln: fcvtzs w0, s0
@@ -469,10 +473,12 @@ block0(v0: f32):
 ; nextln: mov fp, sp
 ; nextln: fcmp s0, s0
 ; nextln: b.vc 8 ; udf
-; nextln: ldr s1, pc+8 ; b 8 ; data.f32 -1
+; nextln: movz x0, #49024, LSL #16
+; nextln: fmov d1, x0
 ; nextln: fcmp s0, s1
 ; nextln: b.gt 8 ; udf
-; nextln: ldr s1, pc+8 ; b 8 ; data.f32 18446744000000000000
+; nextln: movz x0, #24448, LSL #16
+; nextln: fmov d1, x0
 ; nextln: fcmp s0, s1
 ; nextln: b.mi 8 ; udf
 ; nextln: fcvtzu x0, s0
@@ -490,10 +496,12 @@ block0(v0: f32):
 ; nextln: mov fp, sp
 ; nextln: fcmp s0, s0
 ; nextln: b.vc 8 ; udf
-; nextln: ldr s1, pc+8 ; b 8 ; data.f32 -9223372000000000000
+; nextln: movz x0, #57088, LSL #16
+; nextln: fmov d1, x0
 ; nextln: fcmp s0, s1
 ; nextln: b.ge 8 ; udf
-; nextln: ldr s1, pc+8 ; b 8 ; data.f32 9223372000000000000
+; nextln: movz x0, #24320, LSL #16
+; nextln: fmov d1, x0
 ; nextln: fcmp s0, s1
 ; nextln: b.mi 8 ; udf
 ; nextln: fcvtzs x0, s0
@@ -511,10 +519,12 @@ block0(v0: f64):
 ; nextln: mov fp, sp
 ; nextln: fcmp d0, d0
 ; nextln: b.vc 8 ; udf
-; nextln: ldr d1, pc+8 ; b 12 ; data.f64 -1
+; nextln: movz x0, #49136, LSL #48
+; nextln: fmov d1, x0
 ; nextln: fcmp d0, d1
 ; nextln: b.gt 8 ; udf
-; nextln: ldr d1, pc+8 ; b 12 ; data.f64 4294967296
+; nextln: movz x0, #16880, LSL #48
+; nextln: fmov d1, x0
 ; nextln: fcmp d0, d1
 ; nextln: b.mi 8 ; udf
 ; nextln: fcvtzu w0, d0
@@ -535,7 +545,8 @@ block0(v0: f64):
 ; nextln: ldr d1, pc+8 ; b 12 ; data.f64 -2147483649
 ; nextln: fcmp d0, d1
 ; nextln: b.gt 8 ; udf
-; nextln: ldr d1, pc+8 ; b 12 ; data.f64 2147483648
+; nextln: movz x0, #16864, LSL #48
+; nextln: fmov d1, x0
 ; nextln: fcmp d0, d1
 ; nextln: b.mi 8 ; udf
 ; nextln: fcvtzs w0, d0
@@ -553,10 +564,12 @@ block0(v0: f64):
 ; nextln: mov fp, sp
 ; nextln: fcmp d0, d0
 ; nextln: b.vc 8 ; udf
-; nextln: ldr d1, pc+8 ; b 12 ; data.f64 -1
+; nextln: movz x0, #49136, LSL #48
+; nextln: fmov d1, x0
 ; nextln: fcmp d0, d1
 ; nextln: b.gt 8 ; udf
-; nextln: ldr d1, pc+8 ; b 12 ; data.f64 18446744073709552000
+; nextln: movz x0, #17392, LSL #48
+; nextln: fmov d1, x0
 ; nextln: fcmp d0, d1
 ; nextln: b.mi 8 ; udf
 ; nextln: fcvtzu x0, d0
@@ -574,10 +587,12 @@ block0(v0: f64):
 ; nextln: mov fp, sp
 ; nextln: fcmp d0, d0
 ; nextln: b.vc 8 ; udf
-; nextln: ldr d1, pc+8 ; b 12 ; data.f64 -9223372036854776000
+; nextln: movz x0, #50144, LSL #48
+; nextln: fmov d1, x0
 ; nextln: fcmp d0, d1
 ; nextln: b.ge 8 ; udf
-; nextln: ldr d1, pc+8 ; b 12 ; data.f64 9223372036854776000
+; nextln: movz x0, #17376, LSL #48
+; nextln: fmov d1, x0
 ; nextln: fcmp d0, d1
 ; nextln: b.mi 8 ; udf
 ; nextln: fcvtzs x0, d0
@@ -697,9 +712,10 @@ block0(v0: f32):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: ldr s1, pc+8 ; b 8 ; data.f32 4294967300
+; nextln: movz x0, #20352, LSL #16
+; nextln: fmov d1, x0
 ; nextln: fmin s2, s0, s1
-; nextln: ldr s1, pc+8 ; b 8 ; data.f32 0
+; nextln: movi v1.8b, #0
 ; nextln: fmax s2, s2, s1
 ; nextln: fcmp s0, s0
 ; nextln: fcsel s0, s1, s2, ne
@@ -716,11 +732,13 @@ block0(v0: f32):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: ldr s1, pc+8 ; b 8 ; data.f32 2147483600
+; nextln: movz x0, #20224, LSL #16
+; nextln: fmov d1, x0
 ; nextln: fmin s1, s0, s1
-; nextln: ldr s2, pc+8 ; b 8 ; data.f32 -2147483600
+; nextln: movz x0, #52992, LSL #16
+; nextln: fmov d2, x0
 ; nextln: fmax s1, s1, s2
-; nextln: ldr s2, pc+8 ; b 8 ; data.f32 0
+; nextln: movi v2.8b, #0
 ; nextln: fcmp s0, s0
 ; nextln: fcsel s0, s2, s1, ne
 ; nextln: fcvtzs w0, s0
@@ -736,9 +754,10 @@ block0(v0: f32):
 
 ; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp -; nextln: ldr s1, pc+8 ; b 8 ; data.f32 18446744000000000000 +; nextln: movz x0, #24448, LSL #16 +; nextln: fmov d1, x0 ; nextln: fmin s2, s0, s1 -; nextln: ldr s1, pc+8 ; b 8 ; data.f32 0 +; nextln: movi v1.8b, #0 ; nextln: fmax s2, s2, s1 ; nextln: fcmp s0, s0 ; nextln: fcsel s0, s1, s2, ne @@ -755,11 +774,13 @@ block0(v0: f32): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: ldr s1, pc+8 ; b 8 ; data.f32 9223372000000000000 +; nextln: movz x0, #24320, LSL #16 +; nextln: fmov d1, x0 ; nextln: fmin s1, s0, s1 -; nextln: ldr s2, pc+8 ; b 8 ; data.f32 -9223372000000000000 +; nextln: movz x0, #57088, LSL #16 +; nextln: fmov d2, x0 ; nextln: fmax s1, s1, s2 -; nextln: ldr s2, pc+8 ; b 8 ; data.f32 0 +; nextln: movi v2.8b, #0 ; nextln: fcmp s0, s0 ; nextln: fcsel s0, s2, s1, ne ; nextln: fcvtzs x0, s0 @@ -777,7 +798,7 @@ block0(v0: f64): ; nextln: mov fp, sp ; nextln: ldr d1, pc+8 ; b 12 ; data.f64 4294967295 ; nextln: fmin d2, d0, d1 -; nextln: ldr d1, pc+8 ; b 12 ; data.f64 0 +; nextln: movi v1.8b, #0 ; nextln: fmax d2, d2, d1 ; nextln: fcmp d0, d0 ; nextln: fcsel d0, d1, d2, ne @@ -796,9 +817,10 @@ block0(v0: f64): ; nextln: mov fp, sp ; nextln: ldr d1, pc+8 ; b 12 ; data.f64 2147483647 ; nextln: fmin d1, d0, d1 -; nextln: ldr d2, pc+8 ; b 12 ; data.f64 -2147483648 +; nextln: movz x0, #49632, LSL #48 +; nextln: fmov d2, x0 ; nextln: fmax d1, d1, d2 -; nextln: ldr d2, pc+8 ; b 12 ; data.f64 0 +; nextln: movi v2.8b, #0 ; nextln: fcmp d0, d0 ; nextln: fcsel d0, d2, d1, ne ; nextln: fcvtzs w0, d0 @@ -814,9 +836,10 @@ block0(v0: f64): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: ldr d1, pc+8 ; b 12 ; data.f64 18446744073709552000 +; nextln: movz x0, #17392, LSL #48 +; nextln: fmov d1, x0 ; nextln: fmin d2, d0, d1 -; nextln: ldr d1, pc+8 ; b 12 ; data.f64 0 +; nextln: movi v1.8b, #0 ; nextln: fmax d2, d2, d1 ; nextln: fcmp d0, d0 ; nextln: fcsel d0, d1, d2, ne @@ -833,11 +856,13 @@ block0(v0: f64): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: ldr d1, pc+8 ; b 12 ; data.f64 9223372036854776000 +; nextln: movz x0, #17376, LSL #48 +; nextln: fmov d1, x0 ; nextln: fmin d1, d0, d1 -; nextln: ldr d2, pc+8 ; b 12 ; data.f64 -9223372036854776000 +; nextln: movz x0, #50144, LSL #48 +; nextln: fmov d2, x0 ; nextln: fmax d1, d1, d2 -; nextln: ldr d2, pc+8 ; b 12 ; data.f64 0 +; nextln: movi v2.8b, #0 ; nextln: fcmp d0, d0 ; nextln: fcsel d0, d2, d1, ne ; nextln: fcvtzs x0, d0 diff --git a/cranelift/filetests/filetests/isa/aarch64/simd.clif b/cranelift/filetests/filetests/isa/aarch64/simd.clif new file mode 100644 index 0000000000..3e47bbbda2 --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/simd.clif @@ -0,0 +1,49 @@ +test compile +target aarch64 + +function %f1() -> i64x2 { +block0: + v0 = iconst.i64 281474976710657 + v1 = splat.i64x2 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #1 +; nextln: movk x0, #1, LSL #48 +; nextln: dup v0.2d, x0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f2() -> i16x8 { +block0: + v0 = iconst.i32 42679 + v1 = ireduce.i16 v0 + v2 = splat.i16x8 v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #42679 +; nextln: dup v0.8h, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f3() -> b8x16 { +block0: + v0 = bconst.b32 true + v1 = breduce.b8 v0 + v2 = splat.b8x16 v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: movi v0.16b, #255 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret
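Editor's note: the %f3 expectation above ties back to the `is_constant_64bit` change at the top of this diff. A `true` boolean constant is now widened to an all-ones value for its type, which is what makes the single-instruction MOVI form reachable for boolean splats. A sketch of the widening arithmetic:

```rust
// Widening of `bconst.b8 true` per the new UnaryBool handling (sketch).
let bits = 8u32; // ty_bits of the b8 result
let imm = if bits < 64 { (1u64 << bits) - 1 } else { u64::MAX };
assert_eq!(imm, 255); // splat across 16 byte lanes => movi v0.16b, #255
```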