diff --git a/ci/build-test-matrix.js b/ci/build-test-matrix.js index 2b46123368..ad09ea9c19 100644 --- a/ci/build-test-matrix.js +++ b/ci/build-test-matrix.js @@ -82,7 +82,7 @@ const array = [ "target": "riscv64gc-unknown-linux-gnu", "gcc_package": "gcc-riscv64-linux-gnu", "gcc": "riscv64-linux-gnu-gcc", - "qemu": "qemu-riscv64 -cpu rv64,zba=true,zbb=true,zbc=true,zbs=true,zbkb=true -L /usr/riscv64-linux-gnu", + "qemu": "qemu-riscv64 -cpu rv64,v=true,vlen=256,vext_spec=v1.0,zba=true,zbb=true,zbc=true,zbs=true,zbkb=true -L /usr/riscv64-linux-gnu", "qemu_target": "riscv64-linux-user", "name": "Test Linux riscv64", "filter": "linux-riscv64", diff --git a/cranelift/codegen/build.rs b/cranelift/codegen/build.rs index 8ee8b42676..c557e4baf3 100644 --- a/cranelift/codegen/build.rs +++ b/cranelift/codegen/build.rs @@ -274,6 +274,7 @@ fn get_isle_compilations( prelude_isle.clone(), prelude_lower_isle.clone(), src_isa_risc_v.join("inst.isle"), + src_isa_risc_v.join("inst_vector.isle"), src_isa_risc_v.join("lower.isle"), ], untracked_inputs: vec![clif_lower_isle.clone()], diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle index 3bb02c4ff8..6e88c6e777 100644 --- a/cranelift/codegen/src/isa/riscv64/inst.isle +++ b/cranelift/codegen/src/isa/riscv64/inst.isle @@ -329,6 +329,31 @@ (guard_size u32) (probe_count u32) (tmp WritableReg)) + + (VecAluRRR + (op VecAluOpRRR) + (vd WritableReg) + (vs1 Reg) + (vs2 Reg) + (vstate VState)) + + (VecSetState + (rd WritableReg) + (vstate VState)) + + (VecLoad + (eew VecElementWidth) + (to WritableReg) + (from VecAMode) + (flags MemFlags) + (vstate VState)) + + (VecStore + (eew VecElementWidth) + (to VecAMode) + (from Reg) + (flags MemFlags) + (vstate VState)) )) @@ -711,6 +736,9 @@ ;; ISA Extension helpers +(decl pure has_v () bool) +(extern constructor has_v has_v) + (decl pure has_zbkb () bool) (extern constructor has_zbkb has_zbkb) diff --git a/cranelift/codegen/src/isa/riscv64/inst/args.rs b/cranelift/codegen/src/isa/riscv64/inst/args.rs index 011bd14ad0..660c845bd1 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/args.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/args.rs @@ -1668,55 +1668,6 @@ impl CsrAddress { } } -pub(crate) struct VType { - vma: bool, - vta: bool, - vsew: Vsew, - valmul: Vlmul, -} - -impl VType { - fn as_u32(self) -> u32 { - self.valmul.as_u32() - | self.vsew.as_u32() << 3 - | if self.vta { 1 << 7 } else { 0 } - | if self.vma { 1 << 8 } else { 0 } - } - - const fn vill_bit() -> u64 { - 1 << 63 - } -} - -enum Vlmul { - vlmul_1_div_8 = 0b101, - vlmul_1_div_4 = 0b110, - vlmul_1_div_2 = 0b111, - vlmul_1 = 0b000, - vlmul_2 = 0b001, - vlmul_4 = 0b010, - vlmul_8 = 0b011, -} - -impl Vlmul { - fn as_u32(self) -> u32 { - self as u32 - } -} - -enum Vsew { - sew_8 = 0b000, - sew_16 = 0b001, - sew_32 = 0b010, - sew_64 = 0b011, -} - -impl Vsew { - fn as_u32(self) -> u32 { - self as u32 - } -} - impl CsrOP { pub(crate) fn op_name(self) -> &'static str { match self { @@ -1754,40 +1705,11 @@ impl CsrOP { if self.need_rs() { reg_to_gpr_num(rs.unwrap()) } else { - zimm.unwrap().as_u32() + zimm.unwrap().bits() } } } -enum Vxrm { - // round-to-nearest-up (add +0.5 LSB) - rnu = 0b00, - // round-to-nearest-even - rne = 0b01, - //round-down (truncate) - rdn = 0b10, - // round-to-odd (OR bits into LSB, aka "jam") - rod = 0b11, -} - -impl Vxrm { - pub(crate) fn as_u32(self) -> u32 { - self as u32 - } -} - -pub(crate) struct Vcsr { - xvrm: Vxrm, - // Fixed-point accrued saturation flag - vxsat: bool, -} - -impl Vcsr 
{ - pub(crate) fn as_u32(self) -> u32 { - return if self.vxsat { 1 } else { 0 } | self.xvrm.as_u32(); - } -} - ///Atomic Memory ordering. #[derive(Copy, Clone, Debug)] pub enum AMO { diff --git a/cranelift/codegen/src/isa/riscv64/inst/emit.rs b/cranelift/codegen/src/isa/riscv64/inst/emit.rs index eb90867ae2..2928c2668c 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/emit.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/emit.rs @@ -102,6 +102,13 @@ pub(crate) fn reg_to_gpr_num(m: Reg) -> u32 { u32::try_from(m.to_real_reg().unwrap().hw_enc() & 31).unwrap() } +#[derive(Clone, Debug, PartialEq, Default)] +pub enum EmitVState { + #[default] + Unknown, + Known(VState), +} + /// State carried between emissions of a sequence of instructions. #[derive(Default, Clone, Debug)] pub struct EmitState { @@ -114,6 +121,9 @@ pub struct EmitState { /// Only used during fuzz-testing. Otherwise, it is a zero-sized struct and /// optimized away at compiletime. See [cranelift_control]. ctrl_plane: ControlPlane, + /// Vector State + /// Controls the current state of the vector unit at the emission point. + vstate: EmitVState, } impl EmitState { @@ -141,6 +151,7 @@ impl MachInstEmitState<Inst> for EmitState { stack_map: None, cur_srcloc: RelSourceLoc::default(), ctrl_plane, + vstate: EmitVState::Unknown, } } @@ -159,6 +170,11 @@ impl MachInstEmitState<Inst> for EmitState { fn take_ctrl_plane(self) -> ControlPlane { self.ctrl_plane } + + fn on_new_block(&mut self) { + // Reset the vector state. + self.vstate = EmitVState::Unknown; + } } impl Inst { @@ -386,6 +402,80 @@ impl Inst { } insts } + + /// Returns Some(VState) if this instruction is expecting a specific vector state + /// before emission. + fn expected_vstate(&self) -> Option<&VState> { + match self { + Inst::Nop0 + | Inst::Nop4 + | Inst::BrTable { .. } + | Inst::Auipc { .. } + | Inst::Lui { .. } + | Inst::LoadConst32 { .. } + | Inst::LoadConst64 { .. } + | Inst::AluRRR { .. } + | Inst::FpuRRR { .. } + | Inst::AluRRImm12 { .. } + | Inst::Load { .. } + | Inst::Store { .. } + | Inst::Args { .. } + | Inst::Ret { .. } + | Inst::Extend { .. } + | Inst::AjustSp { .. } + | Inst::Call { .. } + | Inst::CallInd { .. } + | Inst::TrapIf { .. } + | Inst::Jal { .. } + | Inst::CondBr { .. } + | Inst::LoadExtName { .. } + | Inst::LoadAddr { .. } + | Inst::VirtualSPOffsetAdj { .. } + | Inst::Mov { .. } + | Inst::MovFromPReg { .. } + | Inst::Fence { .. } + | Inst::FenceI + | Inst::ECall + | Inst::EBreak + | Inst::Udf { .. } + | Inst::FpuRR { .. } + | Inst::FpuRRRR { .. } + | Inst::Jalr { .. } + | Inst::Atomic { .. } + | Inst::Select { .. } + | Inst::AtomicCas { .. } + | Inst::IntSelect { .. } + | Inst::Csr { .. } + | Inst::Icmp { .. } + | Inst::SelectReg { .. } + | Inst::FcvtToInt { .. } + | Inst::RawData { .. } + | Inst::AtomicStore { .. } + | Inst::AtomicLoad { .. } + | Inst::AtomicRmwLoop { .. } + | Inst::TrapIfC { .. } + | Inst::Unwind { .. } + | Inst::DummyUse { .. } + | Inst::FloatRound { .. } + | Inst::FloatSelect { .. } + | Inst::FloatSelectPseudo { .. } + | Inst::Popcnt { .. } + | Inst::Rev8 { .. } + | Inst::Cltz { .. } + | Inst::Brev8 { .. } + | Inst::StackProbeLoop { .. } => None, + // VecSetState does not expect any vstate; rather, it updates it. + Inst::VecSetState { .. } => None, + Inst::VecAluRRR { vstate, .. } | + // TODO: Unit-stride loads and stores only need the AVL to be correct, not + // the full vtype. A future optimization could be to decouple these two when + // updating vstate. This would allow us to avoid emitting a VecSetState in + // some cases.
+ Inst::VecLoad { vstate, .. } + | Inst::VecStore { vstate, .. } => Some(vstate), + } + } } impl MachInstEmit for Inst { @@ -400,6 +490,19 @@ impl MachInstEmit for Inst { state: &mut EmitState, ) { let mut allocs = AllocationConsumer::new(allocs); + + // Check if we need to update the vector state before emitting this instruction. + if let Some(expected) = self.expected_vstate() { + if state.vstate != EmitVState::Known(expected.clone()) { + // Update the vector state. + Inst::VecSetState { + rd: writable_zero_reg(), + vstate: expected.clone(), + } + .emit(&[], sink, emit_info, state); + } + } + // N.B.: we *must* not exceed the "worst-case size" used to compute // where to insert islands, except when islands are explicitly triggered // (with an `EmitIsland`). We check this in debug builds. This is `mut` @@ -530,13 +633,14 @@ impl MachInstEmit for Inst { (rs1, rs2) }; - let x: u32 = alu_op.op_code() - | reg_to_gpr_num(rd.to_reg()) << 7 - | (alu_op.funct3()) << 12 - | reg_to_gpr_num(rs1) << 15 - | reg_to_gpr_num(rs2) << 20 - | alu_op.funct7() << 25; - sink.put4(x); + sink.put4(encode_r_type( + alu_op.op_code(), + rd.to_reg(), + alu_op.funct3(), + rs1, + rs2, + alu_op.funct7(), + )); } &Inst::AluRRImm12 { alu_op, @@ -2695,6 +2799,120 @@ impl MachInstEmit for Inst { .emit(&[], sink, emit_info, state); sink.bind_label(label_done, &mut state.ctrl_plane); } + &Inst::VecAluRRR { + op, vd, vs1, vs2, .. + } => { + let vs1 = allocs.next(vs1); + let vs2 = allocs.next(vs2); + let vd = allocs.next_writable(vd); + + // This is the mask bit; we don't yet implement masking, so set it to 1, which means + // masking is disabled. + let vm = 1; + + sink.put4(encode_valu( + op.opcode(), + vd.to_reg(), + op.funct3(), + vs1, + vs2, + vm, + op.funct6(), + )); + } + &Inst::VecSetState { rd, ref vstate } => { + let rd = allocs.next_writable(rd); + + sink.put4(encode_vcfg_imm( + 0x57, + rd.to_reg(), + vstate.avl.unwrap_static(), + &vstate.vtype, + )); + + // Update the current vector emit state. + state.vstate = EmitVState::Known(vstate.clone()); + } + + &Inst::VecLoad { + eew, + to, + ref from, + flags, + .. + } => { + let offset = from.get_offset_with_state(state); + let from_reg = allocs.next(from.get_base_register()); + let to = allocs.next_writable(to); + + // Vector loads don't support immediate offsets, so we need to load the offset into a register. + let addr = writable_spilltmp_reg(); + LoadConstant::U64(offset as u64) + .load_constant_and_add(addr, from_reg) + .into_iter() + .for_each(|inst| inst.emit(&[], sink, emit_info, state)); + + let srcloc = state.cur_srcloc(); + if !srcloc.is_default() && !flags.notrap() { + // Register the offset at which the actual load instruction starts. + sink.add_trap(TrapCode::HeapOutOfBounds); + } + + // This is the mask bit; we don't yet implement masking, so set it to 1, which means + // masking is disabled. + let vm = 1; + + sink.put4(encode_vmem_load( + 0x07, + to.to_reg(), + eew, + addr.to_reg(), + from.lumop(), + vm, + from.mop(), + from.nf(), + )); + } + + &Inst::VecStore { + eew, + ref to, + from, + flags, + .. + } => { + let offset = to.get_offset_with_state(state); + let to_reg = allocs.next(to.get_base_register()); + let from = allocs.next(from); + + // Vector stores don't support immediate offsets, so we need to load the offset into a register.
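+ // Note: the spill temporary (t6/x31 in this backend) is reserved and never handed out by the register allocator, so it is safe to use as a scratch address register here.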
+ let addr = writable_spilltmp_reg(); + LoadConstant::U64(offset as u64) + .load_constant_and_add(addr, to_reg) + .into_iter() + .for_each(|inst| inst.emit(&[], sink, emit_info, state)); + + let srcloc = state.cur_srcloc(); + if !srcloc.is_default() && !flags.notrap() { + // Register the offset at which the actual store instruction starts. + sink.add_trap(TrapCode::HeapOutOfBounds); + } + + // This is the mask bit; we don't yet implement masking, so set it to 1, which means + // masking is disabled. + let vm = 1; + + sink.put4(encode_vmem_store( + 0x27, + from, + eew, + addr.to_reg(), + to.sumop(), + vm, + to.mop(), + to.nf(), + )); + } }; let end_off = sink.cur_offset(); assert!( diff --git a/cranelift/codegen/src/isa/riscv64/inst/encode.rs b/cranelift/codegen/src/isa/riscv64/inst/encode.rs new file mode 100644 index 0000000000..38d73f2dbf --- /dev/null +++ b/cranelift/codegen/src/isa/riscv64/inst/encode.rs @@ -0,0 +1,128 @@ +//! Contains the RISC-V instruction encoding logic. +//! +//! These formats are specified in the RISC-V specification in section 2.2. +//! See: https://riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf +//! +//! Some instructions, especially in extensions, have slight variations from +//! the base RISC-V specification. + +use super::{UImm5, VType}; +use crate::isa::riscv64::inst::reg_to_gpr_num; +use crate::isa::riscv64::lower::isle::generated_code::VecElementWidth; +use crate::Reg; + +/// Encode an R-type instruction. +/// +/// Layout: +/// 0-------6-7-------11-12------14-15------19-20------24-25-------31 +/// | Opcode | rd | funct3 | rs1 | rs2 | funct7 | +pub fn encode_r_type(opcode: u32, rd: Reg, funct3: u32, rs1: Reg, rs2: Reg, funct7: u32) -> u32 { + let mut bits = 0; + bits |= opcode & 0b1111111; + bits |= reg_to_gpr_num(rd) << 7; + bits |= (funct3 & 0b111) << 12; + bits |= reg_to_gpr_num(rs1) << 15; + bits |= reg_to_gpr_num(rs2) << 20; + bits |= (funct7 & 0b1111111) << 25; + bits +} + +/// Encodes a Vector ALU instruction. +/// +/// Fields: +/// - opcode (7 bits) +/// - vd (5 bits) +/// - funct3 (3 bits) +/// - vs1 (5 bits) +/// - vs2 (5 bits) +/// - vm (1 bit) +/// - funct6 (6 bits) +/// +/// See: https://github.com/riscv/riscv-v-spec/blob/master/valu-format.adoc +pub fn encode_valu( + opcode: u32, + vd: Reg, + funct3: u32, + vs1: Reg, + vs2: Reg, + vm: u32, + funct6: u32, +) -> u32 { + let funct6 = funct6 & 0b111111; + let vm = vm & 0b1; + let funct7 = (funct6 << 1) | vm; + encode_r_type(opcode, vd, funct3, vs1, vs2, funct7) +} + +/// Encodes a Vector CFG Imm instruction. +/// +/// See: https://github.com/riscv/riscv-v-spec/blob/master/vcfg-format.adoc +// TODO: Check if this is any of the known instruction types in the spec. +pub fn encode_vcfg_imm(opcode: u32, rd: Reg, imm: UImm5, vtype: &VType) -> u32 { + let mut bits = 0; + bits |= opcode & 0b1111111; + bits |= reg_to_gpr_num(rd) << 7; + bits |= 0b111 << 12; + bits |= (imm.bits() & 0b11111) << 15; + bits |= (vtype.encode() & 0b1111111111) << 20; + bits |= 0b11 << 30; + bits +} + +/// Encodes a Vector Mem Unit Stride Load instruction. +/// +/// See: https://github.com/riscv/riscv-v-spec/blob/master/vmem-format.adoc +/// TODO: These instructions share opcode space with LOAD-FP and STORE-FP +pub fn encode_vmem_load( + opcode: u32, + vd: Reg, + width: VecElementWidth, + rs1: Reg, + lumop: u32, + vm: u32, + mop: u32, + nf: u32, +) -> u32 { + // Width is encoded differently to avoid a clash with the FP load/store sizes.
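+ // The scalar LOAD-FP/STORE-FP instructions use the width values 0b001-0b100, so the vector element widths are mapped to 0b000 (e8) and 0b101-0b111 (e16/e32/e64) rather than the vtype SEW encoding.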
+ let width = match width { + VecElementWidth::E8 => 0b000, + VecElementWidth::E16 => 0b101, + VecElementWidth::E32 => 0b110, + VecElementWidth::E64 => 0b111, + }; + + let mut bits = 0; + bits |= opcode & 0b1111111; + bits |= reg_to_gpr_num(vd) << 7; + bits |= width << 12; + bits |= reg_to_gpr_num(rs1) << 15; + bits |= (lumop & 0b11111) << 20; + bits |= (vm & 0b1) << 25; + bits |= (mop & 0b11) << 26; + + // The mew bit (inst[28]) when set is expected to be used to encode expanded + // memory sizes of 128 bits and above, but these encodings are currently reserved. + bits |= 0b0 << 28; + + bits |= (nf & 0b111) << 29; + bits +} + +/// Encodes a Vector Mem Unit Stride Store instruction. +/// +/// See: https://github.com/riscv/riscv-v-spec/blob/master/vmem-format.adoc +/// TODO: These instructions share opcode space with LOAD-FP and STORE-FP +pub fn encode_vmem_store( + opcode: u32, + vs3: Reg, + width: VecElementWidth, + rs1: Reg, + sumop: u32, + vm: u32, + mop: u32, + nf: u32, +) -> u32 { + // This is pretty much the same as the load instruction, just + // with different names on the fields. + encode_vmem_load(opcode, vs3, width, rs1, sumop, vm, mop, nf) +} diff --git a/cranelift/codegen/src/isa/riscv64/inst/imms.rs b/cranelift/codegen/src/isa/riscv64/inst/imms.rs index bee1971636..2d7bc8b630 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/imms.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/imms.rs @@ -99,33 +99,31 @@ impl Display for Imm20 { } } -#[derive(Clone, Copy)] -pub struct Uimm5 { - bits: u8, +/// An unsigned 5-bit immediate. +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct UImm5 { + value: u8, } -impl Uimm5 { - pub fn from_bits(bits: u8) -> Self { - Self { bits } +impl UImm5 { + /// Create an unsigned 5-bit immediate from u8. + pub fn maybe_from_u8(value: u8) -> Option<UImm5> { + if value < 32 { + Some(UImm5 { value }) + } else { + None + } } - /// Create a zero immediate of this format. - pub fn zero() -> Self { - Self { bits: 0 } - } - pub fn as_u32(&self) -> u32 { - (self.bits as u32) & 0b1_1111 + + /// Bits for encoding.
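+ /// The value is checked to be below 32 at construction time, so it always fits in 5 bits and needs no masking here.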
+ pub fn bits(&self) -> u32 { + u32::from(self.value) + } } -impl Debug for Uimm5 { +impl Display for UImm5 { fn fmt(&self, f: &mut Formatter<'_>) -> Result { - write!(f, "{}", self.bits) - } -} - -impl Display for Uimm5 { - fn fmt(&self, f: &mut Formatter<'_>) -> Result { - write!(f, "{}", self.bits) + write!(f, "{}", self.value) } } diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs index ea61641da3..beb40815d8 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs @@ -4,9 +4,10 @@ #![allow(dead_code)] #![allow(non_camel_case_types)] +use super::lower::isle::generated_code::{VecAMode, VecElementWidth}; use crate::binemit::{Addend, CodeOffset, Reloc}; pub use crate::ir::condcodes::IntCC; -use crate::ir::types::{F32, F64, I128, I16, I32, I64, I8, R32, R64}; +use crate::ir::types::{self, F32, F64, I128, I16, I32, I64, I8, R32, R64}; pub use crate::ir::{ExternalName, MemFlags, Opcode, SourceLoc, Type, ValueLabel}; use crate::isa::CallConv; @@ -29,6 +30,10 @@ pub mod args; pub use self::args::*; pub mod emit; pub use self::emit::*; +pub mod vector; +pub use self::vector::*; +pub mod encode; +pub use self::encode::*; pub mod unwind; use crate::isa::riscv64::abi::Riscv64MachineDeps; @@ -41,7 +46,7 @@ use std::fmt::{Display, Formatter}; pub(crate) type OptionReg = Option<Reg>; pub(crate) type OptionImm12 = Option<Imm12>; pub(crate) type VecBranchTarget = Vec<BranchTarget>; -pub(crate) type OptionUimm5 = Option<Uimm5>; +pub(crate) type OptionUimm5 = Option<UImm5>; pub(crate) type OptionFloatRoundingMode = Option<FRM>; pub(crate) type VecU8 = Vec<u8>; pub(crate) type VecWritableReg = Vec<Writable<Reg>>; @@ -313,21 +318,41 @@ impl Inst { /// Generic constructor for a load (zero-extending where appropriate). pub fn gen_load(into_reg: Writable<Reg>, mem: AMode, ty: Type, flags: MemFlags) -> Inst { - Inst::Load { - rd: into_reg, - op: LoadOP::from_type(ty), - from: mem, - flags, + if ty.is_vector() { + Inst::VecLoad { + eew: VecElementWidth::from_type(ty), + to: into_reg, + from: VecAMode::UnitStride { base: mem }, + flags, + vstate: VState::from_type(ty), + } + } else { + Inst::Load { + rd: into_reg, + op: LoadOP::from_type(ty), + from: mem, + flags, + } } } /// Generic constructor for a store. pub fn gen_store(mem: AMode, from_reg: Reg, ty: Type, flags: MemFlags) -> Inst { - Inst::Store { - src: from_reg, - op: StoreOP::from_type(ty), - to: mem, - flags, + if ty.is_vector() { + Inst::VecStore { + eew: VecElementWidth::from_type(ty), + to: VecAMode::UnitStride { base: mem }, + from: from_reg, + flags, + vstate: VState::from_type(ty), + } + } else { + Inst::Store { + src: from_reg, + op: StoreOP::from_type(ty), + to: mem, + flags, + } } } } @@ -623,6 +648,22 @@ fn riscv64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan // gen_prologue is called at emit stage. // no need let reg alloc know. } + &Inst::VecAluRRR { vd, vs1, vs2, .. } => { + collector.reg_use(vs1); + collector.reg_use(vs2); + collector.reg_def(vd); + } + &Inst::VecSetState { rd, .. } => { + collector.reg_def(rd); + } + &Inst::VecLoad { to, ref from, .. } => { + collector.reg_use(from.get_base_register()); + collector.reg_def(to); + } + &Inst::VecStore { ref to, from, ..
} => { + collector.reg_use(to.get_base_register()); + collector.reg_use(from); + } } } @@ -727,6 +768,7 @@ impl MachInst for Inst { F32 => Ok((&[RegClass::Float], &[F32])), F64 => Ok((&[RegClass::Float], &[F64])), I128 => Ok((&[RegClass::Int, RegClass::Int], &[I64, I64])), + _ if ty.is_vector() && ty.bits() == 128 => Ok((&[RegClass::Float], &[types::I8X16])), _ => Err(CodegenError::Unsupported(format!( "Unexpected SSA-value type: {}", ty @@ -784,6 +826,17 @@ pub fn reg_name(reg: Reg) -> String { } } } +pub fn vec_reg_name(reg: Reg) -> String { + match reg.to_real_reg() { + Some(real) => { + assert_eq!(real.class(), RegClass::Float); + format!("v{}", real.hw_enc()) + } + None => { + format!("{:?}", reg) + } + } +} impl Inst { fn print_with_state( @@ -795,6 +848,16 @@ impl Inst { let reg = allocs.next(reg); reg_name(reg) }; + let format_vec_reg = |reg: Reg, allocs: &mut AllocationConsumer<'_>| -> String { + let reg = allocs.next(reg); + vec_reg_name(reg) + }; + + let format_vec_amode = |amode: &VecAMode, allocs: &mut AllocationConsumer<'_>| -> String { + match amode { + VecAMode::UnitStride { base } => base.to_string_with_alloc(allocs), + } + }; let format_regs = |regs: &[Reg], allocs: &mut AllocationConsumer<'_>| -> String { let mut x = if regs.len() > 1 { @@ -839,6 +902,7 @@ impl Inst { "".into() } } + match self { &Inst::Nop0 => { format!("##zero length nop") @@ -1501,6 +1565,48 @@ impl Inst { &MInst::Udf { trap_code } => format!("udf##trap_code={}", trap_code), &MInst::EBreak {} => String::from("ebreak"), &MInst::ECall {} => String::from("ecall"), + &Inst::VecAluRRR { + op, + vd, + vs1, + vs2, + ref vstate, + } => { + let vs1_s = format_vec_reg(vs1, allocs); + let vs2_s = format_vec_reg(vs2, allocs); + let vd_s = format_vec_reg(vd.to_reg(), allocs); + + // Note: vs2 and vs1 here are opposite to the standard scalar ordering. + // This is noted in Section 10.1 of the RISC-V Vector spec. + format!("{} {},{},{} {}", op, vd_s, vs2_s, vs1_s, vstate) + } + &Inst::VecSetState { rd, ref vstate } => { + let rd_s = format_reg(rd.to_reg(), allocs); + assert!(vstate.avl.is_static()); + format!("vsetivli {}, {}, {}", rd_s, vstate.avl, vstate.vtype) + } + Inst::VecLoad { + eew, + to, + from, + ref vstate, + .. + } => { + let base = format_vec_amode(from, allocs); + let vd = format_vec_reg(to.to_reg(), allocs); + format!("vl{}.v {},{} {}", eew, vd, base, vstate) + } + Inst::VecStore { + eew, + to, + from, + ref vstate, + .. 
+ } => { + let dst = format_vec_amode(to, allocs); + let vs3 = format_vec_reg(*from, allocs); + format!("vs{}.v {},{} {}", eew, vs3, dst, vstate) + } } } } diff --git a/cranelift/codegen/src/isa/riscv64/inst/regs.rs b/cranelift/codegen/src/isa/riscv64/inst/regs.rs index 35cef328c2..a0cba0a15e 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/regs.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/regs.rs @@ -218,3 +218,13 @@ pub(crate) fn x_reg_range(start: usize, end: usize) -> Vec> { } regs } + +#[inline] +pub fn v_reg(enc: usize) -> Reg { + let p_reg = PReg::new(enc, RegClass::Float); + let v_reg = VReg::new(p_reg.index(), p_reg.class()); + Reg::from(v_reg) +} +pub fn vx_reg(enc: usize) -> PReg { + PReg::new(enc, RegClass::Float) +} diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs new file mode 100644 index 0000000000..583896a02f --- /dev/null +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -0,0 +1,289 @@ +use crate::isa::riscv64::inst::EmitState; +use crate::isa::riscv64::lower::isle::generated_code::{ + VecAMode, VecAluOpRRR, VecAvl, VecElementWidth, VecLmul, VecMaskMode, VecTailMode, +}; +use crate::Reg; +use core::fmt; + +use super::{Type, UImm5}; + +impl VecAvl { + pub fn _static(size: u32) -> Self { + VecAvl::Static { + size: UImm5::maybe_from_u8(size as u8).expect("Invalid size for AVL"), + } + } + + pub fn is_static(&self) -> bool { + match self { + VecAvl::Static { .. } => true, + } + } + + pub fn unwrap_static(&self) -> UImm5 { + match self { + VecAvl::Static { size } => *size, + } + } +} + +// TODO: Can we tell ISLE to derive this? +impl PartialEq for VecAvl { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (VecAvl::Static { size: lhs }, VecAvl::Static { size: rhs }) => lhs == rhs, + } + } +} + +impl fmt::Display for VecAvl { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + VecAvl::Static { size } => write!(f, "{}", size), + } + } +} + +impl VecElementWidth { + pub fn from_type(ty: Type) -> Self { + Self::from_bits(ty.lane_bits()) + } + + pub fn from_bits(bits: u32) -> Self { + match bits { + 8 => VecElementWidth::E8, + 16 => VecElementWidth::E16, + 32 => VecElementWidth::E32, + 64 => VecElementWidth::E64, + _ => panic!("Invalid number of bits for VecElementWidth: {}", bits), + } + } + + pub fn bits(&self) -> u32 { + match self { + VecElementWidth::E8 => 8, + VecElementWidth::E16 => 16, + VecElementWidth::E32 => 32, + VecElementWidth::E64 => 64, + } + } + + pub fn encode(&self) -> u32 { + match self { + VecElementWidth::E8 => 0b000, + VecElementWidth::E16 => 0b001, + VecElementWidth::E32 => 0b010, + VecElementWidth::E64 => 0b011, + } + } +} + +impl fmt::Display for VecElementWidth { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "e{}", self.bits()) + } +} + +impl VecLmul { + pub fn encode(&self) -> u32 { + match self { + VecLmul::LmulF8 => 0b101, + VecLmul::LmulF4 => 0b110, + VecLmul::LmulF2 => 0b111, + VecLmul::Lmul1 => 0b000, + VecLmul::Lmul2 => 0b001, + VecLmul::Lmul4 => 0b010, + VecLmul::Lmul8 => 0b011, + } + } +} + +impl fmt::Display for VecLmul { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + VecLmul::LmulF8 => write!(f, "mf8"), + VecLmul::LmulF4 => write!(f, "mf4"), + VecLmul::LmulF2 => write!(f, "mf2"), + VecLmul::Lmul1 => write!(f, "m1"), + VecLmul::Lmul2 => write!(f, "m2"), + VecLmul::Lmul4 => write!(f, "m4"), + VecLmul::Lmul8 => write!(f, "m8"), + } + } +} + +impl VecTailMode { + pub fn encode(&self) -> 
u32 { + match self { + VecTailMode::Agnostic => 1, + VecTailMode::Undisturbed => 0, + } + } +} + +impl fmt::Display for VecTailMode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + VecTailMode::Agnostic => write!(f, "ta"), + VecTailMode::Undisturbed => write!(f, "tu"), + } + } +} + +impl VecMaskMode { + pub fn encode(&self) -> u32 { + match self { + VecMaskMode::Agnostic => 1, + VecMaskMode::Undisturbed => 0, + } + } +} + +impl fmt::Display for VecMaskMode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + VecMaskMode::Agnostic => write!(f, "ma"), + VecMaskMode::Undisturbed => write!(f, "mu"), + } + } +} + +/// Vector Type (VType) +/// +/// vtype provides the default type used to interpret the contents of the vector register file. +#[derive(Clone, Debug, PartialEq)] +pub struct VType { + pub sew: VecElementWidth, + pub lmul: VecLmul, + pub tail_mode: VecTailMode, + pub mask_mode: VecMaskMode, +} + +impl VType { + // https://github.com/riscv/riscv-v-spec/blob/master/vtype-format.adoc + pub fn encode(&self) -> u32 { + let mut bits = 0; + bits |= self.lmul.encode(); + bits |= self.sew.encode() << 3; + bits |= self.tail_mode.encode() << 6; + bits |= self.mask_mode.encode() << 7; + bits + } +} + +impl fmt::Display for VType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{}, {}, {}, {}", + self.sew, self.lmul, self.tail_mode, self.mask_mode + ) + } +} + +/// Vector State (VState) +/// +/// VState represents the state of the vector unit that each instruction expects before execution. +/// Unlike VType or any of the other types here, VState is not a part of the RISC-V ISA. It is +/// used by our instruction emission code to ensure that the vector unit is in the correct state. +#[derive(Clone, Debug, PartialEq)] +pub struct VState { + pub avl: VecAvl, + pub vtype: VType, +} + +impl VState { + pub fn from_type(ty: Type) -> Self { + VState { + avl: VecAvl::_static(ty.lane_count()), + vtype: VType { + sew: VecElementWidth::from_type(ty), + lmul: VecLmul::Lmul1, + tail_mode: VecTailMode::Agnostic, + mask_mode: VecMaskMode::Agnostic, + }, + } + } +} + +impl fmt::Display for VState { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "#avl={}, #vtype=({})", self.avl, self.vtype) + } +} + +impl VecAluOpRRR { + pub fn opcode(&self) -> u32 { + match self { + VecAluOpRRR::Vadd => 0x57, + } + } + pub fn funct3(&self) -> u32 { + match self { + VecAluOpRRR::Vadd => 0b000, + } + } + pub fn funct6(&self) -> u32 { + match self { + VecAluOpRRR::Vadd => 0b000000, + } + } +} + +impl fmt::Display for VecAluOpRRR { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + VecAluOpRRR::Vadd => write!(f, "vadd.vv"), + } + } +} + +impl VecAMode { + pub fn get_base_register(&self) -> Reg { + match self { + VecAMode::UnitStride { base, .. } => base.get_base_register(), + } + } + + pub(crate) fn get_offset_with_state(&self, state: &EmitState) -> i64 { + match self { + VecAMode::UnitStride { base, .. } => base.get_offset_with_state(state), + } + } + + /// `mop` field, described in Table 7 of Section 7.2. Vector Load/Store Addressing Modes + /// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#72-vector-loadstore-addressing-modes + pub fn mop(&self) -> u32 { + match self { + VecAMode::UnitStride { .. } => 0b00, + } + } + + /// `lumop` field, described in Table 9 of Section 7.2. 
Vector Load/Store Addressing Modes + /// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#72-vector-loadstore-addressing-modes + pub fn lumop(&self) -> u32 { + match self { + VecAMode::UnitStride { .. } => 0b00000, + } + } + + /// `sumop` field, described in Table 10 of Section 7.2. Vector Load/Store Addressing Modes + /// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#72-vector-loadstore-addressing-modes + pub fn sumop(&self) -> u32 { + match self { + VecAMode::UnitStride { .. } => 0b00000, + } + } + + /// The `nf[2:0]` field encodes the number of fields in each segment. For regular vector loads and + /// stores, nf=0, indicating that a single value is moved between a vector register group and memory + /// at each element position. Larger values in the nf field are used to access multiple contiguous + /// fields within a segment as described in Section 7.8 Vector Load/Store Segment Instructions. + /// + /// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#72-vector-loadstore-addressing-modes + pub fn nf(&self) -> u32 { + match self { + VecAMode::UnitStride { .. } => 0b000, + } + } +} diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle new file mode 100644 index 0000000000..7d56576e7c --- /dev/null +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -0,0 +1,132 @@ +;; Represents the possible widths of an element when used in an operation. +(type VecElementWidth (enum + (E8) + (E16) + (E32) + (E64) +)) + +;; Vector Register Group Multiplier (LMUL) +;; +;; The LMUL setting specifies how we should group registers together. LMUL can +;; also be a fractional value, reducing the number of bits used in a single +;; vector register. Fractional LMUL is used to increase the number of effective +;; usable vector register groups when operating on mixed-width values. +(type VecLmul (enum + (LmulF8) + (LmulF4) + (LmulF2) + (Lmul1) + (Lmul2) + (Lmul4) + (Lmul8) +)) + +;; Tail Mode +;; +;; The tail mode specifies how the tail elements of a vector register are handled. +(type VecTailMode (enum + ;; Tail Agnostic means that the tail elements are left in an undefined state. + (Agnostic) + ;; Tail Undisturbed means that the tail elements are left in their original values. + (Undisturbed) +)) + +;; Mask Mode +;; +;; The mask mode specifies how the masked elements of a vector register are handled. +(type VecMaskMode (enum + ;; Mask Agnostic means that the masked out elements are left in an undefined state. + (Agnostic) + ;; Mask Undisturbed means that the masked out elements are left in their original values. + (Undisturbed) +)) + +;; Application Vector Length (AVL) +;; +;; This setting specifies the number of elements that are going to be processed +;; in a single instruction. Note: We may end up processing fewer elements than +;; the AVL setting, if they don't fit in a single register. +(type VecAvl (enum + ;; Static AVL emits a `vsetivli` that uses a constant value + (Static (size UImm5)) + ;; TODO: Add a dynamic, register based AVL mode when we are able to properly test it +)) + +(type VType (primitive VType)) +(type VState (primitive VState)) + +;; Register to Register ALU Ops +(type VecAluOpRRR (enum + (Vadd) +)) + + + +;; Vector Addressing Mode +(type VecAMode (enum + ;; Vector unit-stride operations access elements stored contiguously in memory + ;; starting from the base effective address. 
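+ ;; (Unit-stride accesses are emitted as the vle{8,16,32,64}.v / vse{8,16,32,64}.v instructions; see the vec_load/vec_store helpers below.)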
+ (UnitStride + (base AMode)) + ;; TODO: Constant Stride + ;; TODO: Indexed Operations +)) + + +;; Builds a static VState matching a SIMD type. +;; The VState is guaranteed to be static with AVL set to the number of lanes. +;; Element size is set to the size of the type. +;; LMUL is set to 1. +;; Tail mode is set to agnostic. +;; Mask mode is set to agnostic. +(decl pure vstate_from_type (Type) VState) +(extern constructor vstate_from_type vstate_from_type) +(convert Type VState vstate_from_type) + +;; Extracts an element width from a SIMD type. +(decl pure element_width_from_type (Type) VecElementWidth) +(rule (element_width_from_type ty) + (if-let $I8 (lane_type ty)) + (VecElementWidth.E8)) +(rule (element_width_from_type ty) + (if-let $I16 (lane_type ty)) + (VecElementWidth.E16)) +(rule (element_width_from_type ty) + (if-let $I32 (lane_type ty)) + (VecElementWidth.E32)) +(rule (element_width_from_type ty) + (if-let $I64 (lane_type ty)) + (VecElementWidth.E64)) + +;;;; Instruction Helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; As noted in the RISC-V Vector Extension Specification, rs2 is the first +;; source register and rs1 is the second source register. This is the opposite +;; of the usual RISC-V register order. +;; See Section 10.1 of the RISC-V Vector Extension Specification. + +;; Helper for emitting `MInst.VecAluRRR` instructions. +(decl vec_alu_rrr (VecAluOpRRR Reg Reg VState) Reg) +(rule (vec_alu_rrr op vs2 vs1 vstate) + (let ((vd WritableReg (temp_writable_reg $I8X16)) + (_ Unit (emit (MInst.VecAluRRR op vd vs2 vs1 vstate)))) + vd)) + +;; Helper for emitting `MInst.VecLoad` instructions. +(decl vec_load (VecElementWidth VecAMode MemFlags VState) Reg) +(rule (vec_load eew from flags vstate) + (let ((vd WritableReg (temp_writable_reg $I8X16)) + (_ Unit (emit (MInst.VecLoad eew vd from flags vstate)))) + vd)) + +;; Helper for emitting `MInst.VecStore` instructions. +(decl vec_store (VecElementWidth VecAMode Reg MemFlags VState) InstOutput) +(rule (vec_store eew to from flags vstate) + (side_effect + (SideEffectNoResult.Inst (MInst.VecStore eew to from flags vstate)))) + +;; Helper for emitting the `vadd.vv` instruction. 
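+;; For example, (rv_vadd_vv x y ty) emits a vadd.vv whose AVL and vtype are derived from the CLIF type `ty` via the vstate_from_type converter above.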
+(decl rv_vadd_vv (Reg Reg VState) Reg) +(rule (rv_vadd_vv vs2 vs1 vstate) + (vec_alu_rrr (VecAluOpRRR.Vadd) vs2 vs1 vstate)) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index b5a9c46147..4df78191d4 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -53,7 +53,7 @@ (rule (match_shnadd (u64_from_imm64 1)) (AluOPRRR.Sh1add)) (rule (match_shnadd (u64_from_imm64 2)) (AluOPRRR.Sh2add)) (rule (match_shnadd (u64_from_imm64 3)) (AluOPRRR.Sh3add)) - + (rule 3 (lower (has_type $I64 (iadd x (ishl y (maybe_uextend (iconst n)))))) (if-let $true (has_zba)) (if-let shnadd (match_shnadd n)) @@ -75,7 +75,7 @@ (rule (match_shnadd_uw (u64_from_imm64 1)) (AluOPRRR.Sh1adduw)) (rule (match_shnadd_uw (u64_from_imm64 2)) (AluOPRRR.Sh2adduw)) (rule (match_shnadd_uw (u64_from_imm64 3)) (AluOPRRR.Sh3adduw)) - + (rule 5 (lower (has_type $I64 (iadd x (ishl (uextend y @ (value_type $I32)) (maybe_uextend (iconst n)))))) (if-let $true (has_zba)) (if-let shnadd_uw (match_shnadd_uw n)) @@ -97,6 +97,11 @@ (high Reg (rv_add high_tmp carry))) (value_regs low high))) +;; SIMD Vectors +(rule 8 (lower (has_type (ty_vec128_int ty) (iadd x y))) + (if-let $true (has_v)) + (rv_vadd_vv x y ty)) + ;;; Rules for `uadd_overflow_trap` ;;;;;;;;;;;;; (rule (lower (has_type (fits_in_64 ty) (uadd_overflow_trap x y tc))) @@ -374,7 +379,7 @@ (rule 1 (lower (has_type $I128 (clz x))) (lower_clz_i128 x)) - + ;;;; Rules for `cls` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type (fits_in_64 ty) (cls x))) (lower_cls ty x)) @@ -809,6 +814,12 @@ (lower (has_type $I128 (load flags p @ (value_type (ty_addr64 _)) offset))) (gen_load_128 p offset flags)) +(rule 2 + (lower (has_type (ty_vec128_int ty) (load flags p @ (value_type (ty_addr64 _)) offset))) + (if-let $true (has_v)) + (let ((eew VecElementWidth (element_width_from_type ty))) + (vec_load eew (VecAMode.UnitStride (gen_amode p offset $I64)) flags ty))) + ;;;;; Rules for `istore8`;;;;;;;;; (rule (lower (istore8 flags x p @ (value_type (ty_addr64 _)) offset)) @@ -833,6 +844,12 @@ (lower (store flags x @ (value_type $I128 ) p @ (value_type (ty_addr64 _)) offset)) (gen_store_128 p offset flags x)) +(rule 2 + (lower (store flags x @ (value_type (ty_vec128_int ty)) p @ (value_type (ty_addr64 _)) offset)) + (if-let $true (has_v)) + (let ((eew VecElementWidth (element_width_from_type ty))) + (vec_store eew (VecAMode.UnitStride (gen_amode p offset $I64)) x flags ty))) + (decl gen_icmp (IntCC ValueRegs ValueRegs Type) Reg) (rule (gen_icmp cc x y ty) diff --git a/cranelift/codegen/src/isa/riscv64/lower/isle.rs b/cranelift/codegen/src/isa/riscv64/lower/isle.rs index 7a0131982a..0dbcf937e2 100644 --- a/cranelift/codegen/src/isa/riscv64/lower/isle.rs +++ b/cranelift/codegen/src/isa/riscv64/lower/isle.rs @@ -283,6 +283,10 @@ impl generated_code::Context for IsleContext<'_, '_, MInst, Riscv64Backend> { ValueRegs::two(shamt, len_sub_shamt) } + fn has_v(&mut self) -> bool { + self.backend.isa_flags.has_v() + } + fn has_zbkb(&mut self) -> bool { self.backend.isa_flags.has_zbkb() } @@ -428,6 +432,11 @@ impl generated_code::Context for IsleContext<'_, '_, MInst, Riscv64Backend> { rs2, } } + + #[inline] + fn vstate_from_type(&mut self, ty: Type) -> VState { + VState::from_type(ty) + } } impl IsleContext<'_, '_, MInst, Riscv64Backend> { diff --git a/cranelift/codegen/src/machinst/mod.rs b/cranelift/codegen/src/machinst/mod.rs index 237497afac..144bb10510 100644 --- 
a/cranelift/codegen/src/machinst/mod.rs +++ b/cranelift/codegen/src/machinst/mod.rs @@ -288,6 +288,9 @@ pub trait MachInstEmitState<I: VCodeInst>: Default + Clone + Debug { /// Used to continue using a control plane after the emission state is /// not needed anymore. fn take_ctrl_plane(self) -> ControlPlane; + /// A hook that triggers when first emitting a new block. + /// It is guaranteed to be called before any instructions are emitted. + fn on_new_block(&mut self) {} } /// The result of a `MachBackend::compile_function()` call. Contains machine diff --git a/cranelift/codegen/src/machinst/vcode.rs b/cranelift/codegen/src/machinst/vcode.rs index 4cfa3b8c59..3d0e48059f 100644 --- a/cranelift/codegen/src/machinst/vcode.rs +++ b/cranelift/codegen/src/machinst/vcode.rs @@ -843,6 +843,11 @@ impl<I: VCodeInst> VCode<I> { for (block_order_idx, &block) in final_order.iter().enumerate() { trace!("emitting block {:?}", block); + + // Call the new-block hook so per-block emit state (such as the vector state) can be reset. + state.on_new_block(); + + // Emit NOPs to align the block. let new_offset = I::align_basic_block(buffer.cur_offset()); while new_offset > buffer.cur_offset() { // Pad with NOPs up to the aligned block offset. diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-abi.clif b/cranelift/filetests/filetests/isa/riscv64/simd-abi.clif new file mode 100644 index 0000000000..787d8b0b6a --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-abi.clif @@ -0,0 +1,578 @@ +test compile precise-output +target riscv64 has_v + +;; Tests both ABI and Regalloc spill/reload. +function %simd_spill( + i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, + i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, + i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, + i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, + ;; These cannot fit in registers. + i32x4, i32x4 +) -> + i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, + i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, + i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, + i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, + ;; These cannot fit in registers.
+ i32x4, i32x4 system_v +{ +block0( + v0:i32x4, v1:i32x4, v2:i32x4, v3:i32x4, v4:i32x4, v5:i32x4, v6:i32x4, v7:i32x4, + v8:i32x4, v9:i32x4, v10:i32x4, v11:i32x4, v12:i32x4, v13:i32x4, v14:i32x4, v15:i32x4, + v16:i32x4, v17:i32x4, v18:i32x4, v19:i32x4, v20:i32x4, v21:i32x4, v22:i32x4, v23:i32x4, + v24:i32x4, v25:i32x4, v26:i32x4, v27:i32x4, v28:i32x4, v29:i32x4, v30:i32x4, v31:i32x4, + v32:i32x4, v33:i32x4 +): + ;; This just reverses the args + return v33, v32, + v31, v30, v29, v28, v27, v26, v25, v24, + v23, v22, v21, v20, v19, v18, v17, v16, + v15, v14, v13, v12, v11, v10, v9, v8, + v7, v6, v5, v4, v3, v2, v1, v0 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; fsd fs0,-8(sp) +; fsd fs2,-16(sp) +; fsd fs3,-24(sp) +; fsd fs4,-32(sp) +; fsd fs5,-40(sp) +; fsd fs6,-48(sp) +; fsd fs7,-56(sp) +; fsd fs8,-64(sp) +; fsd fs9,-72(sp) +; fsd fs10,-80(sp) +; fsd fs11,-88(sp) +; add sp,-112 +; block0: +; fsd fa0,0(nominal_sp) +; fsd fa1,8(nominal_sp) +; vle8.v v28,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v29,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v30,48(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v31,64(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v0,80(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v1,96(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v2,112(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,128(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v5,144(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v7,160(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v4,176(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v6,192(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v25,208(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v27,224(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v9,240(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v19,256(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v21,272(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v23,288(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v26,304(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v8,320(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v18,336(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v20,352(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v22,368(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v24,384(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v11,400(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v10,416(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v24,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v22,16(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v20,32(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v18,48(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v8,64(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v26,80(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v23,96(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v21,112(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v19,128(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v9,144(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v27,160(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v25,176(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v6,192(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v4,208(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v7,224(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,240(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v3,256(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v2,272(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v1,288(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v0,304(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v31,320(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v30,336(a0) 
#avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v29,352(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v28,368(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v17,384(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v16,400(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v15,416(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v14,432(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v13,448(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v12,464(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; fld fa4,8(nominal_sp) +; vse8.v v14,480(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; fld fa7,0(nominal_sp) +; vse8.v v17,496(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; add sp,+112 +; fld fs0,-8(sp) +; fld fs2,-16(sp) +; fld fs3,-24(sp) +; fld fs4,-32(sp) +; fld fs5,-40(sp) +; fld fs6,-48(sp) +; fld fs7,-56(sp) +; fld fs8,-64(sp) +; fld fs9,-72(sp) +; fld fs10,-80(sp) +; fld fs11,-88(sp) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; fsd fs0, -8(sp) +; fsd fs2, -0x10(sp) +; fsd fs3, -0x18(sp) +; fsd fs4, -0x20(sp) +; fsd fs5, -0x28(sp) +; fsd fs6, -0x30(sp) +; fsd fs7, -0x38(sp) +; fsd fs8, -0x40(sp) +; fsd fs9, -0x48(sp) +; fsd fs10, -0x50(sp) +; fsd fs11, -0x58(sp) +; addi sp, sp, -0x70 +; block1: ; offset 0x40 +; fsd fa0, 0(sp) +; fsd fa1, 8(sp) +; .byte 0x57, 0x70, 0x08, 0xcc +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x10, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x07, 0x8e, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x20, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x87, 0x8e, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x30, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x07, 0x8f, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x40, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x87, 0x8f, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x50, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x07, 0x80, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x60, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x87, 0x80, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x70, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x07, 0x81, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x80, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x87, 0x81, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x90, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x87, 0x82, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xa0, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x87, 0x83, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xb0, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x07, 0x82, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xc0, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x07, 0x83, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xd0, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x87, 0x8c, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xe0, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x87, 0x8d, 
0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xf0, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x87, 0x84, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x00, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x87, 0x89, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x10, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x87, 0x8a, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x20, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x87, 0x8b, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x30, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x07, 0x8d, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x40, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x07, 0x84, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x50, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x07, 0x89, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x60, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x07, 0x8a, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x70, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x07, 0x8b, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x80, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x07, 0x8c, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x90, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x87, 0x85, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xa0, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, s0 +; .byte 0x07, 0x85, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x00, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0x27, 0x8c, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x10, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0x27, 0x8b, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x20, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0x27, 0x8a, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x30, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0x27, 0x89, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x40, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0x27, 0x84, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x50, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0x27, 0x8d, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x60, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0xa7, 0x8b, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x70, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0xa7, 0x8a, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x80, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0xa7, 0x89, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x90, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0xa7, 0x84, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xa0, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; 
add t6, t6, a0 +; .byte 0xa7, 0x8d, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xb0, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0xa7, 0x8c, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xc0, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0x27, 0x83, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xd0, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0x27, 0x82, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xe0, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0xa7, 0x83, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xf0, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0xa7, 0x82, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x00, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0xa7, 0x81, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x10, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0x27, 0x81, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x20, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0xa7, 0x80, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x30, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0x27, 0x80, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x40, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0xa7, 0x8f, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x50, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0x27, 0x8f, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x60, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0xa7, 0x8e, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x70, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0x27, 0x8e, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x80, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0xa7, 0x88, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0x90, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0x27, 0x88, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xa0, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0xa7, 0x87, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xb0, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0x27, 0x87, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xc0, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0xa7, 0x86, 0x0f, 0x02 +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xd0, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0x27, 0x86, 0x0f, 0x02 +; fld fa4, 8(sp) +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xe0, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0x27, 0x87, 0x0f, 0x02 +; fld fa7, 0(sp) +; auipc t6, 0 +; ld t6, 0xc(t6) +; j 0xc +; .byte 0xf0, 0x01, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; add t6, t6, a0 +; .byte 0xa7, 0x88, 0x0f, 0x02 +; addi sp, sp, 0x70 +; fld fs0, -8(sp) +; fld fs2, -0x10(sp) +; fld fs3, -0x18(sp) +; fld fs4, -0x20(sp) +; fld fs5, -0x28(sp) +; fld fs6, -0x30(sp) +; fld fs7, -0x38(sp) +; fld fs8, -0x40(sp) 
+;   fld fs9, -0x48(sp)
+;   fld fs10, -0x50(sp)
+;   fld fs11, -0x58(sp)
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-iadd.clif b/cranelift/filetests/filetests/isa/riscv64/simd-iadd.clif
new file mode 100644
index 0000000000..1fb20ca92f
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/riscv64/simd-iadd.clif
@@ -0,0 +1,73 @@
+test compile precise-output
+set unwind_info=false
+target riscv64 has_v
+
+
+function %iadd_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = iadd v0, v1
+    return v2
+}
+
+; VCode:
+; block0:
+;   vadd.vv v10,v11,v10 #avl=16, #vtype=(e8, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0x57, 0x05, 0xb5, 0x02
+;   ret
+
+function %iadd_i16x8(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = iadd v0, v1
+    return v2
+}
+
+; VCode:
+; block0:
+;   vadd.vv v10,v11,v10 #avl=8, #vtype=(e16, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0x70, 0x84, 0xcc
+;   .byte 0x57, 0x05, 0xb5, 0x02
+;   ret
+
+function %iadd_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = iadd v0, v1
+    return v2
+}
+
+; VCode:
+; block0:
+;   vadd.vv v10,v11,v10 #avl=4, #vtype=(e32, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0x70, 0x02, 0xcd
+;   .byte 0x57, 0x05, 0xb5, 0x02
+;   ret
+
+function %iadd_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = iadd v0, v1
+    return v2
+}
+
+; VCode:
+; block0:
+;   vadd.vv v10,v11,v10 #avl=2, #vtype=(e64, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0x70, 0x81, 0xcd
+;   .byte 0x57, 0x05, 0xb5, 0x02
+;   ret
+
diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-loads.clif b/cranelift/filetests/filetests/isa/riscv64/simd-loads.clif
new file mode 100644
index 0000000000..c0a8e85512
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/riscv64/simd-loads.clif
@@ -0,0 +1,97 @@
+test compile precise-output
+set unwind_info=false
+target riscv64 has_v
+
+
+function %load_i8x16(i64) -> i8x16 {
+block0(v0: i64):
+    v1 = load.i8x16 v0
+    return v1
+}
+
+; VCode:
+; block0:
+;   vle8.v v10,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   auipc t6, 0
+;   ld t6, 0xc(t6)
+;   j 0xc
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   add t6, t6, a0
+;   .byte 0x07, 0x85, 0x0f, 0x02
+;   ret
+
+function %load_i16x8(i64) -> i16x8 {
+block0(v0: i64):
+    v1 = load.i16x8 v0
+    return v1
+}
+
+; VCode:
+; block0:
+;   vle16.v v10,0(a0) #avl=8, #vtype=(e16, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0x70, 0x84, 0xcc
+;   auipc t6, 0
+;   ld t6, 0xc(t6)
+;   j 0xc
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   add t6, t6, a0
+;   .byte 0x07, 0xd5, 0x0f, 0x02
+;   ret
+
+function %load_i32x4(i64) -> i32x4 {
+block0(v0: i64):
+    v1 = load.i32x4 v0
+    return v1
+}
+
+; VCode:
+; block0:
+;   vle32.v v10,0(a0) #avl=4, #vtype=(e32, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0x70, 0x02, 0xcd
+;   auipc t6, 0
+;   ld t6, 0xc(t6)
+;   j 0xc
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   add t6, t6, a0
+;   .byte 0x07, 0xe5, 0x0f, 0x02
+;   ret
+
+function %load_i64x2(i64) -> i64x2 {
+block0(v0: i64):
+    v1 = load.i64x2 v0
+    return v1
+}
+
+; VCode:
+; block0:
+;   vle64.v v10,0(a0) #avl=2, #vtype=(e64, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0x70, 0x81, 0xcd
+;   auipc t6, 0
+;   ld t6, 0xc(t6)
+;   j 0xc
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   add t6, t6, a0
+;   .byte 0x07, 0xf5, 0x0f, 0x02
+;   ret
+
diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-stores.clif b/cranelift/filetests/filetests/isa/riscv64/simd-stores.clif
new file mode 100644
index 0000000000..f969243c11
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/riscv64/simd-stores.clif
@@ -0,0 +1,97 @@
+test compile precise-output
+set unwind_info=false
+target riscv64 has_v
+
+
+function %store_i8x16(i64, i8x16) {
+block0(v0: i64, v1: i8x16):
+    store.i8x16 v1, v0
+    return
+}
+
+; VCode:
+; block0:
+;   vse8.v v10,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   auipc t6, 0
+;   ld t6, 0xc(t6)
+;   j 0xc
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   add t6, t6, a0
+;   .byte 0x27, 0x85, 0x0f, 0x02
+;   ret
+
+function %store_i16x8(i64, i16x8) {
+block0(v0: i64, v1: i16x8):
+    store.i16x8 v1, v0
+    return
+}
+
+; VCode:
+; block0:
+;   vse16.v v10,0(a0) #avl=8, #vtype=(e16, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0x70, 0x84, 0xcc
+;   auipc t6, 0
+;   ld t6, 0xc(t6)
+;   j 0xc
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   add t6, t6, a0
+;   .byte 0x27, 0xd5, 0x0f, 0x02
+;   ret
+
+function %store_i32x4(i64, i32x4) {
+block0(v0: i64, v1: i32x4):
+    store.i32x4 v1, v0
+    return
+}
+
+; VCode:
+; block0:
+;   vse32.v v10,0(a0) #avl=4, #vtype=(e32, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0x70, 0x02, 0xcd
+;   auipc t6, 0
+;   ld t6, 0xc(t6)
+;   j 0xc
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   add t6, t6, a0
+;   .byte 0x27, 0xe5, 0x0f, 0x02
+;   ret
+
+function %store_i64x2(i64, i64x2) {
+block0(v0: i64, v1: i64x2):
+    store.i64x2 v1, v0
+    return
+}
+
+; VCode:
+; block0:
+;   vse64.v v10,0(a0) #avl=2, #vtype=(e64, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0x70, 0x81, 0xcd
+;   auipc t6, 0
+;   ld t6, 0xc(t6)
+;   j 0xc
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   .byte 0x00, 0x00, 0x00, 0x00
+;   add t6, t6, a0
+;   .byte 0x27, 0xf5, 0x0f, 0x02
+;   ret
+
diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-vstate.clif b/cranelift/filetests/filetests/isa/riscv64/simd-vstate.clif
new file mode 100644
index 0000000000..dcc7960693
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/riscv64/simd-vstate.clif
@@ -0,0 +1,68 @@
+test compile precise-output
+set unwind_info=false
+target riscv64 has_v
+
+;; Interleaves vector operations to ensure that `vsetivli` is emitted
+function %iadd_multi(i8x16, i16x8) -> i8x16, i16x8 {
+block0(v0: i8x16, v1: i16x8):
+    v4 = iadd v0, v0
+    v5 = iadd v1, v1
+    v6 = iadd v5, v5
+    return v4, v6
+}
+
+; VCode:
+; block0:
+;   vadd.vv v10,v10,v10 #avl=16, #vtype=(e8, m1, ta, ma)
+;   vadd.vv v5,v11,v11 #avl=8, #vtype=(e16, m1, ta, ma)
+;   vadd.vv v11,v5,v5 #avl=8, #vtype=(e16, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0x57, 0x05, 0xa5, 0x02
+;   .byte 0x57, 0x70, 0x84, 0xcc
+;   .byte 0xd7, 0x82, 0xb5, 0x02
+;   .byte 0xd7, 0x85, 0x52, 0x02
+;   ret
+
+;; When the block changes, we need to reemit the vector state instruction
+;; Even if vtype is the same.
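+;; (Each block below therefore starts with the same `vsetivli` bytes,
+;; 0x57 0x70 0x08 0xcc, in the disassembly: a block may be entered from
+;; predecessors whose dynamic vector configuration is unknown here, so the
+;; tracked state is re-established at every block start.)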
+function %(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = iadd v0, v1
+    jump block1(v1, v2)
+
+block1(v3: i8x16, v4: i8x16):
+    v5 = iadd v3, v4
+    jump block2(v4, v5)
+
+block2(v6: i8x16, v7: i8x16):
+    v8 = iadd v6, v7
+    return v8
+}
+
+; VCode:
+; block0:
+;   vadd.vv v5,v11,v10 #avl=16, #vtype=(e8, m1, ta, ma)
+;   j label1
+; block1:
+;   vadd.vv v6,v5,v11 #avl=16, #vtype=(e8, m1, ta, ma)
+;   j label2
+; block2:
+;   vadd.vv v10,v6,v5 #avl=16, #vtype=(e8, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0xd7, 0x02, 0xb5, 0x02
+; block1: ; offset 0x8
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0x57, 0x83, 0x55, 0x02
+; block2: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0x57, 0x85, 0x62, 0x02
+;   ret
+
diff --git a/cranelift/filetests/filetests/runtests/simd-arithmetic.clif b/cranelift/filetests/filetests/runtests/simd-arithmetic.clif
index a150cf3986..ca620aa843 100644
--- a/cranelift/filetests/filetests/runtests/simd-arithmetic.clif
+++ b/cranelift/filetests/filetests/runtests/simd-arithmetic.clif
@@ -6,21 +6,6 @@ set enable_simd
 target x86_64
 target x86_64 skylake
 
-function %iadd_i32x4(i32x4, i32x4) -> i32x4 {
-block0(v0:i32x4, v1:i32x4):
-    v2 = iadd v0, v1
-    return v2
-}
-; run: %iadd_i32x4([1 1 1 1], [1 2 3 4]) == [2 3 4 5]
-
-function %iadd_i8x16_with_overflow() -> i8x16 {
-block0:
-    v0 = vconst.i8x16 [255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255]
-    v1 = vconst.i8x16 [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
-    v2 = iadd v0, v1
-    return v2
-}
-; run: %iadd_i8x16_with_overflow() == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 
 function %isub_i32x4(i32x4, i32x4) -> i32x4 {
 block0(v0: i32x4, v1: i32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-iadd.clif b/cranelift/filetests/filetests/runtests/simd-iadd.clif
new file mode 100644
index 0000000000..c1bfa50d18
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/simd-iadd.clif
@@ -0,0 +1,44 @@
+test interpret
+test run
+target aarch64
+target s390x
+set enable_simd
+target x86_64
+target x86_64 skylake
+target riscv64 has_v
+
+
+function %iadd_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0:i8x16, v1:i8x16):
+    v2 = iadd v0, v1
+    return v2
+}
+; run: %iadd_i8x16([1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17]
+; run: %iadd_i8x16([2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2], [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]) == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
+
+
+function %iadd_i16x8(i16x8, i16x8) -> i16x8 {
+block0(v0:i16x8, v1:i16x8):
+    v2 = iadd v0, v1
+    return v2
+}
+; run: %iadd_i16x8([1 1 1 1 1 1 1 1], [1 2 3 4 5 6 7 8]) == [2 3 4 5 6 7 8 9]
+; run: %iadd_i16x8([2 2 2 2 2 2 2 2], [-1 -1 -1 -1 -1 -1 -1 -1]) == [1 1 1 1 1 1 1 1]
+
+
+function %iadd_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v0:i32x4, v1:i32x4):
+    v2 = iadd v0, v1
+    return v2
+}
+; run: %iadd_i32x4([1 1 1 1], [1 2 3 4]) == [2 3 4 5]
+; run: %iadd_i32x4([2 2 2 2], [-1 -1 -1 -1]) == [1 1 1 1]
+
+
+function %iadd_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0:i64x2, v1:i64x2):
+    v2 = iadd v0, v1
+    return v2
+}
+; run: %iadd_i64x2([1 1], [1 2]) == [2 3]
+; run: %iadd_i64x2([2 2], [-1 -1]) == [1 1]
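
Editorial note on the raw `.byte` sequences above: each change of vector
configuration emits a `vsetivli` that encodes the `#avl`/`#vtype` annotation
printed in the VCode. The following standalone Rust sketch packs the fields as
laid out in the RISC-V V extension spec v1.0 and cross-checks two of the byte
sequences; `vtype_bits` and `vsetivli` are hypothetical helper names for
illustration, not Cranelift APIs or code from this patch.

    /// Pack a vtype immediate: vlmul in bits [2:0], vsew in [5:3],
    /// vta (tail agnostic) at bit 6, vma (mask agnostic) at bit 7.
    fn vtype_bits(vsew: u32, vlmul: u32, vta: bool, vma: bool) -> u32 {
        vlmul | (vsew << 3) | ((vta as u32) << 6) | ((vma as u32) << 7)
    }

    /// Encode `vsetivli rd, uimm, vtypei`: bits [31:30] = 0b11, vtypei in
    /// [29:20], uimm[4:0] in [19:15], funct3 = 0b111, rd in [11:7], opcode 0x57.
    fn vsetivli(rd: u32, uimm: u32, vtypei: u32) -> u32 {
        (0b11 << 30) | (vtypei << 20) | ((uimm & 0x1f) << 15) | (0b111 << 12) | (rd << 7) | 0x57
    }

    fn main() {
        // #avl=16, #vtype=(e8, m1, ta, ma): the first .byte line of every
        // i8x16 test above.
        let w = vsetivli(0, 16, vtype_bits(0b000, 0b000, true, true));
        assert_eq!(w.to_le_bytes(), [0x57, 0x70, 0x08, 0xcc]);

        // #avl=8, #vtype=(e16, m1, ta, ma): the first .byte line of every
        // i16x8 test above.
        let w = vsetivli(0, 8, vtype_bits(0b001, 0b000, true, true));
        assert_eq!(w.to_le_bytes(), [0x57, 0x70, 0x84, 0xcc]);

        println!("vsetivli encodings match the disassembly");
    }

In other words, `0x57 0x70 0x08 0xcc` is `vsetivli x0, 16, e8, m1, ta, ma`:
set AVL to 16 lanes of e8 (exactly one 128-bit `i8x16` value), writing to x0
since the resulting vl is not needed.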