riscv64: Initial SIMD Vector Implementation (#6240)

* riscv64: Remove unused code

* riscv64: Add vector types

* riscv64: Initial Vector ABI Load/Stores

* riscv64: Vector Loads/Stores

* riscv64: Fix `vsetvli` encoding error

* riscv64: Add SIMD `iadd` runtests

* riscv64: Rename `VecSew`

The SEW name is correct, but only for VType. We also use this type
in loads/stores as the Effective Element Width (EEW), so the name isn't
quite correct in that case.

* ci: Add V extension to RISC-V QEMU

* riscv64: Misc Cleanups

* riscv64: Check V extension in `load`/`store` for SIMD

* riscv64: Fix `sumop` doc comment

* cranelift: Fix comment typo

* riscv64: Add convert for VType and VecElementWidth

* riscv64: Remove VecElementWidth converter
Authored by Afonso Bordado on 2023-04-20 22:54:43 +01:00, committed by GitHub
parent 7ad2fe32c9
commit 60e4a00413
22 changed files with 1945 additions and 137 deletions

View File

@@ -82,7 +82,7 @@ const array = [
"target": "riscv64gc-unknown-linux-gnu", "target": "riscv64gc-unknown-linux-gnu",
"gcc_package": "gcc-riscv64-linux-gnu", "gcc_package": "gcc-riscv64-linux-gnu",
"gcc": "riscv64-linux-gnu-gcc", "gcc": "riscv64-linux-gnu-gcc",
"qemu": "qemu-riscv64 -cpu rv64,zba=true,zbb=true,zbc=true,zbs=true,zbkb=true -L /usr/riscv64-linux-gnu", "qemu": "qemu-riscv64 -cpu rv64,v=true,vlen=256,vext_spec=v1.0,zba=true,zbb=true,zbc=true,zbs=true,zbkb=true -L /usr/riscv64-linux-gnu",
"qemu_target": "riscv64-linux-user", "qemu_target": "riscv64-linux-user",
"name": "Test Linux riscv64", "name": "Test Linux riscv64",
"filter": "linux-riscv64", "filter": "linux-riscv64",

View File

@@ -274,6 +274,7 @@ fn get_isle_compilations(
prelude_isle.clone(),
prelude_lower_isle.clone(),
src_isa_risc_v.join("inst.isle"),
src_isa_risc_v.join("inst_vector.isle"),
src_isa_risc_v.join("lower.isle"),
],
untracked_inputs: vec![clif_lower_isle.clone()],

View File

@@ -329,6 +329,31 @@
(guard_size u32)
(probe_count u32)
(tmp WritableReg))
(VecAluRRR
(op VecAluOpRRR)
(vd WritableReg)
(vs1 Reg)
(vs2 Reg)
(vstate VState))
(VecSetState
(rd WritableReg)
(vstate VState))
(VecLoad
(eew VecElementWidth)
(to WritableReg)
(from VecAMode)
(flags MemFlags)
(vstate VState))
(VecStore
(eew VecElementWidth)
(to VecAMode)
(from Reg)
(flags MemFlags)
(vstate VState))
))
@@ -711,6 +736,9 @@
;; ISA Extension helpers
(decl pure has_v () bool)
(extern constructor has_v has_v)
(decl pure has_zbkb () bool)
(extern constructor has_zbkb has_zbkb)

View File

@@ -1668,55 +1668,6 @@ impl CsrAddress {
}
}
-pub(crate) struct VType {
-vma: bool,
-vta: bool,
-vsew: Vsew,
-valmul: Vlmul,
-}
-impl VType {
-fn as_u32(self) -> u32 {
-self.valmul.as_u32()
-| self.vsew.as_u32() << 3
-| if self.vta { 1 << 7 } else { 0 }
-| if self.vma { 1 << 8 } else { 0 }
-}
-const fn vill_bit() -> u64 {
-1 << 63
-}
-}
-enum Vlmul {
-vlmul_1_div_8 = 0b101,
-vlmul_1_div_4 = 0b110,
-vlmul_1_div_2 = 0b111,
-vlmul_1 = 0b000,
-vlmul_2 = 0b001,
-vlmul_4 = 0b010,
-vlmul_8 = 0b011,
-}
-impl Vlmul {
-fn as_u32(self) -> u32 {
-self as u32
-}
-}
-enum Vsew {
-sew_8 = 0b000,
-sew_16 = 0b001,
-sew_32 = 0b010,
-sew_64 = 0b011,
-}
-impl Vsew {
-fn as_u32(self) -> u32 {
-self as u32
-}
-}
impl CsrOP {
pub(crate) fn op_name(self) -> &'static str {
match self {
@@ -1754,40 +1705,11 @@ impl CsrOP {
if self.need_rs() {
reg_to_gpr_num(rs.unwrap())
} else {
-zimm.unwrap().as_u32()
zimm.unwrap().bits()
}
}
}
-enum Vxrm {
-// round-to-nearest-up (add +0.5 LSB)
-rnu = 0b00,
-// round-to-nearest-even
-rne = 0b01,
-//round-down (truncate)
-rdn = 0b10,
-// round-to-odd (OR bits into LSB, aka "jam")
-rod = 0b11,
-}
-impl Vxrm {
-pub(crate) fn as_u32(self) -> u32 {
-self as u32
-}
-}
-pub(crate) struct Vcsr {
-xvrm: Vxrm,
-// Fixed-point accrued saturation flag
-vxsat: bool,
-}
-impl Vcsr {
-pub(crate) fn as_u32(self) -> u32 {
-return if self.vxsat { 1 } else { 0 } | self.xvrm.as_u32();
-}
-}
///Atomic Memory ordering.
#[derive(Copy, Clone, Debug)]
pub enum AMO {

View File

@@ -102,6 +102,13 @@ pub(crate) fn reg_to_gpr_num(m: Reg) -> u32 {
u32::try_from(m.to_real_reg().unwrap().hw_enc() & 31).unwrap()
}
#[derive(Clone, Debug, PartialEq, Default)]
pub enum EmitVState {
#[default]
Unknown,
Known(VState),
}
/// State carried between emissions of a sequence of instructions.
#[derive(Default, Clone, Debug)]
pub struct EmitState {
@@ -114,6 +121,9 @@ pub struct EmitState {
/// Only used during fuzz-testing. Otherwise, it is a zero-sized struct and
/// optimized away at compiletime. See [cranelift_control].
ctrl_plane: ControlPlane,
/// Vector State
/// Controls the current state of the vector unit at the emission point.
vstate: EmitVState,
}
impl EmitState {
@@ -141,6 +151,7 @@ impl MachInstEmitState<Inst> for EmitState {
stack_map: None,
cur_srcloc: RelSourceLoc::default(),
ctrl_plane,
vstate: EmitVState::Unknown,
}
}
@@ -159,6 +170,11 @@ impl MachInstEmitState<Inst> for EmitState {
fn take_ctrl_plane(self) -> ControlPlane {
self.ctrl_plane
}
fn on_new_block(&mut self) {
// Reset the vector state.
self.vstate = EmitVState::Unknown;
}
}
impl Inst {
@@ -386,6 +402,80 @@ impl Inst {
}
insts
}
/// Returns Some(VState) if this instruction is expecting a specific vector state
/// before emission.
fn expected_vstate(&self) -> Option<&VState> {
match self {
Inst::Nop0
| Inst::Nop4
| Inst::BrTable { .. }
| Inst::Auipc { .. }
| Inst::Lui { .. }
| Inst::LoadConst32 { .. }
| Inst::LoadConst64 { .. }
| Inst::AluRRR { .. }
| Inst::FpuRRR { .. }
| Inst::AluRRImm12 { .. }
| Inst::Load { .. }
| Inst::Store { .. }
| Inst::Args { .. }
| Inst::Ret { .. }
| Inst::Extend { .. }
| Inst::AjustSp { .. }
| Inst::Call { .. }
| Inst::CallInd { .. }
| Inst::TrapIf { .. }
| Inst::Jal { .. }
| Inst::CondBr { .. }
| Inst::LoadExtName { .. }
| Inst::LoadAddr { .. }
| Inst::VirtualSPOffsetAdj { .. }
| Inst::Mov { .. }
| Inst::MovFromPReg { .. }
| Inst::Fence { .. }
| Inst::FenceI
| Inst::ECall
| Inst::EBreak
| Inst::Udf { .. }
| Inst::FpuRR { .. }
| Inst::FpuRRRR { .. }
| Inst::Jalr { .. }
| Inst::Atomic { .. }
| Inst::Select { .. }
| Inst::AtomicCas { .. }
| Inst::IntSelect { .. }
| Inst::Csr { .. }
| Inst::Icmp { .. }
| Inst::SelectReg { .. }
| Inst::FcvtToInt { .. }
| Inst::RawData { .. }
| Inst::AtomicStore { .. }
| Inst::AtomicLoad { .. }
| Inst::AtomicRmwLoop { .. }
| Inst::TrapIfC { .. }
| Inst::Unwind { .. }
| Inst::DummyUse { .. }
| Inst::FloatRound { .. }
| Inst::FloatSelect { .. }
| Inst::FloatSelectPseudo { .. }
| Inst::Popcnt { .. }
| Inst::Rev8 { .. }
| Inst::Cltz { .. }
| Inst::Brev8 { .. }
| Inst::StackProbeLoop { .. } => None,
// VecSetState does not expect any particular vstate; rather, it updates it.
Inst::VecSetState { .. } => None,
Inst::VecAluRRR { vstate, .. } |
// TODO: Unit-stride loads and stores only need the AVL to be correct, not
// the full vtype. A future optimization could be to decouple these two when
// updating vstate. This would allow us to avoid emitting a VecSetState in
// some cases.
Inst::VecLoad { vstate, .. }
| Inst::VecStore { vstate, .. } => Some(vstate),
}
}
}
impl MachInstEmit for Inst {
@@ -400,6 +490,19 @@
state: &mut EmitState,
) {
let mut allocs = AllocationConsumer::new(allocs);
// Check if we need to update the vector state before emitting this instruction
if let Some(expected) = self.expected_vstate() {
if state.vstate != EmitVState::Known(expected.clone()) {
// Update the vector state.
Inst::VecSetState {
rd: writable_zero_reg(),
vstate: expected.clone(),
}
.emit(&[], sink, emit_info, state);
}
}
// N.B.: we *must* not exceed the "worst-case size" used to compute
// where to insert islands, except when islands are explicitly triggered
// (with an `EmitIsland`). We check this in debug builds. This is `mut`
@@ -530,13 +633,14 @@ impl MachInstEmit for Inst {
(rs1, rs2)
};
-let x: u32 = alu_op.op_code()
-| reg_to_gpr_num(rd.to_reg()) << 7
-| (alu_op.funct3()) << 12
-| reg_to_gpr_num(rs1) << 15
-| reg_to_gpr_num(rs2) << 20
-| alu_op.funct7() << 25;
-sink.put4(x);
sink.put4(encode_r_type(
alu_op.op_code(),
rd.to_reg(),
alu_op.funct3(),
rs1,
rs2,
alu_op.funct7(),
));
}
&Inst::AluRRImm12 {
alu_op,
@@ -2695,6 +2799,120 @@ impl MachInstEmit for Inst {
.emit(&[], sink, emit_info, state);
sink.bind_label(label_done, &mut state.ctrl_plane);
}
&Inst::VecAluRRR {
op, vd, vs1, vs2, ..
} => {
let vs1 = allocs.next(vs1);
let vs2 = allocs.next(vs2);
let vd = allocs.next_writable(vd);
// This is the mask bit. We don't yet implement masking, so set it to 1,
// which means masking is disabled.
let vm = 1;
sink.put4(encode_valu(
op.opcode(),
vd.to_reg(),
op.funct3(),
vs1,
vs2,
vm,
op.funct6(),
));
}
&Inst::VecSetState { rd, ref vstate } => {
let rd = allocs.next_writable(rd);
sink.put4(encode_vcfg_imm(
0x57,
rd.to_reg(),
vstate.avl.unwrap_static(),
&vstate.vtype,
));
// Update the current vector emit state.
state.vstate = EmitVState::Known(vstate.clone());
}
&Inst::VecLoad {
eew,
to,
ref from,
flags,
..
} => {
let offset = from.get_offset_with_state(state);
let from_reg = allocs.next(from.get_base_register());
let to = allocs.next_writable(to);
// Vector loads don't support immediate offsets, so we need to load the offset into a register.
let addr = writable_spilltmp_reg();
LoadConstant::U64(offset as u64)
.load_constant_and_add(addr, from_reg)
.into_iter()
.for_each(|inst| inst.emit(&[], sink, emit_info, state));
let srcloc = state.cur_srcloc();
if !srcloc.is_default() && !flags.notrap() {
// Register the offset at which the actual load instruction starts.
sink.add_trap(TrapCode::HeapOutOfBounds);
}
// This is the mask bit. We don't yet implement masking, so set it to 1,
// which means masking is disabled.
let vm = 1;
sink.put4(encode_vmem_load(
0x07,
to.to_reg(),
eew,
addr.to_reg(),
from.lumop(),
vm,
from.mop(),
from.nf(),
));
}
&Inst::VecStore {
eew,
ref to,
from,
flags,
..
} => {
let offset = to.get_offset_with_state(state);
let to_reg = allocs.next(to.get_base_register());
let from = allocs.next(from);
// Vector stores don't support immediate offsets, so we need to load the offset into a register.
let addr = writable_spilltmp_reg();
LoadConstant::U64(offset as u64)
.load_constant_and_add(addr, to_reg)
.into_iter()
.for_each(|inst| inst.emit(&[], sink, emit_info, state));
let srcloc = state.cur_srcloc();
if !srcloc.is_default() && !flags.notrap() {
// Register the offset at which the actual store instruction starts.
sink.add_trap(TrapCode::HeapOutOfBounds);
}
// This is the mask bit. We don't yet implement masking, so set it to 1,
// which means masking is disabled.
let vm = 1;
sink.put4(encode_vmem_store(
0x27,
from,
eew,
addr.to_reg(),
to.sumop(),
vm,
to.mop(),
to.nf(),
));
}
};
let end_off = sink.cur_offset();
assert!(
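Taken together, `EmitVState`, `expected_vstate`, and the check at the top of `emit` implement a small peephole: a `vsetivli` is emitted lazily, only when the next vector instruction needs a configuration different from the last known one, and the knowledge is discarded at block boundaries. A minimal, self-contained sketch of that idea (hypothetical simplified types, not the actual Cranelift API):

#[derive(Clone, PartialEq)]
struct VState { avl: u8, vtype: u32 } // simplified stand-in for the real VState

#[derive(Default)]
struct Emitter { known: Option<VState>, words: Vec<String> }

impl Emitter {
    // Called at every block boundary: the vector unit state is no longer known.
    fn on_new_block(&mut self) { self.known = None; }

    // Emit `inst`, inserting a vsetivli first if the expected state differs
    // from the last known state.
    fn emit(&mut self, inst: &str, expected: Option<&VState>) {
        if let Some(exp) = expected {
            if self.known.as_ref() != Some(exp) {
                self.words.push(format!("vsetivli zero, {}, vtype={:#x}", exp.avl, exp.vtype));
                self.known = Some(exp.clone());
            }
        }
        self.words.push(inst.to_string());
    }
}

fn main() {
    let e8m1 = VState { avl: 16, vtype: 0xc0 };
    let mut em = Emitter::default();
    em.emit("vadd.vv v10,v11,v10", Some(&e8m1));
    em.emit("vadd.vv v12,v10,v10", Some(&e8m1)); // state already known: no vsetivli
    em.on_new_block();
    em.emit("vadd.vv v13,v12,v12", Some(&e8m1)); // re-emitted after the block boundary
    for w in em.words { println!("{w}"); }
}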

View File

@@ -0,0 +1,128 @@
//! Contains the RISC-V instruction encoding logic.
//!
//! These formats are specified in the RISC-V specification in section 2.2.
//! See: https://riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
//!
//! Some instructions, especially in extensions, have slight variations from
//! the base RISC-V specification.
use super::{UImm5, VType};
use crate::isa::riscv64::inst::reg_to_gpr_num;
use crate::isa::riscv64::lower::isle::generated_code::VecElementWidth;
use crate::Reg;
/// Encode an R-type instruction.
///
/// Layout:
/// 0-------6-7-------11-12------14-15------19-20------24-25-------31
/// | Opcode | rd | funct3 | rs1 | rs2 | funct7 |
pub fn encode_r_type(opcode: u32, rd: Reg, funct3: u32, rs1: Reg, rs2: Reg, funct7: u32) -> u32 {
let mut bits = 0;
bits |= opcode & 0b1111111;
bits |= reg_to_gpr_num(rd) << 7;
bits |= (funct3 & 0b111) << 12;
bits |= reg_to_gpr_num(rs1) << 15;
bits |= reg_to_gpr_num(rs2) << 20;
bits |= (funct7 & 0b1111111) << 25;
bits
}
/// Encodes a Vector ALU instruction.
///
/// Fields:
/// - opcode (7 bits)
/// - vd (5 bits)
/// - funct3 (3 bits)
/// - vs1 (5 bits)
/// - vs2 (5 bits)
/// - vm (1 bit)
/// - funct6 (6 bits)
///
/// See: https://github.com/riscv/riscv-v-spec/blob/master/valu-format.adoc
pub fn encode_valu(
opcode: u32,
vd: Reg,
funct3: u32,
vs1: Reg,
vs2: Reg,
vm: u32,
funct6: u32,
) -> u32 {
let funct6 = funct6 & 0b111111;
let vm = vm & 0b1;
// vm is instruction bit 25 (the low bit of funct7); funct6 occupies bits 26-31.
let funct7 = (funct6 << 1) | vm;
encode_r_type(opcode, vd, funct3, vs1, vs2, funct7)
}
/// Encodes a Vector CFG Imm instruction.
///
/// See: https://github.com/riscv/riscv-v-spec/blob/master/vcfg-format.adoc
// TODO: Check if this is any of the known instruction types in the spec.
pub fn encode_vcfg_imm(opcode: u32, rd: Reg, imm: UImm5, vtype: &VType) -> u32 {
let mut bits = 0;
bits |= opcode & 0b1111111;
bits |= reg_to_gpr_num(rd) << 7;
bits |= 0b111 << 12;
bits |= (imm.bits() & 0b11111) << 15;
bits |= (vtype.encode() & 0b1111111111) << 20;
bits |= 0b11 << 30;
bits
}
/// Encodes a Vector Mem Unit Stride Load instruction.
///
/// See: https://github.com/riscv/riscv-v-spec/blob/master/vmem-format.adoc
/// TODO: These instructions share opcode space with LOAD-FP and STORE-FP
pub fn encode_vmem_load(
opcode: u32,
vd: Reg,
width: VecElementWidth,
rs1: Reg,
lumop: u32,
vm: u32,
mop: u32,
nf: u32,
) -> u32 {
// Width is encoded differently to avoid a clash with the FP load/store sizes.
let width = match width {
VecElementWidth::E8 => 0b000,
VecElementWidth::E16 => 0b101,
VecElementWidth::E32 => 0b110,
VecElementWidth::E64 => 0b111,
};
let mut bits = 0;
bits |= opcode & 0b1111111;
bits |= reg_to_gpr_num(vd) << 7;
bits |= width << 12;
bits |= reg_to_gpr_num(rs1) << 15;
bits |= (lumop & 0b11111) << 20;
bits |= (vm & 0b1) << 25;
bits |= (mop & 0b11) << 26;
// The mew bit (inst[28]) when set is expected to be used to encode expanded
// memory sizes of 128 bits and above, but these encodings are currently reserved.
bits |= 0b0 << 28;
bits |= (nf & 0b111) << 29;
bits
}
/// Encodes a Vector Mem Unit Stride Store instruction.
///
/// See: https://github.com/riscv/riscv-v-spec/blob/master/vmem-format.adoc
/// TODO: These instructions share opcode space with LOAD-FP and STORE-FP
pub fn encode_vmem_store(
opcode: u32,
vs3: Reg,
width: VecElementWidth,
rs1: Reg,
sumop: u32,
vm: u32,
mop: u32,
nf: u32,
) -> u32 {
// This is pretty much the same as the load instruction, just
// with different names on the fields.
encode_vmem_load(opcode, vs3, width, rs1, sumop, vm, mop, nf)
}
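A quick way to sanity-check the packing above is to assemble a couple of words by hand and compare them against the byte sequences in the precise-output tests later in this commit. A small standalone sketch (values transcribed from those tests; illustrative only, not part of the diff):

fn main() {
    // vsetivli zero, 16, e8, m1, ta, ma:
    // vtype = vma << 7 | vta << 6 | sew << 3 | lmul, with sew = e8 = 0b000 and lmul = m1 = 0b000.
    let vtype: u32 = (1 << 7) | (1 << 6); // 0xc0
    // opcode 0x57, rd = x0, funct3 = 0b111, uimm at bit 15, vtype at bit 20, bits 30-31 = 0b11.
    let vsetivli: u32 = 0x57 | (0b111 << 12) | (16 << 15) | (vtype << 20) | (0b11 << 30);
    assert_eq!(vsetivli, 0xcc08_7057); // `.byte 0x57, 0x70, 0x08, 0xcc` in the tests

    // vle8.v v10, (t6): opcode 0x07, vd = v10, width e8 = 0b000, rs1 = x31 (t6),
    // lumop = 0, vm = 1 (unmasked), mop = 0 (unit-stride), mew = 0, nf = 0.
    let vle8: u32 = 0x07 | (10 << 7) | (31 << 15) | (1 << 25);
    assert_eq!(vle8, 0x020f_8507); // `.byte 0x07, 0x85, 0x0f, 0x02` in the tests

    println!("vsetivli = {vsetivli:#010x}, vle8 = {vle8:#010x}");
}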

View File

@@ -99,33 +99,31 @@ impl Display for Imm20 {
}
}
-#[derive(Clone, Copy)]
-pub struct Uimm5 {
-bits: u8,
-}
-impl Uimm5 {
-pub fn from_bits(bits: u8) -> Self {
-Self { bits }
-}
-/// Create a zero immediate of this format.
-pub fn zero() -> Self {
-Self { bits: 0 }
-}
-pub fn as_u32(&self) -> u32 {
-(self.bits as u32) & 0b1_1111
-}
-}
-impl Debug for Uimm5 {
-fn fmt(&self, f: &mut Formatter<'_>) -> Result {
-write!(f, "{}", self.bits)
-}
-}
-impl Display for Uimm5 {
-fn fmt(&self, f: &mut Formatter<'_>) -> Result {
-write!(f, "{}", self.bits)
-}
-}
/// An unsigned 5-bit immediate.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct UImm5 {
value: u8,
}
impl UImm5 {
/// Create an unsigned 5-bit immediate from u8.
pub fn maybe_from_u8(value: u8) -> Option<UImm5> {
if value < 32 {
Some(UImm5 { value })
} else {
None
}
}
/// Bits for encoding.
pub fn bits(&self) -> u32 {
u32::from(self.value)
}
}
impl Display for UImm5 {
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
write!(f, "{}", self.value)
}
}

View File

@@ -4,9 +4,10 @@
#![allow(dead_code)]
#![allow(non_camel_case_types)]
use super::lower::isle::generated_code::{VecAMode, VecElementWidth};
use crate::binemit::{Addend, CodeOffset, Reloc};
pub use crate::ir::condcodes::IntCC;
-use crate::ir::types::{F32, F64, I128, I16, I32, I64, I8, R32, R64};
use crate::ir::types::{self, F32, F64, I128, I16, I32, I64, I8, R32, R64};
pub use crate::ir::{ExternalName, MemFlags, Opcode, SourceLoc, Type, ValueLabel};
use crate::isa::CallConv;
@@ -29,6 +30,10 @@ pub mod args;
pub use self::args::*;
pub mod emit;
pub use self::emit::*;
pub mod vector;
pub use self::vector::*;
pub mod encode;
pub use self::encode::*;
pub mod unwind;
use crate::isa::riscv64::abi::Riscv64MachineDeps;
@@ -41,7 +46,7 @@ use std::fmt::{Display, Formatter};
pub(crate) type OptionReg = Option<Reg>;
pub(crate) type OptionImm12 = Option<Imm12>;
pub(crate) type VecBranchTarget = Vec<BranchTarget>;
-pub(crate) type OptionUimm5 = Option<Uimm5>;
pub(crate) type OptionUimm5 = Option<UImm5>;
pub(crate) type OptionFloatRoundingMode = Option<FRM>;
pub(crate) type VecU8 = Vec<u8>;
pub(crate) type VecWritableReg = Vec<Writable<Reg>>;
@@ -313,6 +318,15 @@ impl Inst {
/// Generic constructor for a load (zero-extending where appropriate).
pub fn gen_load(into_reg: Writable<Reg>, mem: AMode, ty: Type, flags: MemFlags) -> Inst {
if ty.is_vector() {
Inst::VecLoad {
eew: VecElementWidth::from_type(ty),
to: into_reg,
from: VecAMode::UnitStride { base: mem },
flags,
vstate: VState::from_type(ty),
}
} else {
Inst::Load {
rd: into_reg,
op: LoadOP::from_type(ty),
@@ -320,9 +334,19 @@
flags,
}
}
}
/// Generic constructor for a store.
pub fn gen_store(mem: AMode, from_reg: Reg, ty: Type, flags: MemFlags) -> Inst {
if ty.is_vector() {
Inst::VecStore {
eew: VecElementWidth::from_type(ty),
to: VecAMode::UnitStride { base: mem },
from: from_reg,
flags,
vstate: VState::from_type(ty),
}
} else {
Inst::Store {
src: from_reg,
op: StoreOP::from_type(ty),
@@ -330,6 +354,7 @@
flags,
}
}
}
}
//============================================================================= //=============================================================================
@@ -623,6 +648,22 @@ fn riscv64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
// gen_prologue is called at emit stage.
// No need to let regalloc know.
}
&Inst::VecAluRRR { vd, vs1, vs2, .. } => {
collector.reg_use(vs1);
collector.reg_use(vs2);
collector.reg_def(vd);
}
&Inst::VecSetState { rd, .. } => {
collector.reg_def(rd);
}
&Inst::VecLoad { to, ref from, .. } => {
collector.reg_use(from.get_base_register());
collector.reg_def(to);
}
&Inst::VecStore { ref to, from, .. } => {
collector.reg_use(to.get_base_register());
collector.reg_use(from);
}
}
}
@@ -727,6 +768,7 @@ impl MachInst for Inst {
F32 => Ok((&[RegClass::Float], &[F32])),
F64 => Ok((&[RegClass::Float], &[F64])),
I128 => Ok((&[RegClass::Int, RegClass::Int], &[I64, I64])),
_ if ty.is_vector() && ty.bits() == 128 => Ok((&[RegClass::Float], &[types::I8X16])),
_ => Err(CodegenError::Unsupported(format!(
"Unexpected SSA-value type: {}",
ty
@@ -784,6 +826,17 @@ pub fn reg_name(reg: Reg) -> String {
}
}
}
pub fn vec_reg_name(reg: Reg) -> String {
match reg.to_real_reg() {
Some(real) => {
assert_eq!(real.class(), RegClass::Float);
format!("v{}", real.hw_enc())
}
None => {
format!("{:?}", reg)
}
}
}
impl Inst {
fn print_with_state(
@@ -795,6 +848,16 @@ impl Inst {
let reg = allocs.next(reg);
reg_name(reg)
};
let format_vec_reg = |reg: Reg, allocs: &mut AllocationConsumer<'_>| -> String {
let reg = allocs.next(reg);
vec_reg_name(reg)
};
let format_vec_amode = |amode: &VecAMode, allocs: &mut AllocationConsumer<'_>| -> String {
match amode {
VecAMode::UnitStride { base } => base.to_string_with_alloc(allocs),
}
};
let format_regs = |regs: &[Reg], allocs: &mut AllocationConsumer<'_>| -> String {
let mut x = if regs.len() > 1 {
@@ -839,6 +902,7 @@
"".into()
}
}
match self {
&Inst::Nop0 => {
format!("##zero length nop")
@@ -1501,6 +1565,48 @@ impl Inst {
&MInst::Udf { trap_code } => format!("udf##trap_code={}", trap_code),
&MInst::EBreak {} => String::from("ebreak"),
&MInst::ECall {} => String::from("ecall"),
&Inst::VecAluRRR {
op,
vd,
vs1,
vs2,
ref vstate,
} => {
let vs1_s = format_vec_reg(vs1, allocs);
let vs2_s = format_vec_reg(vs2, allocs);
let vd_s = format_vec_reg(vd.to_reg(), allocs);
// Note: vs2 and vs1 here are opposite to the standard scalar ordering.
// This is noted in Section 10.1 of the RISC-V Vector spec.
format!("{} {},{},{} {}", op, vd_s, vs2_s, vs1_s, vstate)
}
&Inst::VecSetState { rd, ref vstate } => {
let rd_s = format_reg(rd.to_reg(), allocs);
assert!(vstate.avl.is_static());
format!("vsetivli {}, {}, {}", rd_s, vstate.avl, vstate.vtype)
}
Inst::VecLoad {
eew,
to,
from,
ref vstate,
..
} => {
let base = format_vec_amode(from, allocs);
let vd = format_vec_reg(to.to_reg(), allocs);
format!("vl{}.v {},{} {}", eew, vd, base, vstate)
}
Inst::VecStore {
eew,
to,
from,
ref vstate,
..
} => {
let dst = format_vec_amode(to, allocs);
let vs3 = format_vec_reg(*from, allocs);
format!("vs{}.v {},{} {}", eew, vs3, dst, vstate)
}
}
}
}

View File

@@ -218,3 +218,13 @@ pub(crate) fn x_reg_range(start: usize, end: usize) -> Vec<Writable<Reg>> {
}
regs
}
#[inline]
pub fn v_reg(enc: usize) -> Reg {
let p_reg = PReg::new(enc, RegClass::Float);
let v_reg = VReg::new(p_reg.index(), p_reg.class());
Reg::from(v_reg)
}
pub fn vx_reg(enc: usize) -> PReg {
PReg::new(enc, RegClass::Float)
}

View File

@@ -0,0 +1,289 @@
use crate::isa::riscv64::inst::EmitState;
use crate::isa::riscv64::lower::isle::generated_code::{
VecAMode, VecAluOpRRR, VecAvl, VecElementWidth, VecLmul, VecMaskMode, VecTailMode,
};
use crate::Reg;
use core::fmt;
use super::{Type, UImm5};
impl VecAvl {
pub fn _static(size: u32) -> Self {
VecAvl::Static {
size: UImm5::maybe_from_u8(size as u8).expect("Invalid size for AVL"),
}
}
pub fn is_static(&self) -> bool {
match self {
VecAvl::Static { .. } => true,
}
}
pub fn unwrap_static(&self) -> UImm5 {
match self {
VecAvl::Static { size } => *size,
}
}
}
// TODO: Can we tell ISLE to derive this?
impl PartialEq for VecAvl {
fn eq(&self, other: &Self) -> bool {
match (self, other) {
(VecAvl::Static { size: lhs }, VecAvl::Static { size: rhs }) => lhs == rhs,
}
}
}
impl fmt::Display for VecAvl {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
VecAvl::Static { size } => write!(f, "{}", size),
}
}
}
impl VecElementWidth {
pub fn from_type(ty: Type) -> Self {
Self::from_bits(ty.lane_bits())
}
pub fn from_bits(bits: u32) -> Self {
match bits {
8 => VecElementWidth::E8,
16 => VecElementWidth::E16,
32 => VecElementWidth::E32,
64 => VecElementWidth::E64,
_ => panic!("Invalid number of bits for VecElementWidth: {}", bits),
}
}
pub fn bits(&self) -> u32 {
match self {
VecElementWidth::E8 => 8,
VecElementWidth::E16 => 16,
VecElementWidth::E32 => 32,
VecElementWidth::E64 => 64,
}
}
pub fn encode(&self) -> u32 {
match self {
VecElementWidth::E8 => 0b000,
VecElementWidth::E16 => 0b001,
VecElementWidth::E32 => 0b010,
VecElementWidth::E64 => 0b011,
}
}
}
impl fmt::Display for VecElementWidth {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "e{}", self.bits())
}
}
impl VecLmul {
pub fn encode(&self) -> u32 {
match self {
VecLmul::LmulF8 => 0b101,
VecLmul::LmulF4 => 0b110,
VecLmul::LmulF2 => 0b111,
VecLmul::Lmul1 => 0b000,
VecLmul::Lmul2 => 0b001,
VecLmul::Lmul4 => 0b010,
VecLmul::Lmul8 => 0b011,
}
}
}
impl fmt::Display for VecLmul {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
VecLmul::LmulF8 => write!(f, "mf8"),
VecLmul::LmulF4 => write!(f, "mf4"),
VecLmul::LmulF2 => write!(f, "mf2"),
VecLmul::Lmul1 => write!(f, "m1"),
VecLmul::Lmul2 => write!(f, "m2"),
VecLmul::Lmul4 => write!(f, "m4"),
VecLmul::Lmul8 => write!(f, "m8"),
}
}
}
impl VecTailMode {
pub fn encode(&self) -> u32 {
match self {
VecTailMode::Agnostic => 1,
VecTailMode::Undisturbed => 0,
}
}
}
impl fmt::Display for VecTailMode {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
VecTailMode::Agnostic => write!(f, "ta"),
VecTailMode::Undisturbed => write!(f, "tu"),
}
}
}
impl VecMaskMode {
pub fn encode(&self) -> u32 {
match self {
VecMaskMode::Agnostic => 1,
VecMaskMode::Undisturbed => 0,
}
}
}
impl fmt::Display for VecMaskMode {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
VecMaskMode::Agnostic => write!(f, "ma"),
VecMaskMode::Undisturbed => write!(f, "mu"),
}
}
}
/// Vector Type (VType)
///
/// vtype provides the default type used to interpret the contents of the vector register file.
#[derive(Clone, Debug, PartialEq)]
pub struct VType {
pub sew: VecElementWidth,
pub lmul: VecLmul,
pub tail_mode: VecTailMode,
pub mask_mode: VecMaskMode,
}
impl VType {
// https://github.com/riscv/riscv-v-spec/blob/master/vtype-format.adoc
pub fn encode(&self) -> u32 {
let mut bits = 0;
bits |= self.lmul.encode();
bits |= self.sew.encode() << 3;
bits |= self.tail_mode.encode() << 6;
bits |= self.mask_mode.encode() << 7;
bits
}
}
impl fmt::Display for VType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"{}, {}, {}, {}",
self.sew, self.lmul, self.tail_mode, self.mask_mode
)
}
}
/// Vector State (VState)
///
/// VState represents the state of the vector unit that each instruction expects before execution.
/// Unlike VType or any of the other types here, VState is not a part of the RISC-V ISA. It is
/// used by our instruction emission code to ensure that the vector unit is in the correct state.
#[derive(Clone, Debug, PartialEq)]
pub struct VState {
pub avl: VecAvl,
pub vtype: VType,
}
impl VState {
pub fn from_type(ty: Type) -> Self {
VState {
avl: VecAvl::_static(ty.lane_count()),
vtype: VType {
sew: VecElementWidth::from_type(ty),
lmul: VecLmul::Lmul1,
tail_mode: VecTailMode::Agnostic,
mask_mode: VecMaskMode::Agnostic,
},
}
}
}
impl fmt::Display for VState {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "#avl={}, #vtype=({})", self.avl, self.vtype)
}
}
impl VecAluOpRRR {
pub fn opcode(&self) -> u32 {
match self {
VecAluOpRRR::Vadd => 0x57,
}
}
pub fn funct3(&self) -> u32 {
match self {
VecAluOpRRR::Vadd => 0b000,
}
}
pub fn funct6(&self) -> u32 {
match self {
VecAluOpRRR::Vadd => 0b000000,
}
}
}
impl fmt::Display for VecAluOpRRR {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
VecAluOpRRR::Vadd => write!(f, "vadd.vv"),
}
}
}
impl VecAMode {
pub fn get_base_register(&self) -> Reg {
match self {
VecAMode::UnitStride { base, .. } => base.get_base_register(),
}
}
pub(crate) fn get_offset_with_state(&self, state: &EmitState) -> i64 {
match self {
VecAMode::UnitStride { base, .. } => base.get_offset_with_state(state),
}
}
/// `mop` field, described in Table 7 of Section 7.2. Vector Load/Store Addressing Modes
/// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#72-vector-loadstore-addressing-modes
pub fn mop(&self) -> u32 {
match self {
VecAMode::UnitStride { .. } => 0b00,
}
}
/// `lumop` field, described in Table 9 of Section 7.2. Vector Load/Store Addressing Modes
/// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#72-vector-loadstore-addressing-modes
pub fn lumop(&self) -> u32 {
match self {
VecAMode::UnitStride { .. } => 0b00000,
}
}
/// `sumop` field, described in Table 10 of Section 7.2. Vector Load/Store Addressing Modes
/// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#72-vector-loadstore-addressing-modes
pub fn sumop(&self) -> u32 {
match self {
VecAMode::UnitStride { .. } => 0b00000,
}
}
/// The `nf[2:0]` field encodes the number of fields in each segment. For regular vector loads and
/// stores, nf=0, indicating that a single value is moved between a vector register group and memory
/// at each element position. Larger values in the nf field are used to access multiple contiguous
/// fields within a segment as described in Section 7.8 Vector Load/Store Segment Instructions.
///
/// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#72-vector-loadstore-addressing-modes
pub fn nf(&self) -> u32 {
match self {
VecAMode::UnitStride { .. } => 0b000,
}
}
}
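`VState::from_type` above is the glue between CLIF SIMD types and the vector configuration: AVL comes from the lane count, SEW from the lane width, and LMUL/tail/mask are fixed at m1/ta/ma. A tiny sketch of the resulting mapping for the 128-bit types exercised by the tests (illustrative only, not part of the diff):

fn main() {
    // (CLIF type, lane count, lane bits) -> the #avl/#vtype annotations
    // printed in the precise-output tests below.
    let cases = [("i8x16", 16, 8), ("i16x8", 8, 16), ("i32x4", 4, 32), ("i64x2", 2, 64)];
    for (ty, lanes, bits) in cases {
        println!("{ty}: #avl={lanes}, #vtype=(e{bits}, m1, ta, ma)");
    }
}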

View File

@@ -0,0 +1,132 @@
;; Represents the possible widths of an element when used in an operation.
(type VecElementWidth (enum
(E8)
(E16)
(E32)
(E64)
))
;; Vector Register Group Multiplier (LMUL)
;;
;; The LMUL setting specifies how we should group registers together. LMUL can
;; also be a fractional value, reducing the number of bits used in a single
;; vector register. Fractional LMUL is used to increase the number of effective
;; usable vector register groups when operating on mixed-width values.
(type VecLmul (enum
(LmulF8)
(LmulF4)
(LmulF2)
(Lmul1)
(Lmul2)
(Lmul4)
(Lmul8)
))
;; Tail Mode
;;
;; The tail mode specifies how the tail elements of a vector register are handled.
(type VecTailMode (enum
;; Tail Agnostic means that the tail elements are left in an undefined state.
(Agnostic)
;; Tail Undisturbed means that the tail elements are left in their original values.
(Undisturbed)
))
;; Mask Mode
;;
;; The mask mode specifies how the masked elements of a vector register are handled.
(type VecMaskMode (enum
;; Mask Agnostic means that the masked out elements are left in an undefined state.
(Agnostic)
;; Mask Undisturbed means that the masked out elements are left in their original values.
(Undisturbed)
))
;; Application Vector Length (AVL)
;;
;; This setting specifies the number of elements that are going to be processed
;; in a single instruction. Note: We may end up processing fewer elements than
;; the AVL setting, if they don't fit in a single register.
(type VecAvl (enum
;; Static AVL emits a `vsetivli` that uses a constant value
(Static (size UImm5))
;; TODO: Add a dynamic, register based AVL mode when we are able to properly test it
))
(type VType (primitive VType))
(type VState (primitive VState))
;; Register to Register ALU Ops
(type VecAluOpRRR (enum
(Vadd)
))
;; Vector Addressing Mode
(type VecAMode (enum
;; Vector unit-stride operations access elements stored contiguously in memory
;; starting from the base effective address.
(UnitStride
(base AMode))
;; TODO: Constant Stride
;; TODO: Indexed Operations
))
;; Builds a static VState matching a SIMD type.
;; The VState is guaranteed to be static with AVL set to the number of lanes.
;; Element size is set to the size of the type.
;; LMUL is set to 1.
;; Tail mode is set to agnostic.
;; Mask mode is set to agnostic.
(decl pure vstate_from_type (Type) VState)
(extern constructor vstate_from_type vstate_from_type)
(convert Type VState vstate_from_type)
;; Extracts an element width from a SIMD type.
(decl pure element_width_from_type (Type) VecElementWidth)
(rule (element_width_from_type ty)
(if-let $I8 (lane_type ty))
(VecElementWidth.E8))
(rule (element_width_from_type ty)
(if-let $I16 (lane_type ty))
(VecElementWidth.E16))
(rule (element_width_from_type ty)
(if-let $I32 (lane_type ty))
(VecElementWidth.E32))
(rule (element_width_from_type ty)
(if-let $I64 (lane_type ty))
(VecElementWidth.E64))
;;;; Instruction Helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; As noted in the RISC-V Vector Extension Specification, rs2 is the first
;; source register and rs1 is the second source register. This is the opposite
;; of the usual RISC-V register order.
;; See Section 10.1 of the RISC-V Vector Extension Specification.
;; Helper for emitting `MInst.VecAluRRR` instructions.
(decl vec_alu_rrr (VecAluOpRRR Reg Reg VState) Reg)
(rule (vec_alu_rrr op vs2 vs1 vstate)
(let ((vd WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.VecAluRRR op vd vs2 vs1 vstate))))
vd))
;; Helper for emitting `MInst.VecLoad` instructions.
(decl vec_load (VecElementWidth VecAMode MemFlags VState) Reg)
(rule (vec_load eew from flags vstate)
(let ((vd WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.VecLoad eew vd from flags vstate))))
vd))
;; Helper for emitting `MInst.VecStore` instructions.
(decl vec_store (VecElementWidth VecAMode Reg MemFlags VState) InstOutput)
(rule (vec_store eew to from flags vstate)
(side_effect
(SideEffectNoResult.Inst (MInst.VecStore eew to from flags vstate))))
;; Helper for emitting the `vadd.vv` instruction.
(decl rv_vadd_vv (Reg Reg VState) Reg)
(rule (rv_vadd_vv vs2 vs1 vstate)
(vec_alu_rrr (VecAluOpRRR.Vadd) vs2 vs1 vstate))

View File

@@ -97,6 +97,11 @@
(high Reg (rv_add high_tmp carry)))
(value_regs low high)))
;; SIMD Vectors
(rule 8 (lower (has_type (ty_vec128_int ty) (iadd x y)))
(if-let $true (has_v))
(rv_vadd_vv x y ty))
;;; Rules for `uadd_overflow_trap` ;;;;;;;;;;;;;
(rule
(lower (has_type (fits_in_64 ty) (uadd_overflow_trap x y tc)))
@@ -809,6 +814,12 @@
(lower (has_type $I128 (load flags p @ (value_type (ty_addr64 _)) offset)))
(gen_load_128 p offset flags))
(rule 2
(lower (has_type (ty_vec128_int ty) (load flags p @ (value_type (ty_addr64 _)) offset)))
(if-let $true (has_v))
(let ((eew VecElementWidth (element_width_from_type ty)))
(vec_load eew (VecAMode.UnitStride (gen_amode p offset $I64)) flags ty)))
;;;;; Rules for `istore8`;;;;;;;;;
(rule
(lower (istore8 flags x p @ (value_type (ty_addr64 _)) offset))
@@ -833,6 +844,12 @@
(lower (store flags x @ (value_type $I128 ) p @ (value_type (ty_addr64 _)) offset))
(gen_store_128 p offset flags x))
(rule 2
(lower (store flags x @ (value_type (ty_vec128_int ty)) p @ (value_type (ty_addr64 _)) offset))
(if-let $true (has_v))
(let ((eew VecElementWidth (element_width_from_type ty)))
(vec_store eew (VecAMode.UnitStride (gen_amode p offset $I64)) x flags ty)))
(decl gen_icmp (IntCC ValueRegs ValueRegs Type) Reg)
(rule
(gen_icmp cc x y ty)

View File

@@ -283,6 +283,10 @@ impl generated_code::Context for IsleContext<'_, '_, MInst, Riscv64Backend> {
ValueRegs::two(shamt, len_sub_shamt)
}
fn has_v(&mut self) -> bool {
self.backend.isa_flags.has_v()
}
fn has_zbkb(&mut self) -> bool {
self.backend.isa_flags.has_zbkb()
}
@@ -428,6 +432,11 @@ impl generated_code::Context for IsleContext<'_, '_, MInst, Riscv64Backend> {
rs2,
}
}
#[inline]
fn vstate_from_type(&mut self, ty: Type) -> VState {
VState::from_type(ty)
}
}
impl IsleContext<'_, '_, MInst, Riscv64Backend> {

View File

@@ -288,6 +288,9 @@ pub trait MachInstEmitState<I: VCodeInst>: Default + Clone + Debug {
/// Used to continue using a control plane after the emission state is
/// not needed anymore.
fn take_ctrl_plane(self) -> ControlPlane;
/// A hook that triggers when first emitting a new block.
/// It is guaranteed to be called before any instructions are emitted.
fn on_new_block(&mut self) {}
}
/// The result of a `MachBackend::compile_function()` call. Contains machine

View File

@@ -843,6 +843,11 @@ impl<I: VCodeInst> VCode<I> {
for (block_order_idx, &block) in final_order.iter().enumerate() {
trace!("emitting block {:?}", block);
// Call the new block hook for state
state.on_new_block();
// Emit NOPs to align the block.
let new_offset = I::align_basic_block(buffer.cur_offset());
while new_offset > buffer.cur_offset() {
// Pad with NOPs up to the aligned block offset.

View File

@@ -0,0 +1,578 @@
test compile precise-output
target riscv64 has_v
;; Tests both ABI and Regalloc spill/reload.
function %simd_spill(
i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4,
i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4,
i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4,
i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4,
;; These cannot fit in registers.
i32x4, i32x4
) ->
i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4,
i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4,
i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4,
i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4,
;; These cannot fit in registers.
i32x4, i32x4 system_v
{
block0(
v0:i32x4, v1:i32x4, v2:i32x4, v3:i32x4, v4:i32x4, v5:i32x4, v6:i32x4, v7:i32x4,
v8:i32x4, v9:i32x4, v10:i32x4, v11:i32x4, v12:i32x4, v13:i32x4, v14:i32x4, v15:i32x4,
v16:i32x4, v17:i32x4, v18:i32x4, v19:i32x4, v20:i32x4, v21:i32x4, v22:i32x4, v23:i32x4,
v24:i32x4, v25:i32x4, v26:i32x4, v27:i32x4, v28:i32x4, v29:i32x4, v30:i32x4, v31:i32x4,
v32:i32x4, v33:i32x4
):
;; This just reverses the args
return v33, v32,
v31, v30, v29, v28, v27, v26, v25, v24,
v23, v22, v21, v20, v19, v18, v17, v16,
v15, v14, v13, v12, v11, v10, v9, v8,
v7, v6, v5, v4, v3, v2, v1, v0
}
; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; fsd fs0,-8(sp)
; fsd fs2,-16(sp)
; fsd fs3,-24(sp)
; fsd fs4,-32(sp)
; fsd fs5,-40(sp)
; fsd fs6,-48(sp)
; fsd fs7,-56(sp)
; fsd fs8,-64(sp)
; fsd fs9,-72(sp)
; fsd fs10,-80(sp)
; fsd fs11,-88(sp)
; add sp,-112
; block0:
; fsd fa0,0(nominal_sp)
; fsd fa1,8(nominal_sp)
; vle8.v v28,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v29,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v30,48(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v31,64(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v0,80(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v1,96(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v2,112(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v3,128(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v5,144(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v7,160(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v4,176(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v6,192(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v25,208(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v27,224(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v9,240(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v19,256(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v21,272(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v23,288(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v26,304(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v8,320(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v18,336(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v20,352(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v22,368(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v24,384(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v11,400(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v10,416(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v24,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v22,16(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v20,32(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v18,48(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v8,64(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v26,80(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v23,96(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v21,112(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v19,128(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v9,144(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v27,160(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v25,176(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v6,192(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v4,208(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v7,224(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v5,240(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v3,256(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v2,272(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v1,288(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v0,304(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v31,320(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v30,336(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v29,352(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v28,368(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v17,384(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v16,400(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v15,416(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v14,432(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v13,448(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v12,464(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; fld fa4,8(nominal_sp)
; vse8.v v14,480(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; fld fa7,0(nominal_sp)
; vse8.v v17,496(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; add sp,+112
; fld fs0,-8(sp)
; fld fs2,-16(sp)
; fld fs3,-24(sp)
; fld fs4,-32(sp)
; fld fs5,-40(sp)
; fld fs6,-48(sp)
; fld fs7,-56(sp)
; fld fs8,-64(sp)
; fld fs9,-72(sp)
; fld fs10,-80(sp)
; fld fs11,-88(sp)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; fsd fs0, -8(sp)
; fsd fs2, -0x10(sp)
; fsd fs3, -0x18(sp)
; fsd fs4, -0x20(sp)
; fsd fs5, -0x28(sp)
; fsd fs6, -0x30(sp)
; fsd fs7, -0x38(sp)
; fsd fs8, -0x40(sp)
; fsd fs9, -0x48(sp)
; fsd fs10, -0x50(sp)
; fsd fs11, -0x58(sp)
; addi sp, sp, -0x70
; block1: ; offset 0x40
; fsd fa0, 0(sp)
; fsd fa1, 8(sp)
; .byte 0x57, 0x70, 0x08, 0xcc
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x10, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x07, 0x8e, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x20, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x87, 0x8e, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x30, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x07, 0x8f, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x40, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x87, 0x8f, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x50, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x07, 0x80, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x60, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x87, 0x80, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x70, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x07, 0x81, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x80, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x87, 0x81, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x90, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x87, 0x82, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xa0, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x87, 0x83, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xb0, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x07, 0x82, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xc0, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x07, 0x83, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xd0, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x87, 0x8c, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xe0, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x87, 0x8d, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xf0, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x87, 0x84, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x00, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x87, 0x89, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x10, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x87, 0x8a, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x20, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x87, 0x8b, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x30, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x07, 0x8d, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x40, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x07, 0x84, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x50, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x07, 0x89, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x60, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x07, 0x8a, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x70, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x07, 0x8b, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x80, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x07, 0x8c, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x90, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x87, 0x85, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xa0, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, s0
; .byte 0x07, 0x85, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0x8c, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x10, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0x8b, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x20, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0x8a, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x30, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0x89, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x40, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0x84, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x50, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0x8d, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x60, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0xa7, 0x8b, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x70, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0xa7, 0x8a, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x80, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0xa7, 0x89, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x90, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0xa7, 0x84, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xa0, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0xa7, 0x8d, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xb0, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0xa7, 0x8c, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xc0, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0x83, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xd0, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0x82, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xe0, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0xa7, 0x83, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xf0, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0xa7, 0x82, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x00, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0xa7, 0x81, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x10, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0x81, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x20, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0xa7, 0x80, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x30, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0x80, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x40, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0xa7, 0x8f, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x50, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0x8f, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x60, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0xa7, 0x8e, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x70, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0x8e, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x80, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0xa7, 0x88, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x90, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0x88, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xa0, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0xa7, 0x87, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xb0, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0x87, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xc0, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0xa7, 0x86, 0x0f, 0x02
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xd0, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0x86, 0x0f, 0x02
; fld fa4, 8(sp)
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xe0, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0x87, 0x0f, 0x02
; fld fa7, 0(sp)
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0xf0, 0x01, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0xa7, 0x88, 0x0f, 0x02
; addi sp, sp, 0x70
; fld fs0, -8(sp)
; fld fs2, -0x10(sp)
; fld fs3, -0x18(sp)
; fld fs4, -0x20(sp)
; fld fs5, -0x28(sp)
; fld fs6, -0x30(sp)
; fld fs7, -0x38(sp)
; fld fs8, -0x40(sp)
; fld fs9, -0x48(sp)
; fld fs10, -0x50(sp)
; fld fs11, -0x58(sp)
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

View File

@@ -0,0 +1,73 @@
test compile precise-output
set unwind_info=false
target riscv64 has_v
function %iadd_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = iadd v0, v1
return v2
}
; VCode:
; block0:
; vadd.vv v10,v11,v10 #avl=16, #vtype=(e8, m1, ta, ma)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0x57, 0x05, 0xb5, 0x02
; ret
function %iadd_i16x8(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = iadd v0, v1
return v2
}
; VCode:
; block0:
; vadd.vv v10,v11,v10 #avl=8, #vtype=(e16, m1, ta, ma)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; .byte 0x57, 0x70, 0x84, 0xcc
; .byte 0x57, 0x05, 0xb5, 0x02
; ret
function %iadd_i32x4(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = iadd v0, v1
return v2
}
; VCode:
; block0:
; vadd.vv v10,v11,v10 #avl=4, #vtype=(e32, m1, ta, ma)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; .byte 0x57, 0x70, 0x02, 0xcd
; .byte 0x57, 0x05, 0xb5, 0x02
; ret
function %iadd_i64x2(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = iadd v0, v1
return v2
}
; VCode:
; block0:
; vadd.vv v10,v11,v10 #avl=2, #vtype=(e64, m1, ta, ma)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; .byte 0x57, 0x70, 0x81, 0xcd
; .byte 0x57, 0x05, 0xb5, 0x02
; ret

View File

@@ -0,0 +1,97 @@
test compile precise-output
set unwind_info=false
target riscv64 has_v
function %load_i8x16(i64) -> i8x16 {
block0(v0: i64):
v1 = load.i8x16 v0
return v1
}
; VCode:
; block0:
; vle8.v v10,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; .byte 0x57, 0x70, 0x08, 0xcc
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x07, 0x85, 0x0f, 0x02
; ret
function %load_i16x8(i64) -> i16x8 {
block0(v0: i64):
v1 = load.i16x8 v0
return v1
}
; VCode:
; block0:
; vle16.v v10,0(a0) #avl=8, #vtype=(e16, m1, ta, ma)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; .byte 0x57, 0x70, 0x84, 0xcc
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x07, 0xd5, 0x0f, 0x02
; ret
function %load_i32x4(i64) -> i32x4 {
block0(v0: i64):
v1 = load.i32x4 v0
return v1
}
; VCode:
; block0:
; vle32.v v10,0(a0) #avl=4, #vtype=(e32, m1, ta, ma)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; .byte 0x57, 0x70, 0x02, 0xcd
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x07, 0xe5, 0x0f, 0x02
; ret
function %load_i64x2(i64) -> i64x2 {
block0(v0: i64):
v1 = load.i64x2 v0
return v1
}
; VCode:
; block0:
; vle64.v v10,0(a0) #avl=2, #vtype=(e64, m1, ta, ma)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; .byte 0x57, 0x70, 0x81, 0xcd
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x07, 0xf5, 0x0f, 0x02
; ret

View File

@@ -0,0 +1,97 @@
test compile precise-output
set unwind_info=false
target riscv64 has_v
function %store_i8x16(i64, i8x16) {
block0(v0: i64, v1: i8x16):
store.i8x16 v1, v0
return
}
; VCode:
; block0:
; vse8.v v10,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; .byte 0x57, 0x70, 0x08, 0xcc
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0x85, 0x0f, 0x02
; ret
function %store_i16x8(i64, i16x8) {
block0(v0: i64, v1: i16x8):
store.i16x8 v1, v0
return
}
; VCode:
; block0:
; vse16.v v10,0(a0) #avl=8, #vtype=(e16, m1, ta, ma)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; .byte 0x57, 0x70, 0x84, 0xcc
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0xd5, 0x0f, 0x02
; ret
function %store_i32x4(i64, i32x4) {
block0(v0: i64, v1: i32x4):
store.i32x4 v1, v0
return
}
; VCode:
; block0:
; vse32.v v10,0(a0) #avl=4, #vtype=(e32, m1, ta, ma)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; .byte 0x57, 0x70, 0x02, 0xcd
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0xe5, 0x0f, 0x02
; ret
function %store_i64x2(i64, i64x2) {
block0(v0: i64, v1: i64x2):
store.i64x2 v1, v0
return
}
; VCode:
; block0:
; vse64.v v10,0(a0) #avl=2, #vtype=(e64, m1, ta, ma)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; .byte 0x57, 0x70, 0x81, 0xcd
; auipc t6, 0
; ld t6, 0xc(t6)
; j 0xc
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x00, 0x00, 0x00
; add t6, t6, a0
; .byte 0x27, 0xf5, 0x0f, 0x02
; ret

View File

@@ -0,0 +1,68 @@
test compile precise-output
set unwind_info=false
target riscv64 has_v
;; Interleaves vector operations to ensure that `vsetivli` is emitted only when
;; the vector state changes.
function %iadd_multi(i8x16, i16x8) -> i8x16, i16x8 {
block0(v0: i8x16, v1: i16x8):
v4 = iadd v0, v0
v5 = iadd v1, v1
v6 = iadd v5, v5
return v4, v6
}
; VCode:
; block0:
; vadd.vv v10,v10,v10 #avl=16, #vtype=(e8, m1, ta, ma)
; vadd.vv v5,v11,v11 #avl=8, #vtype=(e16, m1, ta, ma)
; vadd.vv v11,v5,v5 #avl=8, #vtype=(e16, m1, ta, ma)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0x57, 0x05, 0xa5, 0x02
; .byte 0x57, 0x70, 0x84, 0xcc
; .byte 0xd7, 0x82, 0xb5, 0x02
; .byte 0xd7, 0x85, 0x52, 0x02
; ret
;; When the block changes, we need to re-emit the vector state instruction,
;; even if the vtype is the same.
function %(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = iadd v0, v1
jump block1(v1, v2)
block1(v3: i8x16, v4: i8x16):
v5 = iadd v3, v4
jump block2(v4, v5)
block2(v6: i8x16, v7: i8x16):
v8 = iadd v6, v7
return v8
}
; VCode:
; block0:
; vadd.vv v5,v11,v10 #avl=16, #vtype=(e8, m1, ta, ma)
; j label1
; block1:
; vadd.vv v6,v5,v11 #avl=16, #vtype=(e8, m1, ta, ma)
; j label2
; block2:
; vadd.vv v10,v6,v5 #avl=16, #vtype=(e8, m1, ta, ma)
; ret
;
; Disassembled:
; block0: ; offset 0x0
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0xd7, 0x02, 0xb5, 0x02
; block1: ; offset 0x8
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0x57, 0x83, 0x55, 0x02
; block2: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0x57, 0x85, 0x62, 0x02
; ret

View File

@@ -6,21 +6,6 @@ set enable_simd
target x86_64
target x86_64 skylake
-function %iadd_i32x4(i32x4, i32x4) -> i32x4 {
-block0(v0:i32x4, v1:i32x4):
-v2 = iadd v0, v1
-return v2
-}
-; run: %iadd_i32x4([1 1 1 1], [1 2 3 4]) == [2 3 4 5]
-function %iadd_i8x16_with_overflow() -> i8x16 {
-block0:
-v0 = vconst.i8x16 [255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255]
-v1 = vconst.i8x16 [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
-v2 = iadd v0, v1
-return v2
-}
-; run: %iadd_i8x16_with_overflow() == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
function %isub_i32x4(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):

View File

@@ -0,0 +1,44 @@
test interpret
test run
target aarch64
target s390x
set enable_simd
target x86_64
target x86_64 skylake
target riscv64 has_v
function %iadd_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0:i8x16, v1:i8x16):
v2 = iadd v0, v1
return v2
}
; run: %iadd_i8x16([1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17]
; run: %iadd_i8x16([2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2], [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]) == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
function %iadd_i16x8(i16x8, i16x8) -> i16x8 {
block0(v0:i16x8, v1:i16x8):
v2 = iadd v0, v1
return v2
}
; run: %iadd_i16x8([1 1 1 1 1 1 1 1], [1 2 3 4 5 6 7 8]) == [2 3 4 5 6 7 8 9]
; run: %iadd_i16x8([2 2 2 2 2 2 2 2], [-1 -1 -1 -1 -1 -1 -1 -1]) == [1 1 1 1 1 1 1 1]
function %iadd_i32x4(i32x4, i32x4) -> i32x4 {
block0(v0:i32x4, v1:i32x4):
v2 = iadd v0, v1
return v2
}
; run: %iadd_i32x4([1 1 1 1], [1 2 3 4]) == [2 3 4 5]
; run: %iadd_i32x4([2 2 2 2], [-1 -1 -1 -1]) == [1 1 1 1]
function %iadd_i64x2(i64x2, i64x2) -> i64x2 {
block0(v0:i64x2, v1:i64x2):
v2 = iadd v0, v1
return v2
}
; run: %iadd_i64x2([1 1], [1 2]) == [2 3]
; run: %iadd_i64x2([2 2], [-1 -1]) == [1 1]