riscv64: Initial SIMD Vector Implementation (#6240)
* riscv64: Remove unused code * riscv64: Add vector types * riscv64: Initial Vector ABI Load/Stores * riscv64: Vector Loads/Stores * riscv64: Fix `vsetvli` encoding error * riscv64: Add SIMD `iadd` runtests * riscv64: Rename `VecSew` The SEW name is correct, but only for VType. We also use this type in loads/stores as the Effective Element Width, so the name isn't quite correct in that case. * ci: Add V extension to RISC-V QEMU * riscv64: Misc Cleanups * riscv64: Check V extension in `load`/`store` for SIMD * riscv64: Fix `sumop` doc comment * cranelift: Fix comment typo * riscv64: Add convert for VType and VecElementWidth * riscv64: Remove VecElementWidth converter
This commit is contained in:
@@ -82,7 +82,7 @@ const array = [
|
||||
"target": "riscv64gc-unknown-linux-gnu",
|
||||
"gcc_package": "gcc-riscv64-linux-gnu",
|
||||
"gcc": "riscv64-linux-gnu-gcc",
|
||||
"qemu": "qemu-riscv64 -cpu rv64,zba=true,zbb=true,zbc=true,zbs=true,zbkb=true -L /usr/riscv64-linux-gnu",
|
||||
"qemu": "qemu-riscv64 -cpu rv64,v=true,vlen=256,vext_spec=v1.0,zba=true,zbb=true,zbc=true,zbs=true,zbkb=true -L /usr/riscv64-linux-gnu",
|
||||
"qemu_target": "riscv64-linux-user",
|
||||
"name": "Test Linux riscv64",
|
||||
"filter": "linux-riscv64",
|
||||
|
||||
@@ -274,6 +274,7 @@ fn get_isle_compilations(
|
||||
prelude_isle.clone(),
|
||||
prelude_lower_isle.clone(),
|
||||
src_isa_risc_v.join("inst.isle"),
|
||||
src_isa_risc_v.join("inst_vector.isle"),
|
||||
src_isa_risc_v.join("lower.isle"),
|
||||
],
|
||||
untracked_inputs: vec![clif_lower_isle.clone()],
|
||||
|
||||
@@ -329,6 +329,31 @@
|
||||
(guard_size u32)
|
||||
(probe_count u32)
|
||||
(tmp WritableReg))
|
||||
|
||||
(VecAluRRR
|
||||
(op VecAluOpRRR)
|
||||
(vd WritableReg)
|
||||
(vs1 Reg)
|
||||
(vs2 Reg)
|
||||
(vstate VState))
|
||||
|
||||
(VecSetState
|
||||
(rd WritableReg)
|
||||
(vstate VState))
|
||||
|
||||
(VecLoad
|
||||
(eew VecElementWidth)
|
||||
(to WritableReg)
|
||||
(from VecAMode)
|
||||
(flags MemFlags)
|
||||
(vstate VState))
|
||||
|
||||
(VecStore
|
||||
(eew VecElementWidth)
|
||||
(to VecAMode)
|
||||
(from Reg)
|
||||
(flags MemFlags)
|
||||
(vstate VState))
|
||||
))
|
||||
|
||||
|
||||
@@ -711,6 +736,9 @@
|
||||
|
||||
;; ISA Extension helpers
|
||||
|
||||
(decl pure has_v () bool)
|
||||
(extern constructor has_v has_v)
|
||||
|
||||
(decl pure has_zbkb () bool)
|
||||
(extern constructor has_zbkb has_zbkb)
|
||||
|
||||
|
||||
@@ -1668,55 +1668,6 @@ impl CsrAddress {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct VType {
|
||||
vma: bool,
|
||||
vta: bool,
|
||||
vsew: Vsew,
|
||||
valmul: Vlmul,
|
||||
}
|
||||
|
||||
impl VType {
|
||||
fn as_u32(self) -> u32 {
|
||||
self.valmul.as_u32()
|
||||
| self.vsew.as_u32() << 3
|
||||
| if self.vta { 1 << 7 } else { 0 }
|
||||
| if self.vma { 1 << 8 } else { 0 }
|
||||
}
|
||||
|
||||
const fn vill_bit() -> u64 {
|
||||
1 << 63
|
||||
}
|
||||
}
|
||||
|
||||
enum Vlmul {
|
||||
vlmul_1_div_8 = 0b101,
|
||||
vlmul_1_div_4 = 0b110,
|
||||
vlmul_1_div_2 = 0b111,
|
||||
vlmul_1 = 0b000,
|
||||
vlmul_2 = 0b001,
|
||||
vlmul_4 = 0b010,
|
||||
vlmul_8 = 0b011,
|
||||
}
|
||||
|
||||
impl Vlmul {
|
||||
fn as_u32(self) -> u32 {
|
||||
self as u32
|
||||
}
|
||||
}
|
||||
|
||||
enum Vsew {
|
||||
sew_8 = 0b000,
|
||||
sew_16 = 0b001,
|
||||
sew_32 = 0b010,
|
||||
sew_64 = 0b011,
|
||||
}
|
||||
|
||||
impl Vsew {
|
||||
fn as_u32(self) -> u32 {
|
||||
self as u32
|
||||
}
|
||||
}
|
||||
|
||||
impl CsrOP {
|
||||
pub(crate) fn op_name(self) -> &'static str {
|
||||
match self {
|
||||
@@ -1754,40 +1705,11 @@ impl CsrOP {
|
||||
if self.need_rs() {
|
||||
reg_to_gpr_num(rs.unwrap())
|
||||
} else {
|
||||
zimm.unwrap().as_u32()
|
||||
zimm.unwrap().bits()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum Vxrm {
|
||||
// round-to-nearest-up (add +0.5 LSB)
|
||||
rnu = 0b00,
|
||||
// round-to-nearest-even
|
||||
rne = 0b01,
|
||||
//round-down (truncate)
|
||||
rdn = 0b10,
|
||||
// round-to-odd (OR bits into LSB, aka "jam")
|
||||
rod = 0b11,
|
||||
}
|
||||
|
||||
impl Vxrm {
|
||||
pub(crate) fn as_u32(self) -> u32 {
|
||||
self as u32
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct Vcsr {
|
||||
xvrm: Vxrm,
|
||||
// Fixed-point accrued saturation flag
|
||||
vxsat: bool,
|
||||
}
|
||||
|
||||
impl Vcsr {
|
||||
pub(crate) fn as_u32(self) -> u32 {
|
||||
return if self.vxsat { 1 } else { 0 } | self.xvrm.as_u32();
|
||||
}
|
||||
}
|
||||
|
||||
/// Atomic Memory ordering.
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub enum AMO {
|
||||
|
||||
@@ -102,6 +102,13 @@ pub(crate) fn reg_to_gpr_num(m: Reg) -> u32 {
|
||||
u32::try_from(m.to_real_reg().unwrap().hw_enc() & 31).unwrap()
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Default)]
|
||||
pub enum EmitVState {
|
||||
#[default]
|
||||
Unknown,
|
||||
Known(VState),
|
||||
}
|
||||
|
||||
/// State carried between emissions of a sequence of instructions.
|
||||
#[derive(Default, Clone, Debug)]
|
||||
pub struct EmitState {
|
||||
@@ -114,6 +121,9 @@ pub struct EmitState {
|
||||
/// Only used during fuzz-testing. Otherwise, it is a zero-sized struct and
|
||||
/// optimized away at compiletime. See [cranelift_control].
|
||||
ctrl_plane: ControlPlane,
|
||||
/// Vector State
|
||||
/// Controls the current state of the vector unit at the emission point.
|
||||
vstate: EmitVState,
|
||||
}
|
||||
|
||||
impl EmitState {
|
||||
@@ -141,6 +151,7 @@ impl MachInstEmitState<Inst> for EmitState {
|
||||
stack_map: None,
|
||||
cur_srcloc: RelSourceLoc::default(),
|
||||
ctrl_plane,
|
||||
vstate: EmitVState::Unknown,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -159,6 +170,11 @@ impl MachInstEmitState<Inst> for EmitState {
|
||||
fn take_ctrl_plane(self) -> ControlPlane {
|
||||
self.ctrl_plane
|
||||
}
|
||||
|
||||
fn on_new_block(&mut self) {
|
||||
// Reset the vector state.
|
||||
self.vstate = EmitVState::Unknown;
|
||||
}
|
||||
}
|
||||
|
||||
impl Inst {
|
||||
@@ -386,6 +402,80 @@ impl Inst {
|
||||
}
|
||||
insts
|
||||
}
|
||||
|
||||
/// Returns Some(VState) if this instruction is expecting a specific vector state
|
||||
/// before emission.
|
||||
fn expected_vstate(&self) -> Option<&VState> {
|
||||
match self {
|
||||
Inst::Nop0
|
||||
| Inst::Nop4
|
||||
| Inst::BrTable { .. }
|
||||
| Inst::Auipc { .. }
|
||||
| Inst::Lui { .. }
|
||||
| Inst::LoadConst32 { .. }
|
||||
| Inst::LoadConst64 { .. }
|
||||
| Inst::AluRRR { .. }
|
||||
| Inst::FpuRRR { .. }
|
||||
| Inst::AluRRImm12 { .. }
|
||||
| Inst::Load { .. }
|
||||
| Inst::Store { .. }
|
||||
| Inst::Args { .. }
|
||||
| Inst::Ret { .. }
|
||||
| Inst::Extend { .. }
|
||||
| Inst::AjustSp { .. }
|
||||
| Inst::Call { .. }
|
||||
| Inst::CallInd { .. }
|
||||
| Inst::TrapIf { .. }
|
||||
| Inst::Jal { .. }
|
||||
| Inst::CondBr { .. }
|
||||
| Inst::LoadExtName { .. }
|
||||
| Inst::LoadAddr { .. }
|
||||
| Inst::VirtualSPOffsetAdj { .. }
|
||||
| Inst::Mov { .. }
|
||||
| Inst::MovFromPReg { .. }
|
||||
| Inst::Fence { .. }
|
||||
| Inst::FenceI
|
||||
| Inst::ECall
|
||||
| Inst::EBreak
|
||||
| Inst::Udf { .. }
|
||||
| Inst::FpuRR { .. }
|
||||
| Inst::FpuRRRR { .. }
|
||||
| Inst::Jalr { .. }
|
||||
| Inst::Atomic { .. }
|
||||
| Inst::Select { .. }
|
||||
| Inst::AtomicCas { .. }
|
||||
| Inst::IntSelect { .. }
|
||||
| Inst::Csr { .. }
|
||||
| Inst::Icmp { .. }
|
||||
| Inst::SelectReg { .. }
|
||||
| Inst::FcvtToInt { .. }
|
||||
| Inst::RawData { .. }
|
||||
| Inst::AtomicStore { .. }
|
||||
| Inst::AtomicLoad { .. }
|
||||
| Inst::AtomicRmwLoop { .. }
|
||||
| Inst::TrapIfC { .. }
|
||||
| Inst::Unwind { .. }
|
||||
| Inst::DummyUse { .. }
|
||||
| Inst::FloatRound { .. }
|
||||
| Inst::FloatSelect { .. }
|
||||
| Inst::FloatSelectPseudo { .. }
|
||||
| Inst::Popcnt { .. }
|
||||
| Inst::Rev8 { .. }
|
||||
| Inst::Cltz { .. }
|
||||
| Inst::Brev8 { .. }
|
||||
| Inst::StackProbeLoop { .. } => None,
|
||||
// VecSetState does not expect any vstate, rather it updates it.
|
||||
Inst::VecSetState { .. } => None,
|
||||
|
||||
Inst::VecAluRRR { vstate, .. } |
|
||||
// TODO: Unit-stride loads and stores only need the AVL to be correct, not
|
||||
// the full vtype. A future optimization could be to decouple these two when
|
||||
// updating vstate. This would allow us to avoid emitting a VecSetState in
|
||||
// some cases.
|
||||
Inst::VecLoad { vstate, .. }
|
||||
| Inst::VecStore { vstate, .. } => Some(vstate),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl MachInstEmit for Inst {
|
||||
@@ -400,6 +490,19 @@ impl MachInstEmit for Inst {
|
||||
state: &mut EmitState,
|
||||
) {
|
||||
let mut allocs = AllocationConsumer::new(allocs);
|
||||
|
||||
// Check if we need to update the vector state before emitting this instruction
|
||||
if let Some(expected) = self.expected_vstate() {
|
||||
if state.vstate != EmitVState::Known(expected.clone()) {
|
||||
// Update the vector state.
|
||||
Inst::VecSetState {
|
||||
rd: writable_zero_reg(),
|
||||
vstate: expected.clone(),
|
||||
}
|
||||
.emit(&[], sink, emit_info, state);
|
||||
}
|
||||
}
|
||||
|
||||
// N.B.: we *must* not exceed the "worst-case size" used to compute
|
||||
// where to insert islands, except when islands are explicitly triggered
|
||||
// (with an `EmitIsland`). We check this in debug builds. This is `mut`
|
||||
@@ -530,13 +633,14 @@ impl MachInstEmit for Inst {
|
||||
(rs1, rs2)
|
||||
};
|
||||
|
||||
let x: u32 = alu_op.op_code()
|
||||
| reg_to_gpr_num(rd.to_reg()) << 7
|
||||
| (alu_op.funct3()) << 12
|
||||
| reg_to_gpr_num(rs1) << 15
|
||||
| reg_to_gpr_num(rs2) << 20
|
||||
| alu_op.funct7() << 25;
|
||||
sink.put4(x);
|
||||
sink.put4(encode_r_type(
|
||||
alu_op.op_code(),
|
||||
rd.to_reg(),
|
||||
alu_op.funct3(),
|
||||
rs1,
|
||||
rs2,
|
||||
alu_op.funct7(),
|
||||
));
|
||||
}
|
||||
&Inst::AluRRImm12 {
|
||||
alu_op,
|
||||
@@ -2695,6 +2799,120 @@ impl MachInstEmit for Inst {
|
||||
.emit(&[], sink, emit_info, state);
|
||||
sink.bind_label(label_done, &mut state.ctrl_plane);
|
||||
}
|
||||
&Inst::VecAluRRR {
|
||||
op, vd, vs1, vs2, ..
|
||||
} => {
|
||||
let vs1 = allocs.next(vs1);
|
||||
let vs2 = allocs.next(vs2);
|
||||
let vd = allocs.next_writable(vd);
|
||||
|
||||
// This is the mask bit, we don't yet implement masking, so set it to 1, which means
|
||||
// masking disabled.
|
||||
let vm = 1;
|
||||
|
||||
sink.put4(encode_valu(
|
||||
op.opcode(),
|
||||
vd.to_reg(),
|
||||
op.funct3(),
|
||||
vs1,
|
||||
vs2,
|
||||
vm,
|
||||
op.funct6(),
|
||||
));
|
||||
}
|
||||
&Inst::VecSetState { rd, ref vstate } => {
|
||||
let rd = allocs.next_writable(rd);
|
||||
|
||||
sink.put4(encode_vcfg_imm(
|
||||
0x57,
|
||||
rd.to_reg(),
|
||||
vstate.avl.unwrap_static(),
|
||||
&vstate.vtype,
|
||||
));
|
||||
|
||||
// Update the current vector emit state.
|
||||
state.vstate = EmitVState::Known(vstate.clone());
|
||||
}
|
||||
|
||||
&Inst::VecLoad {
|
||||
eew,
|
||||
to,
|
||||
ref from,
|
||||
flags,
|
||||
..
|
||||
} => {
|
||||
let offset = from.get_offset_with_state(state);
|
||||
let from_reg = allocs.next(from.get_base_register());
|
||||
let to = allocs.next_writable(to);
|
||||
|
||||
// Vector Loads don't support immediate offsets, so we need to load it into a register.
|
||||
let addr = writable_spilltmp_reg();
|
||||
LoadConstant::U64(offset as u64)
|
||||
.load_constant_and_add(addr, from_reg)
|
||||
.into_iter()
|
||||
.for_each(|inst| inst.emit(&[], sink, emit_info, state));
|
||||
|
||||
let srcloc = state.cur_srcloc();
|
||||
if !srcloc.is_default() && !flags.notrap() {
|
||||
// Register the offset at which the actual load instruction starts.
|
||||
sink.add_trap(TrapCode::HeapOutOfBounds);
|
||||
}
|
||||
|
||||
// This is the mask bit, we don't yet implement masking, so set it to 1, which means
|
||||
// masking disabled.
|
||||
let vm = 1;
|
||||
|
||||
sink.put4(encode_vmem_load(
|
||||
0x07,
|
||||
to.to_reg(),
|
||||
eew,
|
||||
addr.to_reg(),
|
||||
from.lumop(),
|
||||
vm,
|
||||
from.mop(),
|
||||
from.nf(),
|
||||
));
|
||||
}
|
||||
|
||||
&Inst::VecStore {
|
||||
eew,
|
||||
ref to,
|
||||
from,
|
||||
flags,
|
||||
..
|
||||
} => {
|
||||
let offset = to.get_offset_with_state(state);
|
||||
let to_reg = allocs.next(to.get_base_register());
|
||||
let from = allocs.next(from);
|
||||
|
||||
// Vector Stores don't support immediate offsets, so we need to load it into a register.
|
||||
let addr = writable_spilltmp_reg();
|
||||
LoadConstant::U64(offset as u64)
|
||||
.load_constant_and_add(addr, to_reg)
|
||||
.into_iter()
|
||||
.for_each(|inst| inst.emit(&[], sink, emit_info, state));
|
||||
|
||||
let srcloc = state.cur_srcloc();
|
||||
if !srcloc.is_default() && !flags.notrap() {
|
||||
// Register the offset at which the actual store instruction starts.
|
||||
sink.add_trap(TrapCode::HeapOutOfBounds);
|
||||
}
|
||||
|
||||
// This is the mask bit, we don't yet implement masking, so set it to 1, which means
|
||||
// masking disabled.
|
||||
let vm = 1;
|
||||
|
||||
sink.put4(encode_vmem_store(
|
||||
0x27,
|
||||
from,
|
||||
eew,
|
||||
addr.to_reg(),
|
||||
to.sumop(),
|
||||
vm,
|
||||
to.mop(),
|
||||
to.nf(),
|
||||
));
|
||||
}
|
||||
};
|
||||
let end_off = sink.cur_offset();
|
||||
assert!(
|
||||
|
||||
128
cranelift/codegen/src/isa/riscv64/inst/encode.rs
Normal file
128
cranelift/codegen/src/isa/riscv64/inst/encode.rs
Normal file
@@ -0,0 +1,128 @@
|
||||
//! Contains the RISC-V instruction encoding logic.
|
||||
//!
|
||||
//! These formats are specified in the RISC-V specification in section 2.2.
|
||||
//! See: https://riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
|
||||
//!
|
||||
//! Some instructions especially in extensions have slight variations from
|
||||
//! the base RISC-V specification.
|
||||
|
||||
use super::{UImm5, VType};
|
||||
use crate::isa::riscv64::inst::reg_to_gpr_num;
|
||||
use crate::isa::riscv64::lower::isle::generated_code::VecElementWidth;
|
||||
use crate::Reg;
|
||||
|
||||
/// Encode an R-type instruction.
|
||||
///
|
||||
/// Layout:
|
||||
/// 0-------6-7-------11-12------14-15------19-20------24-25-------31
|
||||
/// | Opcode | rd | funct3 | rs1 | rs2 | funct7 |
|
||||
pub fn encode_r_type(opcode: u32, rd: Reg, funct3: u32, rs1: Reg, rs2: Reg, funct7: u32) -> u32 {
|
||||
let mut bits = 0;
|
||||
bits |= opcode & 0b1111111;
|
||||
bits |= reg_to_gpr_num(rd) << 7;
|
||||
bits |= (funct3 & 0b111) << 12;
|
||||
bits |= reg_to_gpr_num(rs1) << 15;
|
||||
bits |= reg_to_gpr_num(rs2) << 20;
|
||||
bits |= (funct7 & 0b1111111) << 25;
|
||||
bits
|
||||
}
|
||||
|
||||
/// Encodes a Vector ALU instruction.
|
||||
///
|
||||
/// Fields:
|
||||
/// - opcode (7 bits)
|
||||
/// - vd (5 bits)
|
||||
/// - funct3 (3 bits)
|
||||
/// - vs1 (5 bits)
|
||||
/// - vs2 (5 bits)
|
||||
/// - vm (1 bit)
|
||||
/// - funct6 (6 bits)
|
||||
///
|
||||
/// See: https://github.com/riscv/riscv-v-spec/blob/master/valu-format.adoc
|
||||
pub fn encode_valu(
|
||||
opcode: u32,
|
||||
vd: Reg,
|
||||
funct3: u32,
|
||||
vs1: Reg,
|
||||
vs2: Reg,
|
||||
vm: u32,
|
||||
funct6: u32,
|
||||
) -> u32 {
|
||||
let funct6 = funct6 & 0b111111;
|
||||
let vm = vm & 0b1;
|
||||
let funct7 = (funct6 << 6) | vm;
|
||||
encode_r_type(opcode, vd, funct3, vs1, vs2, funct7)
|
||||
}
|
||||
|
||||
/// Encodes a Vector CFG Imm instruction.
|
||||
///
|
||||
/// See: https://github.com/riscv/riscv-v-spec/blob/master/vcfg-format.adoc
|
||||
// TODO: Check if this is any of the known instruction types in the spec.
|
||||
pub fn encode_vcfg_imm(opcode: u32, rd: Reg, imm: UImm5, vtype: &VType) -> u32 {
|
||||
let mut bits = 0;
|
||||
bits |= opcode & 0b1111111;
|
||||
bits |= reg_to_gpr_num(rd) << 7;
|
||||
bits |= 0b111 << 12;
|
||||
bits |= (imm.bits() & 0b11111) << 15;
|
||||
bits |= (vtype.encode() & 0b1111111111) << 20;
|
||||
bits |= 0b11 << 30;
|
||||
bits
|
||||
}
|
||||
|
||||
/// Encodes a Vector Mem Unit Stride Load instruction.
|
||||
///
|
||||
/// See: https://github.com/riscv/riscv-v-spec/blob/master/vmem-format.adoc
|
||||
/// TODO: These instructions share opcode space with LOAD-FP and STORE-FP
|
||||
pub fn encode_vmem_load(
|
||||
opcode: u32,
|
||||
vd: Reg,
|
||||
width: VecElementWidth,
|
||||
rs1: Reg,
|
||||
lumop: u32,
|
||||
vm: u32,
|
||||
mop: u32,
|
||||
nf: u32,
|
||||
) -> u32 {
|
||||
// Width is encoded differently to avoid a clash with the FP load/store sizes.
|
||||
let width = match width {
|
||||
VecElementWidth::E8 => 0b000,
|
||||
VecElementWidth::E16 => 0b101,
|
||||
VecElementWidth::E32 => 0b110,
|
||||
VecElementWidth::E64 => 0b111,
|
||||
};
|
||||
|
||||
let mut bits = 0;
|
||||
bits |= opcode & 0b1111111;
|
||||
bits |= reg_to_gpr_num(vd) << 7;
|
||||
bits |= width << 12;
|
||||
bits |= reg_to_gpr_num(rs1) << 15;
|
||||
bits |= (lumop & 0b11111) << 20;
|
||||
bits |= (vm & 0b1) << 25;
|
||||
bits |= (mop & 0b11) << 26;
|
||||
|
||||
// The mew bit (inst[28]) when set is expected to be used to encode expanded
|
||||
// memory sizes of 128 bits and above, but these encodings are currently reserved.
|
||||
bits |= 0b0 << 28;
|
||||
|
||||
bits |= (nf & 0b111) << 29;
|
||||
bits
|
||||
}
|
||||
|
||||
/// Encodes a Vector Mem Unit Stride Store instruction.
///
/// See: https://github.com/riscv/riscv-v-spec/blob/master/vmem-format.adoc
/// TODO: These instructions share opcode space with LOAD-FP and STORE-FP
pub fn encode_vmem_store(
    opcode: u32,
    vs3: Reg,
    width: VecElementWidth,
    rs1: Reg,
    sumop: u32,
    vm: u32,
    mop: u32,
    nf: u32,
) -> u32 {
    // This is pretty much the same as the load instruction, just
    // with different names on the fields.
    encode_vmem_load(opcode, vs3, width, rs1, sumop, vm, mop, nf)
}
|
||||
@@ -99,33 +99,31 @@ impl Display for Imm20 {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct Uimm5 {
|
||||
bits: u8,
|
||||
/// An unsigned 5-bit immediate.
|
||||
#[derive(Clone, Copy, Debug, PartialEq)]
|
||||
pub struct UImm5 {
|
||||
value: u8,
|
||||
}
|
||||
|
||||
impl Uimm5 {
|
||||
pub fn from_bits(bits: u8) -> Self {
|
||||
Self { bits }
|
||||
impl UImm5 {
|
||||
/// Create an unsigned 5-bit immediate from u8.
|
||||
pub fn maybe_from_u8(value: u8) -> Option<UImm5> {
|
||||
if value < 32 {
|
||||
Some(UImm5 { value })
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
/// Create a zero immediate of this format.
|
||||
pub fn zero() -> Self {
|
||||
Self { bits: 0 }
|
||||
}
|
||||
pub fn as_u32(&self) -> u32 {
|
||||
(self.bits as u32) & 0b1_1111
|
||||
|
||||
/// Bits for encoding.
|
||||
pub fn bits(&self) -> u32 {
|
||||
u32::from(self.value)
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for Uimm5 {
|
||||
impl Display for UImm5 {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
||||
write!(f, "{}", self.bits)
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Uimm5 {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
|
||||
write!(f, "{}", self.bits)
|
||||
write!(f, "{}", self.value)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -4,9 +4,10 @@
|
||||
#![allow(dead_code)]
|
||||
#![allow(non_camel_case_types)]
|
||||
|
||||
use super::lower::isle::generated_code::{VecAMode, VecElementWidth};
|
||||
use crate::binemit::{Addend, CodeOffset, Reloc};
|
||||
pub use crate::ir::condcodes::IntCC;
|
||||
use crate::ir::types::{F32, F64, I128, I16, I32, I64, I8, R32, R64};
|
||||
use crate::ir::types::{self, F32, F64, I128, I16, I32, I64, I8, R32, R64};
|
||||
|
||||
pub use crate::ir::{ExternalName, MemFlags, Opcode, SourceLoc, Type, ValueLabel};
|
||||
use crate::isa::CallConv;
|
||||
@@ -29,6 +30,10 @@ pub mod args;
|
||||
pub use self::args::*;
|
||||
pub mod emit;
|
||||
pub use self::emit::*;
|
||||
pub mod vector;
|
||||
pub use self::vector::*;
|
||||
pub mod encode;
|
||||
pub use self::encode::*;
|
||||
pub mod unwind;
|
||||
|
||||
use crate::isa::riscv64::abi::Riscv64MachineDeps;
|
||||
@@ -41,7 +46,7 @@ use std::fmt::{Display, Formatter};
|
||||
pub(crate) type OptionReg = Option<Reg>;
|
||||
pub(crate) type OptionImm12 = Option<Imm12>;
|
||||
pub(crate) type VecBranchTarget = Vec<BranchTarget>;
|
||||
pub(crate) type OptionUimm5 = Option<Uimm5>;
|
||||
pub(crate) type OptionUimm5 = Option<UImm5>;
|
||||
pub(crate) type OptionFloatRoundingMode = Option<FRM>;
|
||||
pub(crate) type VecU8 = Vec<u8>;
|
||||
pub(crate) type VecWritableReg = Vec<Writable<Reg>>;
|
||||
@@ -313,21 +318,41 @@ impl Inst {
|
||||
|
||||
/// Generic constructor for a load (zero-extending where appropriate).
|
||||
pub fn gen_load(into_reg: Writable<Reg>, mem: AMode, ty: Type, flags: MemFlags) -> Inst {
|
||||
Inst::Load {
|
||||
rd: into_reg,
|
||||
op: LoadOP::from_type(ty),
|
||||
from: mem,
|
||||
flags,
|
||||
if ty.is_vector() {
|
||||
Inst::VecLoad {
|
||||
eew: VecElementWidth::from_type(ty),
|
||||
to: into_reg,
|
||||
from: VecAMode::UnitStride { base: mem },
|
||||
flags,
|
||||
vstate: VState::from_type(ty),
|
||||
}
|
||||
} else {
|
||||
Inst::Load {
|
||||
rd: into_reg,
|
||||
op: LoadOP::from_type(ty),
|
||||
from: mem,
|
||||
flags,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Generic constructor for a store.
|
||||
pub fn gen_store(mem: AMode, from_reg: Reg, ty: Type, flags: MemFlags) -> Inst {
|
||||
Inst::Store {
|
||||
src: from_reg,
|
||||
op: StoreOP::from_type(ty),
|
||||
to: mem,
|
||||
flags,
|
||||
if ty.is_vector() {
|
||||
Inst::VecStore {
|
||||
eew: VecElementWidth::from_type(ty),
|
||||
to: VecAMode::UnitStride { base: mem },
|
||||
from: from_reg,
|
||||
flags,
|
||||
vstate: VState::from_type(ty),
|
||||
}
|
||||
} else {
|
||||
Inst::Store {
|
||||
src: from_reg,
|
||||
op: StoreOP::from_type(ty),
|
||||
to: mem,
|
||||
flags,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -623,6 +648,22 @@ fn riscv64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
|
||||
// gen_prologue is called at emit stage.
|
||||
// no need let reg alloc know.
|
||||
}
|
||||
&Inst::VecAluRRR { vd, vs1, vs2, .. } => {
|
||||
collector.reg_use(vs1);
|
||||
collector.reg_use(vs2);
|
||||
collector.reg_def(vd);
|
||||
}
|
||||
&Inst::VecSetState { rd, .. } => {
|
||||
collector.reg_def(rd);
|
||||
}
|
||||
&Inst::VecLoad { to, ref from, .. } => {
|
||||
collector.reg_use(from.get_base_register());
|
||||
collector.reg_def(to);
|
||||
}
|
||||
&Inst::VecStore { ref to, from, .. } => {
|
||||
collector.reg_use(to.get_base_register());
|
||||
collector.reg_use(from);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -727,6 +768,7 @@ impl MachInst for Inst {
|
||||
F32 => Ok((&[RegClass::Float], &[F32])),
|
||||
F64 => Ok((&[RegClass::Float], &[F64])),
|
||||
I128 => Ok((&[RegClass::Int, RegClass::Int], &[I64, I64])),
|
||||
_ if ty.is_vector() && ty.bits() == 128 => Ok((&[RegClass::Float], &[types::I8X16])),
|
||||
_ => Err(CodegenError::Unsupported(format!(
|
||||
"Unexpected SSA-value type: {}",
|
||||
ty
|
||||
@@ -784,6 +826,17 @@ pub fn reg_name(reg: Reg) -> String {
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn vec_reg_name(reg: Reg) -> String {
|
||||
match reg.to_real_reg() {
|
||||
Some(real) => {
|
||||
assert_eq!(real.class(), RegClass::Float);
|
||||
format!("v{}", real.hw_enc())
|
||||
}
|
||||
None => {
|
||||
format!("{:?}", reg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Inst {
|
||||
fn print_with_state(
|
||||
@@ -795,6 +848,16 @@ impl Inst {
|
||||
let reg = allocs.next(reg);
|
||||
reg_name(reg)
|
||||
};
|
||||
let format_vec_reg = |reg: Reg, allocs: &mut AllocationConsumer<'_>| -> String {
|
||||
let reg = allocs.next(reg);
|
||||
vec_reg_name(reg)
|
||||
};
|
||||
|
||||
let format_vec_amode = |amode: &VecAMode, allocs: &mut AllocationConsumer<'_>| -> String {
|
||||
match amode {
|
||||
VecAMode::UnitStride { base } => base.to_string_with_alloc(allocs),
|
||||
}
|
||||
};
|
||||
|
||||
let format_regs = |regs: &[Reg], allocs: &mut AllocationConsumer<'_>| -> String {
|
||||
let mut x = if regs.len() > 1 {
|
||||
@@ -839,6 +902,7 @@ impl Inst {
|
||||
"".into()
|
||||
}
|
||||
}
|
||||
|
||||
match self {
|
||||
&Inst::Nop0 => {
|
||||
format!("##zero length nop")
|
||||
@@ -1501,6 +1565,48 @@ impl Inst {
|
||||
&MInst::Udf { trap_code } => format!("udf##trap_code={}", trap_code),
|
||||
&MInst::EBreak {} => String::from("ebreak"),
|
||||
&MInst::ECall {} => String::from("ecall"),
|
||||
&Inst::VecAluRRR {
|
||||
op,
|
||||
vd,
|
||||
vs1,
|
||||
vs2,
|
||||
ref vstate,
|
||||
} => {
|
||||
let vs1_s = format_vec_reg(vs1, allocs);
|
||||
let vs2_s = format_vec_reg(vs2, allocs);
|
||||
let vd_s = format_vec_reg(vd.to_reg(), allocs);
|
||||
|
||||
// Note: vs2 and vs1 here are opposite to the standard scalar ordering.
|
||||
// This is noted in Section 10.1 of the RISC-V Vector spec.
|
||||
format!("{} {},{},{} {}", op, vd_s, vs2_s, vs1_s, vstate)
|
||||
}
|
||||
&Inst::VecSetState { rd, ref vstate } => {
|
||||
let rd_s = format_reg(rd.to_reg(), allocs);
|
||||
assert!(vstate.avl.is_static());
|
||||
format!("vsetivli {}, {}, {}", rd_s, vstate.avl, vstate.vtype)
|
||||
}
|
||||
Inst::VecLoad {
|
||||
eew,
|
||||
to,
|
||||
from,
|
||||
ref vstate,
|
||||
..
|
||||
} => {
|
||||
let base = format_vec_amode(from, allocs);
|
||||
let vd = format_vec_reg(to.to_reg(), allocs);
|
||||
format!("vl{}.v {},{} {}", eew, vd, base, vstate)
|
||||
}
|
||||
Inst::VecStore {
|
||||
eew,
|
||||
to,
|
||||
from,
|
||||
ref vstate,
|
||||
..
|
||||
} => {
|
||||
let dst = format_vec_amode(to, allocs);
|
||||
let vs3 = format_vec_reg(*from, allocs);
|
||||
format!("vs{}.v {},{} {}", eew, vs3, dst, vstate)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -218,3 +218,13 @@ pub(crate) fn x_reg_range(start: usize, end: usize) -> Vec<Writable<Reg>> {
|
||||
}
|
||||
regs
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn v_reg(enc: usize) -> Reg {
|
||||
let p_reg = PReg::new(enc, RegClass::Float);
|
||||
let v_reg = VReg::new(p_reg.index(), p_reg.class());
|
||||
Reg::from(v_reg)
|
||||
}
|
||||
pub fn vx_reg(enc: usize) -> PReg {
|
||||
PReg::new(enc, RegClass::Float)
|
||||
}
|
||||
|
||||
289
cranelift/codegen/src/isa/riscv64/inst/vector.rs
Normal file
289
cranelift/codegen/src/isa/riscv64/inst/vector.rs
Normal file
@@ -0,0 +1,289 @@
|
||||
use crate::isa::riscv64::inst::EmitState;
|
||||
use crate::isa::riscv64::lower::isle::generated_code::{
|
||||
VecAMode, VecAluOpRRR, VecAvl, VecElementWidth, VecLmul, VecMaskMode, VecTailMode,
|
||||
};
|
||||
use crate::Reg;
|
||||
use core::fmt;
|
||||
|
||||
use super::{Type, UImm5};
|
||||
|
||||
impl VecAvl {
|
||||
pub fn _static(size: u32) -> Self {
|
||||
VecAvl::Static {
|
||||
size: UImm5::maybe_from_u8(size as u8).expect("Invalid size for AVL"),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_static(&self) -> bool {
|
||||
match self {
|
||||
VecAvl::Static { .. } => true,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn unwrap_static(&self) -> UImm5 {
|
||||
match self {
|
||||
VecAvl::Static { size } => *size,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Can we tell ISLE to derive this?
|
||||
impl PartialEq for VecAvl {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
match (self, other) {
|
||||
(VecAvl::Static { size: lhs }, VecAvl::Static { size: rhs }) => lhs == rhs,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for VecAvl {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
VecAvl::Static { size } => write!(f, "{}", size),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl VecElementWidth {
|
||||
pub fn from_type(ty: Type) -> Self {
|
||||
Self::from_bits(ty.lane_bits())
|
||||
}
|
||||
|
||||
pub fn from_bits(bits: u32) -> Self {
|
||||
match bits {
|
||||
8 => VecElementWidth::E8,
|
||||
16 => VecElementWidth::E16,
|
||||
32 => VecElementWidth::E32,
|
||||
64 => VecElementWidth::E64,
|
||||
_ => panic!("Invalid number of bits for VecElementWidth: {}", bits),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn bits(&self) -> u32 {
|
||||
match self {
|
||||
VecElementWidth::E8 => 8,
|
||||
VecElementWidth::E16 => 16,
|
||||
VecElementWidth::E32 => 32,
|
||||
VecElementWidth::E64 => 64,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode(&self) -> u32 {
|
||||
match self {
|
||||
VecElementWidth::E8 => 0b000,
|
||||
VecElementWidth::E16 => 0b001,
|
||||
VecElementWidth::E32 => 0b010,
|
||||
VecElementWidth::E64 => 0b011,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for VecElementWidth {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "e{}", self.bits())
|
||||
}
|
||||
}
|
||||
|
||||
impl VecLmul {
|
||||
pub fn encode(&self) -> u32 {
|
||||
match self {
|
||||
VecLmul::LmulF8 => 0b101,
|
||||
VecLmul::LmulF4 => 0b110,
|
||||
VecLmul::LmulF2 => 0b111,
|
||||
VecLmul::Lmul1 => 0b000,
|
||||
VecLmul::Lmul2 => 0b001,
|
||||
VecLmul::Lmul4 => 0b010,
|
||||
VecLmul::Lmul8 => 0b011,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for VecLmul {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
VecLmul::LmulF8 => write!(f, "mf8"),
|
||||
VecLmul::LmulF4 => write!(f, "mf4"),
|
||||
VecLmul::LmulF2 => write!(f, "mf2"),
|
||||
VecLmul::Lmul1 => write!(f, "m1"),
|
||||
VecLmul::Lmul2 => write!(f, "m2"),
|
||||
VecLmul::Lmul4 => write!(f, "m4"),
|
||||
VecLmul::Lmul8 => write!(f, "m8"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl VecTailMode {
|
||||
pub fn encode(&self) -> u32 {
|
||||
match self {
|
||||
VecTailMode::Agnostic => 1,
|
||||
VecTailMode::Undisturbed => 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for VecTailMode {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
VecTailMode::Agnostic => write!(f, "ta"),
|
||||
VecTailMode::Undisturbed => write!(f, "tu"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl VecMaskMode {
|
||||
pub fn encode(&self) -> u32 {
|
||||
match self {
|
||||
VecMaskMode::Agnostic => 1,
|
||||
VecMaskMode::Undisturbed => 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for VecMaskMode {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
VecMaskMode::Agnostic => write!(f, "ma"),
|
||||
VecMaskMode::Undisturbed => write!(f, "mu"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Vector Type (VType)
|
||||
///
|
||||
/// vtype provides the default type used to interpret the contents of the vector register file.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub struct VType {
|
||||
pub sew: VecElementWidth,
|
||||
pub lmul: VecLmul,
|
||||
pub tail_mode: VecTailMode,
|
||||
pub mask_mode: VecMaskMode,
|
||||
}
|
||||
|
||||
impl VType {
|
||||
// https://github.com/riscv/riscv-v-spec/blob/master/vtype-format.adoc
|
||||
pub fn encode(&self) -> u32 {
|
||||
let mut bits = 0;
|
||||
bits |= self.lmul.encode();
|
||||
bits |= self.sew.encode() << 3;
|
||||
bits |= self.tail_mode.encode() << 6;
|
||||
bits |= self.mask_mode.encode() << 7;
|
||||
bits
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for VType {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{}, {}, {}, {}",
|
||||
self.sew, self.lmul, self.tail_mode, self.mask_mode
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Vector State (VState)
|
||||
///
|
||||
/// VState represents the state of the vector unit that each instruction expects before execution.
|
||||
/// Unlike VType or any of the other types here, VState is not a part of the RISC-V ISA. It is
|
||||
/// used by our instruction emission code to ensure that the vector unit is in the correct state.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub struct VState {
|
||||
pub avl: VecAvl,
|
||||
pub vtype: VType,
|
||||
}
|
||||
|
||||
impl VState {
    /// Derives the vector state needed to operate on values of type `ty`:
    /// AVL = the type's lane count and SEW = the lane width, with LMUL fixed
    /// at 1 and tail/mask handling left agnostic (presumably because nothing
    /// here reads tail or masked-off elements — confirm if masking is added).
    pub fn from_type(ty: Type) -> Self {
        VState {
            avl: VecAvl::_static(ty.lane_count()),
            vtype: VType {
                sew: VecElementWidth::from_type(ty),
                lmul: VecLmul::Lmul1,
                tail_mode: VecTailMode::Agnostic,
                mask_mode: VecMaskMode::Agnostic,
            },
        }
    }
}
|
||||
|
||||
impl fmt::Display for VState {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "#avl={}, #vtype=({})", self.avl, self.vtype)
|
||||
}
|
||||
}
|
||||
|
||||
impl VecAluOpRRR {
|
||||
pub fn opcode(&self) -> u32 {
|
||||
match self {
|
||||
VecAluOpRRR::Vadd => 0x57,
|
||||
}
|
||||
}
|
||||
pub fn funct3(&self) -> u32 {
|
||||
match self {
|
||||
VecAluOpRRR::Vadd => 0b000,
|
||||
}
|
||||
}
|
||||
pub fn funct6(&self) -> u32 {
|
||||
match self {
|
||||
VecAluOpRRR::Vadd => 0b000000,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for VecAluOpRRR {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
VecAluOpRRR::Vadd => write!(f, "vadd.vv"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl VecAMode {
|
||||
pub fn get_base_register(&self) -> Reg {
|
||||
match self {
|
||||
VecAMode::UnitStride { base, .. } => base.get_base_register(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_offset_with_state(&self, state: &EmitState) -> i64 {
|
||||
match self {
|
||||
VecAMode::UnitStride { base, .. } => base.get_offset_with_state(state),
|
||||
}
|
||||
}
|
||||
|
||||
/// `mop` field, described in Table 7 of Section 7.2. Vector Load/Store Addressing Modes
|
||||
/// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#72-vector-loadstore-addressing-modes
|
||||
pub fn mop(&self) -> u32 {
|
||||
match self {
|
||||
VecAMode::UnitStride { .. } => 0b00,
|
||||
}
|
||||
}
|
||||
|
||||
/// `lumop` field, described in Table 9 of Section 7.2. Vector Load/Store Addressing Modes
|
||||
/// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#72-vector-loadstore-addressing-modes
|
||||
pub fn lumop(&self) -> u32 {
|
||||
match self {
|
||||
VecAMode::UnitStride { .. } => 0b00000,
|
||||
}
|
||||
}
|
||||
|
||||
/// `sumop` field, described in Table 10 of Section 7.2. Vector Load/Store Addressing Modes
|
||||
/// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#72-vector-loadstore-addressing-modes
|
||||
pub fn sumop(&self) -> u32 {
|
||||
match self {
|
||||
VecAMode::UnitStride { .. } => 0b00000,
|
||||
}
|
||||
}
|
||||
|
||||
/// The `nf[2:0]` field encodes the number of fields in each segment. For regular vector loads and
|
||||
/// stores, nf=0, indicating that a single value is moved between a vector register group and memory
|
||||
/// at each element position. Larger values in the nf field are used to access multiple contiguous
|
||||
/// fields within a segment as described in Section 7.8 Vector Load/Store Segment Instructions.
|
||||
///
|
||||
/// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#72-vector-loadstore-addressing-modes
|
||||
pub fn nf(&self) -> u32 {
|
||||
match self {
|
||||
VecAMode::UnitStride { .. } => 0b000,
|
||||
}
|
||||
}
|
||||
}
|
||||
132
cranelift/codegen/src/isa/riscv64/inst_vector.isle
Normal file
132
cranelift/codegen/src/isa/riscv64/inst_vector.isle
Normal file
@@ -0,0 +1,132 @@
|
||||
;; Represents the possible widths of an element when used in an operation.
(type VecElementWidth (enum
  (E8)
  (E16)
  (E32)
  (E64)
))

;; Vector Register Group Multiplier (LMUL)
;;
;; The LMUL setting specifies how we should group registers together. LMUL can
;; also be a fractional value, reducing the number of bits used in a single
;; vector register. Fractional LMUL is used to increase the number of effective
;; usable vector register groups when operating on mixed-width values.
(type VecLmul (enum
  (LmulF8)
  (LmulF4)
  (LmulF2)
  (Lmul1)
  (Lmul2)
  (Lmul4)
  (Lmul8)
))

;; Tail Mode
;;
;; The tail mode specifies how the tail elements of a vector register are handled.
(type VecTailMode (enum
  ;; Tail Agnostic means that the tail elements are left in an undefined state.
  (Agnostic)
  ;; Tail Undisturbed means that the tail elements are left in their original values.
  (Undisturbed)
))

;; Mask Mode
;;
;; The mask mode specifies how the masked elements of a vector register are handled.
(type VecMaskMode (enum
  ;; Mask Agnostic means that the masked out elements are left in an undefined state.
  (Agnostic)
  ;; Mask Undisturbed means that the masked out elements are left in their original values.
  (Undisturbed)
))

;; Application Vector Length (AVL)
;;
;; This setting specifies the number of elements that are going to be processed
;; in a single instruction. Note: We may end up processing fewer elements than
;; the AVL setting, if they don't fit in a single register.
(type VecAvl (enum
  ;; Static AVL emits a `vsetivli` that uses a constant value
  (Static (size UImm5))
  ;; TODO: Add a dynamic, register based AVL mode when we are able to properly test it
))

;; Opaque Rust-side types; see `VType` and `VState` in the backend's `inst` module.
(type VType (primitive VType))
(type VState (primitive VState))

;; Register to Register ALU Ops
(type VecAluOpRRR (enum
  (Vadd)
))
|
||||
|
||||
|
||||
|
||||
;; Vector Addressing Mode
(type VecAMode (enum
  ;; Vector unit-stride operations access elements stored contiguously in memory
  ;; starting from the base effective address.
  (UnitStride
    (base AMode))
  ;; TODO: Constant Stride
  ;; TODO: Indexed Operations
))

;; Builds a static VState matching a SIMD type.
;; The VState is guaranteed to be static with AVL set to the number of lanes.
;; Element size is set to the size of the type.
;; LMUL is set to 1.
;; Tail mode is set to agnostic.
;; Mask mode is set to agnostic.
(decl pure vstate_from_type (Type) VState)
(extern constructor vstate_from_type vstate_from_type)
(convert Type VState vstate_from_type)
|
||||
|
||||
;; Extracts an element width from a SIMD type, by matching on its lane type.
;; The four rules are mutually exclusive via the `if-let` lane-type guards.
(decl pure element_width_from_type (Type) VecElementWidth)
(rule (element_width_from_type ty)
  (if-let $I8 (lane_type ty))
  (VecElementWidth.E8))
(rule (element_width_from_type ty)
  (if-let $I16 (lane_type ty))
  (VecElementWidth.E16))
(rule (element_width_from_type ty)
  (if-let $I32 (lane_type ty))
  (VecElementWidth.E32))
(rule (element_width_from_type ty)
  (if-let $I64 (lane_type ty))
  (VecElementWidth.E64))
|
||||
|
||||
;;;; Instruction Helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; As noted in the RISC-V Vector Extension Specification, rs2 is the first
;; source register and rs1 is the second source register. This is the opposite
;; of the usual RISC-V register order.
;; See Section 10.1 of the RISC-V Vector Extension Specification.

;; Helper for emitting `MInst.VecAluRRR` instructions.
;; NOTE(review): the helper takes (vs2, vs1) but passes them positionally to
;; `MInst.VecAluRRR`, whose fields are declared (vd, vs1, vs2) -- confirm this
;; swap is intentional and matches the rs2-first convention noted above.
(decl vec_alu_rrr (VecAluOpRRR Reg Reg VState) Reg)
(rule (vec_alu_rrr op vs2 vs1 vstate)
  (let ((vd WritableReg (temp_writable_reg $I8X16))
        (_ Unit (emit (MInst.VecAluRRR op vd vs2 vs1 vstate))))
    vd))

;; Helper for emitting `MInst.VecLoad` instructions.
(decl vec_load (VecElementWidth VecAMode MemFlags VState) Reg)
(rule (vec_load eew from flags vstate)
  (let ((vd WritableReg (temp_writable_reg $I8X16))
        (_ Unit (emit (MInst.VecLoad eew vd from flags vstate))))
    vd))

;; Helper for emitting `MInst.VecStore` instructions.
(decl vec_store (VecElementWidth VecAMode Reg MemFlags VState) InstOutput)
(rule (vec_store eew to from flags vstate)
  (side_effect
    (SideEffectNoResult.Inst (MInst.VecStore eew to from flags vstate))))

;; Helper for emitting the `vadd.vv` instruction.
(decl rv_vadd_vv (Reg Reg VState) Reg)
(rule (rv_vadd_vv vs2 vs1 vstate)
  (vec_alu_rrr (VecAluOpRRR.Vadd) vs2 vs1 vstate))
|
||||
@@ -53,7 +53,7 @@
|
||||
(rule (match_shnadd (u64_from_imm64 1)) (AluOPRRR.Sh1add))
|
||||
(rule (match_shnadd (u64_from_imm64 2)) (AluOPRRR.Sh2add))
|
||||
(rule (match_shnadd (u64_from_imm64 3)) (AluOPRRR.Sh3add))
|
||||
|
||||
|
||||
(rule 3 (lower (has_type $I64 (iadd x (ishl y (maybe_uextend (iconst n))))))
|
||||
(if-let $true (has_zba))
|
||||
(if-let shnadd (match_shnadd n))
|
||||
@@ -75,7 +75,7 @@
|
||||
(rule (match_shnadd_uw (u64_from_imm64 1)) (AluOPRRR.Sh1adduw))
|
||||
(rule (match_shnadd_uw (u64_from_imm64 2)) (AluOPRRR.Sh2adduw))
|
||||
(rule (match_shnadd_uw (u64_from_imm64 3)) (AluOPRRR.Sh3adduw))
|
||||
|
||||
|
||||
(rule 5 (lower (has_type $I64 (iadd x (ishl (uextend y @ (value_type $I32)) (maybe_uextend (iconst n))))))
|
||||
(if-let $true (has_zba))
|
||||
(if-let shnadd_uw (match_shnadd_uw n))
|
||||
@@ -97,6 +97,11 @@
|
||||
(high Reg (rv_add high_tmp carry)))
|
||||
(value_regs low high)))
|
||||
|
||||
;; SIMD Vectors
|
||||
(rule 8 (lower (has_type (ty_vec128_int ty) (iadd x y)))
|
||||
(if-let $true (has_v))
|
||||
(rv_vadd_vv x y ty))
|
||||
|
||||
;;; Rules for `uadd_overflow_trap` ;;;;;;;;;;;;;
|
||||
(rule
|
||||
(lower (has_type (fits_in_64 ty) (uadd_overflow_trap x y tc)))
|
||||
@@ -374,7 +379,7 @@
|
||||
|
||||
(rule 1 (lower (has_type $I128 (clz x)))
|
||||
(lower_clz_i128 x))
|
||||
|
||||
|
||||
;;;; Rules for `cls` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
(rule (lower (has_type (fits_in_64 ty) (cls x)))
|
||||
(lower_cls ty x))
|
||||
@@ -809,6 +814,12 @@
|
||||
(lower (has_type $I128 (load flags p @ (value_type (ty_addr64 _)) offset)))
|
||||
(gen_load_128 p offset flags))
|
||||
|
||||
(rule 2
|
||||
(lower (has_type (ty_vec128_int ty) (load flags p @ (value_type (ty_addr64 _)) offset)))
|
||||
(if-let $true (has_v))
|
||||
(let ((eew VecElementWidth (element_width_from_type ty)))
|
||||
(vec_load eew (VecAMode.UnitStride (gen_amode p offset $I64)) flags ty)))
|
||||
|
||||
;;;;; Rules for `istore8`;;;;;;;;;
|
||||
(rule
|
||||
(lower (istore8 flags x p @ (value_type (ty_addr64 _)) offset))
|
||||
@@ -833,6 +844,12 @@
|
||||
(lower (store flags x @ (value_type $I128 ) p @ (value_type (ty_addr64 _)) offset))
|
||||
(gen_store_128 p offset flags x))
|
||||
|
||||
(rule 2
|
||||
(lower (store flags x @ (value_type (ty_vec128_int ty)) p @ (value_type (ty_addr64 _)) offset))
|
||||
(if-let $true (has_v))
|
||||
(let ((eew VecElementWidth (element_width_from_type ty)))
|
||||
(vec_store eew (VecAMode.UnitStride (gen_amode p offset $I64)) x flags ty)))
|
||||
|
||||
(decl gen_icmp (IntCC ValueRegs ValueRegs Type) Reg)
|
||||
(rule
|
||||
(gen_icmp cc x y ty)
|
||||
|
||||
@@ -283,6 +283,10 @@ impl generated_code::Context for IsleContext<'_, '_, MInst, Riscv64Backend> {
|
||||
ValueRegs::two(shamt, len_sub_shamt)
|
||||
}
|
||||
|
||||
fn has_v(&mut self) -> bool {
|
||||
self.backend.isa_flags.has_v()
|
||||
}
|
||||
|
||||
fn has_zbkb(&mut self) -> bool {
|
||||
self.backend.isa_flags.has_zbkb()
|
||||
}
|
||||
@@ -428,6 +432,11 @@ impl generated_code::Context for IsleContext<'_, '_, MInst, Riscv64Backend> {
|
||||
rs2,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn vstate_from_type(&mut self, ty: Type) -> VState {
|
||||
VState::from_type(ty)
|
||||
}
|
||||
}
|
||||
|
||||
impl IsleContext<'_, '_, MInst, Riscv64Backend> {
|
||||
|
||||
@@ -288,6 +288,9 @@ pub trait MachInstEmitState<I: VCodeInst>: Default + Clone + Debug {
|
||||
/// Used to continue using a control plane after the emission state is
|
||||
/// not needed anymore.
|
||||
fn take_ctrl_plane(self) -> ControlPlane;
|
||||
/// A hook that triggers when first emitting a new block.
|
||||
/// It is guaranteed to be called before any instructions are emitted.
|
||||
fn on_new_block(&mut self) {}
|
||||
}
|
||||
|
||||
/// The result of a `MachBackend::compile_function()` call. Contains machine
|
||||
|
||||
@@ -843,6 +843,11 @@ impl<I: VCodeInst> VCode<I> {
|
||||
|
||||
for (block_order_idx, &block) in final_order.iter().enumerate() {
|
||||
trace!("emitting block {:?}", block);
|
||||
|
||||
// Call the new block hook for state
|
||||
state.on_new_block();
|
||||
|
||||
// Emit NOPs to align the block.
|
||||
let new_offset = I::align_basic_block(buffer.cur_offset());
|
||||
while new_offset > buffer.cur_offset() {
|
||||
// Pad with NOPs up to the aligned block offset.
|
||||
|
||||
578
cranelift/filetests/filetests/isa/riscv64/simd-abi.clif
Normal file
578
cranelift/filetests/filetests/isa/riscv64/simd-abi.clif
Normal file
@@ -0,0 +1,578 @@
|
||||
test compile precise-output
|
||||
target riscv64 has_v
|
||||
|
||||
;; Tests both ABI and Regalloc spill/reload.
|
||||
function %simd_spill(
|
||||
i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4,
|
||||
i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4,
|
||||
i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4,
|
||||
i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4,
|
||||
;; These cannot fit in registers.
|
||||
i32x4, i32x4
|
||||
) ->
|
||||
i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4,
|
||||
i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4,
|
||||
i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4,
|
||||
i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4, i32x4,
|
||||
;; These cannot fit in registers.
|
||||
i32x4, i32x4 system_v
|
||||
{
|
||||
block0(
|
||||
v0:i32x4, v1:i32x4, v2:i32x4, v3:i32x4, v4:i32x4, v5:i32x4, v6:i32x4, v7:i32x4,
|
||||
v8:i32x4, v9:i32x4, v10:i32x4, v11:i32x4, v12:i32x4, v13:i32x4, v14:i32x4, v15:i32x4,
|
||||
v16:i32x4, v17:i32x4, v18:i32x4, v19:i32x4, v20:i32x4, v21:i32x4, v22:i32x4, v23:i32x4,
|
||||
v24:i32x4, v25:i32x4, v26:i32x4, v27:i32x4, v28:i32x4, v29:i32x4, v30:i32x4, v31:i32x4,
|
||||
v32:i32x4, v33:i32x4
|
||||
):
|
||||
;; This just reverses the args
|
||||
return v33, v32,
|
||||
v31, v30, v29, v28, v27, v26, v25, v24,
|
||||
v23, v22, v21, v20, v19, v18, v17, v16,
|
||||
v15, v14, v13, v12, v11, v10, v9, v8,
|
||||
v7, v6, v5, v4, v3, v2, v1, v0
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; add sp,-16
|
||||
; sd ra,8(sp)
|
||||
; sd fp,0(sp)
|
||||
; mv fp,sp
|
||||
; fsd fs0,-8(sp)
|
||||
; fsd fs2,-16(sp)
|
||||
; fsd fs3,-24(sp)
|
||||
; fsd fs4,-32(sp)
|
||||
; fsd fs5,-40(sp)
|
||||
; fsd fs6,-48(sp)
|
||||
; fsd fs7,-56(sp)
|
||||
; fsd fs8,-64(sp)
|
||||
; fsd fs9,-72(sp)
|
||||
; fsd fs10,-80(sp)
|
||||
; fsd fs11,-88(sp)
|
||||
; add sp,-112
|
||||
; block0:
|
||||
; fsd fa0,0(nominal_sp)
|
||||
; fsd fa1,8(nominal_sp)
|
||||
; vle8.v v28,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v29,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v30,48(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v31,64(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v0,80(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v1,96(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v2,112(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v3,128(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v5,144(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v7,160(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v4,176(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v6,192(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v25,208(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v27,224(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v9,240(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v19,256(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v21,272(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v23,288(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v26,304(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v8,320(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v18,336(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v20,352(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v22,368(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v24,384(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v11,400(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vle8.v v10,416(fp) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v24,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v22,16(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v20,32(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v18,48(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v8,64(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v26,80(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v23,96(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v21,112(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v19,128(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v9,144(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v27,160(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v25,176(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v6,192(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v4,208(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v7,224(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v5,240(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v3,256(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v2,272(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v1,288(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v0,304(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v31,320(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v30,336(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v29,352(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v28,368(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v17,384(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v16,400(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v15,416(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v14,432(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v13,448(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vse8.v v12,464(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; fld fa4,8(nominal_sp)
|
||||
; vse8.v v14,480(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; fld fa7,0(nominal_sp)
|
||||
; vse8.v v17,496(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; add sp,+112
|
||||
; fld fs0,-8(sp)
|
||||
; fld fs2,-16(sp)
|
||||
; fld fs3,-24(sp)
|
||||
; fld fs4,-32(sp)
|
||||
; fld fs5,-40(sp)
|
||||
; fld fs6,-48(sp)
|
||||
; fld fs7,-56(sp)
|
||||
; fld fs8,-64(sp)
|
||||
; fld fs9,-72(sp)
|
||||
; fld fs10,-80(sp)
|
||||
; fld fs11,-88(sp)
|
||||
; ld ra,8(sp)
|
||||
; ld fp,0(sp)
|
||||
; add sp,+16
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; addi sp, sp, -0x10
|
||||
; sd ra, 8(sp)
|
||||
; sd s0, 0(sp)
|
||||
; ori s0, sp, 0
|
||||
; fsd fs0, -8(sp)
|
||||
; fsd fs2, -0x10(sp)
|
||||
; fsd fs3, -0x18(sp)
|
||||
; fsd fs4, -0x20(sp)
|
||||
; fsd fs5, -0x28(sp)
|
||||
; fsd fs6, -0x30(sp)
|
||||
; fsd fs7, -0x38(sp)
|
||||
; fsd fs8, -0x40(sp)
|
||||
; fsd fs9, -0x48(sp)
|
||||
; fsd fs10, -0x50(sp)
|
||||
; fsd fs11, -0x58(sp)
|
||||
; addi sp, sp, -0x70
|
||||
; block1: ; offset 0x40
|
||||
; fsd fa0, 0(sp)
|
||||
; fsd fa1, 8(sp)
|
||||
; .byte 0x57, 0x70, 0x08, 0xcc
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x10, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x07, 0x8e, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x20, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x87, 0x8e, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x30, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x07, 0x8f, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x40, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x87, 0x8f, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x50, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x07, 0x80, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x60, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x87, 0x80, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x70, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x07, 0x81, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x80, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x87, 0x81, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x90, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x87, 0x82, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xa0, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x87, 0x83, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xb0, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x07, 0x82, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xc0, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x07, 0x83, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xd0, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x87, 0x8c, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xe0, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x87, 0x8d, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xf0, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x87, 0x84, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x00, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x87, 0x89, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x10, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x87, 0x8a, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x20, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x87, 0x8b, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x30, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x07, 0x8d, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x40, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x07, 0x84, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x50, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x07, 0x89, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x60, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x07, 0x8a, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x70, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x07, 0x8b, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x80, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x07, 0x8c, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x90, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x87, 0x85, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xa0, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, s0
|
||||
; .byte 0x07, 0x85, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0x8c, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x10, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0x8b, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x20, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0x8a, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x30, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0x89, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x40, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0x84, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x50, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0x8d, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x60, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0xa7, 0x8b, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x70, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0xa7, 0x8a, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x80, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0xa7, 0x89, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x90, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0xa7, 0x84, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xa0, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0xa7, 0x8d, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xb0, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0xa7, 0x8c, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xc0, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0x83, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xd0, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0x82, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xe0, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0xa7, 0x83, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xf0, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0xa7, 0x82, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x00, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0xa7, 0x81, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x10, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0x81, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x20, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0xa7, 0x80, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x30, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0x80, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x40, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0xa7, 0x8f, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x50, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0x8f, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x60, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0xa7, 0x8e, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x70, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0x8e, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x80, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0xa7, 0x88, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x90, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0x88, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xa0, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0xa7, 0x87, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xb0, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0x87, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xc0, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0xa7, 0x86, 0x0f, 0x02
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xd0, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0x86, 0x0f, 0x02
|
||||
; fld fa4, 8(sp)
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xe0, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0x87, 0x0f, 0x02
|
||||
; fld fa7, 0(sp)
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0xf0, 0x01, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0xa7, 0x88, 0x0f, 0x02
|
||||
; addi sp, sp, 0x70
|
||||
; fld fs0, -8(sp)
|
||||
; fld fs2, -0x10(sp)
|
||||
; fld fs3, -0x18(sp)
|
||||
; fld fs4, -0x20(sp)
|
||||
; fld fs5, -0x28(sp)
|
||||
; fld fs6, -0x30(sp)
|
||||
; fld fs7, -0x38(sp)
|
||||
; fld fs8, -0x40(sp)
|
||||
; fld fs9, -0x48(sp)
|
||||
; fld fs10, -0x50(sp)
|
||||
; fld fs11, -0x58(sp)
|
||||
; ld ra, 8(sp)
|
||||
; ld s0, 0(sp)
|
||||
; addi sp, sp, 0x10
|
||||
; ret
|
||||
|
||||
73
cranelift/filetests/filetests/isa/riscv64/simd-iadd.clif
Normal file
73
cranelift/filetests/filetests/isa/riscv64/simd-iadd.clif
Normal file
@@ -0,0 +1,73 @@
|
||||
test compile precise-output
|
||||
set unwind_info=false
|
||||
target riscv64 has_v
|
||||
|
||||
|
||||
function %iadd_i8x16(i8x16, i8x16) -> i8x16 {
|
||||
block0(v0: i8x16, v1: i8x16):
|
||||
v2 = iadd v0, v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; block0:
|
||||
; vadd.vv v10,v11,v10 #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; .byte 0x57, 0x70, 0x08, 0xcc
|
||||
; .byte 0x57, 0x05, 0xb5, 0x02
|
||||
; ret
|
||||
|
||||
function %iadd_i16x8(i16x8, i16x8) -> i16x8 {
|
||||
block0(v0: i16x8, v1: i16x8):
|
||||
v2 = iadd v0, v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; block0:
|
||||
; vadd.vv v10,v11,v10 #avl=8, #vtype=(e16, m1, ta, ma)
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; .byte 0x57, 0x70, 0x84, 0xcc
|
||||
; .byte 0x57, 0x05, 0xb5, 0x02
|
||||
; ret
|
||||
|
||||
function %iadd_i32x4(i32x4, i32x4) -> i32x4 {
|
||||
block0(v0: i32x4, v1: i32x4):
|
||||
v2 = iadd v0, v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; block0:
|
||||
; vadd.vv v10,v11,v10 #avl=4, #vtype=(e32, m1, ta, ma)
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; .byte 0x57, 0x70, 0x02, 0xcd
|
||||
; .byte 0x57, 0x05, 0xb5, 0x02
|
||||
; ret
|
||||
|
||||
function %iadd_i64x2(i64x2, i64x2) -> i64x2 {
|
||||
block0(v0: i64x2, v1: i64x2):
|
||||
v2 = iadd v0, v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; block0:
|
||||
; vadd.vv v10,v11,v10 #avl=2, #vtype=(e64, m1, ta, ma)
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; .byte 0x57, 0x70, 0x81, 0xcd
|
||||
; .byte 0x57, 0x05, 0xb5, 0x02
|
||||
; ret
|
||||
|
||||
97
cranelift/filetests/filetests/isa/riscv64/simd-loads.clif
Normal file
97
cranelift/filetests/filetests/isa/riscv64/simd-loads.clif
Normal file
@@ -0,0 +1,97 @@
|
||||
test compile precise-output
|
||||
set unwind_info=false
|
||||
target riscv64 has_v
|
||||
|
||||
|
||||
function %load_i8x16(i64) -> i8x16 {
|
||||
block0(v0: i64):
|
||||
v1 = load.i8x16 v0
|
||||
return v1
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; block0:
|
||||
; vle8.v v10,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; .byte 0x57, 0x70, 0x08, 0xcc
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x07, 0x85, 0x0f, 0x02
|
||||
; ret
|
||||
|
||||
function %load_i16x8(i64) -> i16x8 {
|
||||
block0(v0: i64):
|
||||
v1 = load.i16x8 v0
|
||||
return v1
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; block0:
|
||||
; vle16.v v10,0(a0) #avl=8, #vtype=(e16, m1, ta, ma)
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; .byte 0x57, 0x70, 0x84, 0xcc
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x07, 0xd5, 0x0f, 0x02
|
||||
; ret
|
||||
|
||||
function %load_i32x4(i64) -> i32x4 {
|
||||
block0(v0: i64):
|
||||
v1 = load.i32x4 v0
|
||||
return v1
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; block0:
|
||||
; vle32.v v10,0(a0) #avl=4, #vtype=(e32, m1, ta, ma)
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; .byte 0x57, 0x70, 0x02, 0xcd
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x07, 0xe5, 0x0f, 0x02
|
||||
; ret
|
||||
|
||||
function %load_i64x2(i64) -> i64x2 {
|
||||
block0(v0: i64):
|
||||
v1 = load.i64x2 v0
|
||||
return v1
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; block0:
|
||||
; vle64.v v10,0(a0) #avl=2, #vtype=(e64, m1, ta, ma)
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; .byte 0x57, 0x70, 0x81, 0xcd
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x07, 0xf5, 0x0f, 0x02
|
||||
; ret
|
||||
|
||||
97
cranelift/filetests/filetests/isa/riscv64/simd-stores.clif
Normal file
97
cranelift/filetests/filetests/isa/riscv64/simd-stores.clif
Normal file
@@ -0,0 +1,97 @@
|
||||
test compile precise-output
|
||||
set unwind_info=false
|
||||
target riscv64 has_v
|
||||
|
||||
|
||||
function %store_i8x16(i64, i8x16) {
|
||||
block0(v0: i64, v1: i8x16):
|
||||
store.i8x16 v1, v0
|
||||
return
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; block0:
|
||||
; vse8.v v10,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; .byte 0x57, 0x70, 0x08, 0xcc
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0x85, 0x0f, 0x02
|
||||
; ret
|
||||
|
||||
function %store_i16x8(i64, i16x8) {
|
||||
block0(v0: i64, v1: i16x8):
|
||||
store.i16x8 v1, v0
|
||||
return
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; block0:
|
||||
; vse16.v v10,0(a0) #avl=8, #vtype=(e16, m1, ta, ma)
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; .byte 0x57, 0x70, 0x84, 0xcc
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0xd5, 0x0f, 0x02
|
||||
; ret
|
||||
|
||||
function %store_i32x4(i64, i32x4) {
|
||||
block0(v0: i64, v1: i32x4):
|
||||
store.i32x4 v1, v0
|
||||
return
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; block0:
|
||||
; vse32.v v10,0(a0) #avl=4, #vtype=(e32, m1, ta, ma)
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; .byte 0x57, 0x70, 0x02, 0xcd
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0xe5, 0x0f, 0x02
|
||||
; ret
|
||||
|
||||
function %store_i64x2(i64, i64x2) {
|
||||
block0(v0: i64, v1: i64x2):
|
||||
store.i64x2 v1, v0
|
||||
return
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; block0:
|
||||
; vse64.v v10,0(a0) #avl=2, #vtype=(e64, m1, ta, ma)
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; .byte 0x57, 0x70, 0x81, 0xcd
|
||||
; auipc t6, 0
|
||||
; ld t6, 0xc(t6)
|
||||
; j 0xc
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; .byte 0x00, 0x00, 0x00, 0x00
|
||||
; add t6, t6, a0
|
||||
; .byte 0x27, 0xf5, 0x0f, 0x02
|
||||
; ret
|
||||
|
||||
68
cranelift/filetests/filetests/isa/riscv64/simd-vstate.clif
Normal file
68
cranelift/filetests/filetests/isa/riscv64/simd-vstate.clif
Normal file
@@ -0,0 +1,68 @@
|
||||
test compile precise-output
|
||||
set unwind_info=false
|
||||
target riscv64 has_v
|
||||
|
||||
;; Interleaves vector operations to ensure that `vsetivli` is emitted
|
||||
function %iadd_multi(i8x16, i16x8) -> i8x16, i16x8 {
|
||||
block0(v0: i8x16, v1: i16x8):
|
||||
v4 = iadd v0, v0
|
||||
v5 = iadd v1, v1
|
||||
v6 = iadd v5, v5
|
||||
return v4, v6
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; block0:
|
||||
; vadd.vv v10,v10,v10 #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; vadd.vv v5,v11,v11 #avl=8, #vtype=(e16, m1, ta, ma)
|
||||
; vadd.vv v11,v5,v5 #avl=8, #vtype=(e16, m1, ta, ma)
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; .byte 0x57, 0x70, 0x08, 0xcc
|
||||
; .byte 0x57, 0x05, 0xa5, 0x02
|
||||
; .byte 0x57, 0x70, 0x84, 0xcc
|
||||
; .byte 0xd7, 0x82, 0xb5, 0x02
|
||||
; .byte 0xd7, 0x85, 0x52, 0x02
|
||||
; ret
|
||||
|
||||
;; When the block changes, we need to reemit the vector state instruction
|
||||
;; Even if vtype is the same.
|
||||
function %(i8x16, i8x16) -> i8x16 {
|
||||
block0(v0: i8x16, v1: i8x16):
|
||||
v2 = iadd v0, v1
|
||||
jump block1(v1, v2)
|
||||
|
||||
block1(v3: i8x16, v4: i8x16):
|
||||
v5 = iadd v3, v4
|
||||
jump block2(v4, v5)
|
||||
|
||||
block2(v6: i8x16, v7: i8x16):
|
||||
v8 = iadd v6, v7
|
||||
return v8
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; block0:
|
||||
; vadd.vv v5,v11,v10 #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; j label1
|
||||
; block1:
|
||||
; vadd.vv v6,v5,v11 #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; j label2
|
||||
; block2:
|
||||
; vadd.vv v10,v6,v5 #avl=16, #vtype=(e8, m1, ta, ma)
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; .byte 0x57, 0x70, 0x08, 0xcc
|
||||
; .byte 0xd7, 0x02, 0xb5, 0x02
|
||||
; block1: ; offset 0x8
|
||||
; .byte 0x57, 0x70, 0x08, 0xcc
|
||||
; .byte 0x57, 0x83, 0x55, 0x02
|
||||
; block2: ; offset 0x10
|
||||
; .byte 0x57, 0x70, 0x08, 0xcc
|
||||
; .byte 0x57, 0x85, 0x62, 0x02
|
||||
; ret
|
||||
|
||||
@@ -6,21 +6,6 @@ set enable_simd
|
||||
target x86_64
|
||||
target x86_64 skylake
|
||||
|
||||
function %iadd_i32x4(i32x4, i32x4) -> i32x4 {
|
||||
block0(v0:i32x4, v1:i32x4):
|
||||
v2 = iadd v0, v1
|
||||
return v2
|
||||
}
|
||||
; run: %iadd_i32x4([1 1 1 1], [1 2 3 4]) == [2 3 4 5]
|
||||
|
||||
function %iadd_i8x16_with_overflow() -> i8x16 {
|
||||
block0:
|
||||
v0 = vconst.i8x16 [255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255]
|
||||
v1 = vconst.i8x16 [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
|
||||
v2 = iadd v0, v1
|
||||
return v2
|
||||
}
|
||||
; run: %iadd_i8x16_with_overflow() == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
|
||||
|
||||
function %isub_i32x4(i32x4, i32x4) -> i32x4 {
|
||||
block0(v0: i32x4, v1: i32x4):
|
||||
|
||||
44
cranelift/filetests/filetests/runtests/simd-iadd.clif
Normal file
44
cranelift/filetests/filetests/runtests/simd-iadd.clif
Normal file
@@ -0,0 +1,44 @@
|
||||
test interpret
|
||||
test run
|
||||
target aarch64
|
||||
target s390x
|
||||
set enable_simd
|
||||
target x86_64
|
||||
target x86_64 skylake
|
||||
target riscv64 has_v
|
||||
|
||||
|
||||
function %iadd_i8x16(i8x16, i8x16) -> i8x16 {
|
||||
block0(v0:i8x16, v1:i8x16):
|
||||
v2 = iadd v0, v1
|
||||
return v2
|
||||
}
|
||||
; run: %iadd_i8x16([1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17]
|
||||
; run: %iadd_i8x16([2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2], [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]) == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
|
||||
|
||||
|
||||
function %iadd_i16x8(i16x8, i16x8) -> i16x8 {
|
||||
block0(v0:i16x8, v1:i16x8):
|
||||
v2 = iadd v0, v1
|
||||
return v2
|
||||
}
|
||||
; run: %iadd_i16x8([1 1 1 1 1 1 1 1], [1 2 3 4 5 6 7 8]) == [2 3 4 5 6 7 8 9]
|
||||
; run: %iadd_i16x8([2 2 2 2 2 2 2 2], [-1 -1 -1 -1 -1 -1 -1 -1]) == [1 1 1 1 1 1 1 1]
|
||||
|
||||
|
||||
function %iadd_i32x4(i32x4, i32x4) -> i32x4 {
|
||||
block0(v0:i32x4, v1:i32x4):
|
||||
v2 = iadd v0, v1
|
||||
return v2
|
||||
}
|
||||
; run: %iadd_i32x4([1 1 1 1], [1 2 3 4]) == [2 3 4 5]
|
||||
; run: %iadd_i32x4([2 2 2 2], [-1 -1 -1 -1]) == [1 1 1 1]
|
||||
|
||||
|
||||
function %iadd_i64x2(i64x2, i64x2) -> i64x2 {
|
||||
block0(v0:i64x2, v1:i64x2):
|
||||
v2 = iadd v0, v1
|
||||
return v2
|
||||
}
|
||||
; run: %iadd_i64x2([1 1], [1 2]) == [2 3]
|
||||
; run: %iadd_i64x2([2 2], [-1 -1]) == [1 1]
|
||||
Reference in New Issue
Block a user