[RFC] Dynamic Vector Support (#4200)

Introduce a new concept in the IR that allows a producer to create
dynamic vector types. An IR function can now contain global values
that represent a dynamic scaling factor for a given fixed-width
vector type. A dynamic type is then created by 'multiplying' the
corresponding global value with a fixed-width type. These new types
can be used just like the existing types, and the type system has a
set of hard-coded dynamic types, such as I32X4XN, which the
user-defined types map onto. The dynamic types are also used
explicitly to create dynamic stack slots, which, unlike their
existing counterparts, have no fixed size. New IR instructions are
added to access these new stack entities.

Currently, during codegen, the dynamic scaling factor has to be
lowered to a constant, so dynamic slots (and spill slots) do
eventually have a compile-time-known size.

The current lowering for aarch64 just targets Neon, using a dynamic
scale of 1.
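
For illustration, a minimal sketch of the producer-side flow,
mirroring the `dynamic_stack_slot` test added in this commit (paths
assume code inside `cranelift-codegen`):

    use crate::ir::types::I32X4;
    use crate::ir::{DynamicStackSlotData, DynamicTypeData, Function, GlobalValueData, StackSlotKind};

    let mut func = Function::new();

    // A global value giving how many `i32x4` vectors fit in a target
    // vector register.
    let gv0 = func.create_global_value(GlobalValueData::DynScaleTargetConst {
        vector_type: I32X4,
    });

    // 'Multiply' the scale into the fixed-width base type to declare a
    // dynamic type, which maps onto the hard-coded I32X4XN.
    let dt0 = func.dfg.make_dynamic_ty(DynamicTypeData::new(I32X4, gv0));

    // Dynamic stack slots are keyed by a dynamic type rather than a byte size.
    let dss0 = func.create_dynamic_stack_slot(DynamicStackSlotData::new(
        StackSlotKind::ExplicitDynamicSlot,
        dt0,
    ));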

Copyright (c) 2022, Arm Limited.
Sam Parker
2022-07-07 20:54:39 +01:00
committed by GitHub
parent 9ae060a12a
commit 9c43749dfe
69 changed files with 2422 additions and 294 deletions

View File

@@ -3,12 +3,13 @@
use crate::entity::{self, PrimaryMap, SecondaryMap};
use crate::ir;
use crate::ir::builder::ReplaceBuilder;
use crate::ir::dynamic_type::{DynamicTypeData, DynamicTypes};
use crate::ir::extfunc::ExtFuncData;
use crate::ir::instructions::{BranchInfo, CallInfo, InstructionData};
use crate::ir::{types, ConstantData, ConstantPool, Immediate};
use crate::ir::{
Block, FuncRef, Inst, SigRef, Signature, SourceLoc, Type, Value, ValueLabelAssignments,
ValueList, ValueListPool,
Block, DynamicType, FuncRef, Inst, SigRef, Signature, SourceLoc, Type, Value,
ValueLabelAssignments, ValueList, ValueListPool,
};
use crate::packed_option::ReservedValue;
use crate::write::write_operands;
@@ -50,6 +51,9 @@ pub struct DataFlowGraph {
/// instructions contained in each block.
blocks: PrimaryMap<Block, BlockData>,
/// Dynamic types created.
pub dynamic_types: DynamicTypes,
/// Memory pool of value lists.
///
/// The `ValueList` references into this pool appear in many places:
@@ -89,6 +93,7 @@ impl DataFlowGraph {
insts: PrimaryMap::new(),
results: SecondaryMap::new(),
blocks: PrimaryMap::new(),
dynamic_types: DynamicTypes::new(),
value_lists: ValueListPool::new(),
values: PrimaryMap::new(),
signatures: PrimaryMap::new(),
@@ -105,6 +110,7 @@ impl DataFlowGraph {
self.insts.clear();
self.results.clear();
self.blocks.clear();
self.dynamic_types.clear();
self.value_lists.clear();
self.values.clear();
self.signatures.clear();
@@ -557,6 +563,11 @@ impl DataFlowGraph {
self.insts.push(data)
}
/// Declares a dynamic vector type.
pub fn make_dynamic_ty(&mut self, data: DynamicTypeData) -> DynamicType {
self.dynamic_types.push(data)
}
/// Returns an object that displays `inst`.
pub fn display_inst<'a>(&'a self, inst: Inst) -> DisplayInst<'a> {
DisplayInst(self, inst)
@@ -1104,6 +1115,20 @@ impl DataFlowGraph {
self.values[v].set_type(t);
}
/// Check that the given concrete `Type` has been defined in the function, returning it if so.
pub fn check_dynamic_type(&mut self, ty: Type) -> Option<Type> {
debug_assert!(ty.is_dynamic_vector());
if self
.dynamic_types
.values()
.any(|dyn_ty_data| dyn_ty_data.concrete().unwrap() == ty)
{
Some(ty)
} else {
None
}
}
/// Create result values for `inst`, reusing the provided detached values.
/// This is similar to `make_inst_results_reusing` except it's only for use
/// in the parser, which needs to reuse previously invalid values.
@@ -1130,6 +1155,10 @@ impl DataFlowGraph {
let constraints = self.insts[inst].opcode().constraints();
for res_idx in 0..constraints.num_fixed_results() {
let ty = constraints.result_type(res_idx, ctrl_typevar);
if ty.is_dynamic_vector() {
self.check_dynamic_type(ty)
.unwrap_or_else(|| panic!("Use of undeclared dynamic type: {}", ty));
}
if let Some(v) = reuse.get(res_idx) {
self.set_value_type_for_parser(*v, ty);
}

View File

@@ -0,0 +1,38 @@
//! Dynamic IR types
use crate::ir::entities::DynamicType;
use crate::ir::GlobalValue;
use crate::ir::PrimaryMap;
use crate::ir::Type;
#[cfg(feature = "enable-serde")]
use serde::{Deserialize, Serialize};
/// A dynamic type object which has a base vector type and a scaling factor.
#[derive(Clone)]
#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
pub struct DynamicTypeData {
/// Base vector type; this is the minimum size of the type.
pub base_vector_ty: Type,
/// The dynamic scaling factor of the base vector type.
pub dynamic_scale: GlobalValue,
}
impl DynamicTypeData {
/// Create a new dynamic type.
pub fn new(base_vector_ty: Type, dynamic_scale: GlobalValue) -> Self {
assert!(base_vector_ty.is_vector());
Self {
base_vector_ty,
dynamic_scale,
}
}
/// Convert `base_vector_ty` into a concrete dynamic vector type.
pub fn concrete(&self) -> Option<Type> {
self.base_vector_ty.vector_to_dynamic()
}
}
/// All allocated dynamic types.
pub type DynamicTypes = PrimaryMap<DynamicType, DynamicTypeData>;
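
A small usage sketch, where `gv0` is assumed to be a `dyn_scale`
global value created via `GlobalValueData::DynScaleTargetConst`;
`concrete()` maps the base vector type onto one of the hard-coded
dynamic types:

    let dtd = DynamicTypeData::new(I32X4, gv0);
    assert_eq!(dtd.concrete(), Some(I32X4XN));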

View File

@@ -135,6 +135,44 @@ impl StackSlot {
}
}
/// An opaque reference to a dynamic stack slot.
#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
pub struct DynamicStackSlot(u32);
entity_impl!(DynamicStackSlot, "dss");
impl DynamicStackSlot {
/// Create a new stack slot reference from its number.
///
/// This method is for use by the parser.
pub fn with_number(n: u32) -> Option<Self> {
if n < u32::MAX {
Some(Self(n))
} else {
None
}
}
}
/// An opaque reference to a dynamic type.
#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
pub struct DynamicType(u32);
entity_impl!(DynamicType, "dt");
impl DynamicType {
/// Create a new dynamic type reference from its number.
///
/// This method is for use by the parser.
pub fn with_number(n: u32) -> Option<Self> {
if n < u32::MAX {
Some(Self(n))
} else {
None
}
}
}
/// An opaque reference to a global value.
///
/// A `GlobalValue` is a [`Value`](Value) that will be live across the entire
@@ -389,6 +427,10 @@ pub enum AnyEntity {
Value(Value),
/// A stack slot.
StackSlot(StackSlot),
/// A dynamic stack slot.
DynamicStackSlot(DynamicStackSlot),
/// A dynamic type
DynamicType(DynamicType),
/// A Global value.
GlobalValue(GlobalValue),
/// A jump table.
@@ -415,6 +457,8 @@ impl fmt::Display for AnyEntity {
Self::Inst(r) => r.fmt(f),
Self::Value(r) => r.fmt(f),
Self::StackSlot(r) => r.fmt(f),
Self::DynamicStackSlot(r) => r.fmt(f),
Self::DynamicType(r) => r.fmt(f),
Self::GlobalValue(r) => r.fmt(f),
Self::JumpTable(r) => r.fmt(f),
Self::Constant(r) => r.fmt(f),
@@ -457,6 +501,18 @@ impl From<StackSlot> for AnyEntity {
}
}
impl From<DynamicStackSlot> for AnyEntity {
fn from(r: DynamicStackSlot) -> Self {
Self::DynamicStackSlot(r)
}
}
impl From<DynamicType> for AnyEntity {
fn from(r: DynamicType) -> Self {
Self::DynamicType(r)
}
}
impl From<GlobalValue> for AnyEntity {
fn from(r: GlobalValue) -> Self {
Self::GlobalValue(r)

View File

@@ -7,12 +7,12 @@ use crate::entity::{PrimaryMap, SecondaryMap};
use crate::ir;
use crate::ir::JumpTables;
use crate::ir::{
instructions::BranchInfo, Block, ExtFuncData, FuncRef, GlobalValue, GlobalValueData, Heap,
HeapData, Inst, InstructionData, JumpTable, JumpTableData, Opcode, SigRef, StackSlot,
StackSlotData, Table, TableData,
instructions::BranchInfo, Block, DynamicStackSlot, DynamicStackSlotData, DynamicType,
ExtFuncData, FuncRef, GlobalValue, GlobalValueData, Heap, HeapData, Inst, InstructionData,
JumpTable, JumpTableData, Opcode, SigRef, StackSlot, StackSlotData, Table, TableData, Type,
};
use crate::ir::{DataFlowGraph, ExternalName, Layout, Signature};
use crate::ir::{SourceLocs, StackSlots};
use crate::ir::{DynamicStackSlots, SourceLocs, StackSlots};
use crate::isa::CallConv;
use crate::value_label::ValueLabelsRanges;
use crate::write::write_function;
@@ -78,8 +78,11 @@ pub struct Function {
/// Signature of this function.
pub signature: Signature,
/// Stack slots allocated in this function.
pub stack_slots: StackSlots,
/// Sized stack slots allocated in this function.
pub sized_stack_slots: StackSlots,
/// Dynamic stack slots allocated in this function.
pub dynamic_stack_slots: DynamicStackSlots,
/// Global values referenced.
pub global_values: PrimaryMap<ir::GlobalValue, ir::GlobalValueData>,
@@ -120,7 +123,8 @@ impl Function {
version_marker: VersionMarker,
name,
signature: sig,
stack_slots: StackSlots::new(),
sized_stack_slots: StackSlots::new(),
dynamic_stack_slots: DynamicStackSlots::new(),
global_values: PrimaryMap::new(),
heaps: PrimaryMap::new(),
tables: PrimaryMap::new(),
@@ -135,7 +139,8 @@ impl Function {
/// Clear all data structures in this function.
pub fn clear(&mut self) {
self.signature.clear(CallConv::Fast);
self.stack_slots.clear();
self.sized_stack_slots.clear();
self.dynamic_stack_slots.clear();
self.global_values.clear();
self.heaps.clear();
self.tables.clear();
@@ -156,10 +161,16 @@ impl Function {
self.jump_tables.push(data)
}
/// Creates a stack slot in the function, to be used by `stack_load`, `stack_store` and
/// `stack_addr` instructions.
pub fn create_stack_slot(&mut self, data: StackSlotData) -> StackSlot {
self.stack_slots.push(data)
/// Creates a sized stack slot in the function, to be used by `stack_load`, `stack_store`
/// and `stack_addr` instructions.
pub fn create_sized_stack_slot(&mut self, data: StackSlotData) -> StackSlot {
self.sized_stack_slots.push(data)
}
/// Creates a dynamic stack slot in the function, to be used by `dynamic_stack_load`,
/// `dynamic_stack_store` and `dynamic_stack_addr` instructions.
pub fn create_dynamic_stack_slot(&mut self, data: DynamicStackSlotData) -> DynamicStackSlot {
self.dynamic_stack_slots.push(data)
}
/// Adds a signature which can later be used to declare an external function import.
@@ -177,6 +188,26 @@ impl Function {
self.global_values.push(data)
}
/// Find the global `dyn_scale` value associated with the given `DynamicType`.
pub fn get_dyn_scale(&self, ty: DynamicType) -> GlobalValue {
self.dfg.dynamic_types.get(ty).unwrap().dynamic_scale
}
/// Find the global dyn_scale for the given stack slot.
pub fn get_dynamic_slot_scale(&self, dss: DynamicStackSlot) -> GlobalValue {
let dyn_ty = self.dynamic_stack_slots.get(dss).unwrap().dyn_ty;
self.get_dyn_scale(dyn_ty)
}
/// Get a concrete `Type` from a user defined `DynamicType`.
pub fn get_concrete_dynamic_ty(&self, ty: DynamicType) -> Option<Type> {
self.dfg
.dynamic_types
.get(ty)
.unwrap_or_else(|| panic!("Undeclared dynamic vector type: {}", ty))
.concrete()
}
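
A usage sketch of these helpers, reusing `gv0`, `dt0` and `dss0` from
the example in the commit message above:

    assert_eq!(func.get_dyn_scale(dt0), gv0);
    assert_eq!(func.get_dynamic_slot_scale(dss0), gv0);
    assert_eq!(func.get_concrete_dynamic_ty(dt0), Some(I32X4XN));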
/// Declares a heap accessible to the function.
pub fn create_heap(&mut self, data: HeapData) -> Heap {
self.heaps.push(data)
@@ -322,8 +353,8 @@ impl Function {
/// Size occupied by all stack slots associated with this function.
///
/// Does not include any padding necessary due to offsets
pub fn stack_size(&self) -> u32 {
self.stack_slots.values().map(|ss| ss.size).sum()
pub fn fixed_stack_size(&self) -> u32 {
self.sized_stack_slots.values().map(|ss| ss.size).sum()
}
}

View File

@@ -76,6 +76,13 @@ pub enum GlobalValueData {
/// Does this symbol refer to a thread local storage value?
tls: bool,
},
/// Value is a multiple of the number of instances of `vector_type`
/// that will fit in a target vector register.
DynScaleTargetConst {
/// Base vector type.
vector_type: Type,
},
}
impl GlobalValueData {
@@ -92,6 +99,7 @@ impl GlobalValueData {
match *self {
Self::VMContext { .. } | Self::Symbol { .. } => isa.pointer_type(),
Self::IAddImm { global_type, .. } | Self::Load { global_type, .. } => global_type,
Self::DynScaleTargetConst { .. } => isa.pointer_type(),
}
}
@@ -154,6 +162,9 @@ impl fmt::Display for GlobalValueData {
}
Ok(())
}
Self::DynScaleTargetConst { vector_type } => {
write!(f, "dyn_scale_target_const.{}", vector_type)
}
}
}
}
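
So, as a sketch, a scale for an `f64x2` base type prints as:

    let gv_data = GlobalValueData::DynScaleTargetConst { vector_type: F64X2 };
    assert_eq!(gv_data.to_string(), "dyn_scale_target_const.f64x2");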

View File

@@ -633,6 +633,8 @@ pub struct ValueTypeSet {
pub bools: BitSet8,
/// Allowed ref widths
pub refs: BitSet8,
/// Allowed dynamic vectors minimum lane sizes
pub dynamic_lanes: BitSet16,
}
impl ValueTypeSet {
@@ -656,8 +658,13 @@ impl ValueTypeSet {
/// Does `typ` belong to this set?
pub fn contains(self, typ: Type) -> bool {
let l2l = typ.log2_lane_count();
self.lanes.contains(l2l) && self.is_base_type(typ.lane_type())
if typ.is_dynamic_vector() {
let l2l = typ.log2_min_lane_count();
self.dynamic_lanes.contains(l2l) && self.is_base_type(typ.lane_type())
} else {
let l2l = typ.log2_lane_count();
self.lanes.contains(l2l) && self.is_base_type(typ.lane_type())
}
}
/// Get an example member of this type set.
@@ -712,6 +719,9 @@ enum OperandConstraint {
/// This operand is `ctrlType.merge_lanes()`.
MergeLanes,
/// This operand is `ctrlType.dynamic_to_vector()`.
DynamicToVector,
}
impl OperandConstraint {
@@ -738,15 +748,48 @@ impl OperandConstraint {
.expect("invalid type for half_vector"),
),
DoubleVector => Bound(ctrl_type.by(2).expect("invalid type for double_vector")),
SplitLanes => Bound(
SplitLanes => {
if ctrl_type.is_dynamic_vector() {
Bound(
ctrl_type
.dynamic_to_vector()
.expect("invalid type for dynamic_to_vector")
.split_lanes()
.expect("invalid type for split_lanes")
.vector_to_dynamic()
.expect("invalid dynamic type"),
)
} else {
Bound(
ctrl_type
.split_lanes()
.expect("invalid type for split_lanes"),
)
}
}
MergeLanes => {
if ctrl_type.is_dynamic_vector() {
Bound(
ctrl_type
.dynamic_to_vector()
.expect("invalid type for dynamic_to_vector")
.merge_lanes()
.expect("invalid type for merge_lanes")
.vector_to_dynamic()
.expect("invalid dynamic type"),
)
} else {
Bound(
ctrl_type
.merge_lanes()
.expect("invalid type for merge_lanes"),
)
}
}
DynamicToVector => Bound(
    ctrl_type
        .dynamic_to_vector()
        .expect("invalid type for dynamic_to_vector"),
),
}
}
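
The dynamic `SplitLanes`/`MergeLanes` paths above simply bounce
through the corresponding fixed-width type. For example, splitting the
lanes of `I32X4XN`:

    // I32X4XN -> I32X4 -> I16X8 -> I16X8XN
    let fixed = I32X4XN.dynamic_to_vector().unwrap();
    let split = fixed.split_lanes().unwrap();
    assert_eq!(split.vector_to_dynamic(), Some(I16X8XN));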
@@ -860,11 +903,13 @@ mod tests {
floats: BitSet8::from_range(0, 0),
bools: BitSet8::from_range(3, 7),
refs: BitSet8::from_range(5, 7),
dynamic_lanes: BitSet16::from_range(0, 4),
};
assert!(!vts.contains(I8));
assert!(vts.contains(I32));
assert!(vts.contains(I64));
assert!(vts.contains(I32X4));
assert!(vts.contains(I32X4XN));
assert!(!vts.contains(F32));
assert!(!vts.contains(B1));
assert!(vts.contains(B8));
@@ -879,6 +924,7 @@ mod tests {
floats: BitSet8::from_range(5, 7),
bools: BitSet8::from_range(3, 7),
refs: BitSet8::from_range(0, 0),
dynamic_lanes: BitSet16::from_range(0, 8),
};
assert_eq!(vts.example().to_string(), "f32");
@@ -888,6 +934,7 @@ mod tests {
floats: BitSet8::from_range(5, 7),
bools: BitSet8::from_range(3, 7),
refs: BitSet8::from_range(0, 0),
dynamic_lanes: BitSet16::from_range(0, 8),
};
assert_eq!(vts.example().to_string(), "f32x2");
@@ -897,9 +944,11 @@ mod tests {
floats: BitSet8::from_range(0, 0),
bools: BitSet8::from_range(3, 7),
refs: BitSet8::from_range(0, 0),
dynamic_lanes: BitSet16::from_range(0, 8),
};
assert!(!vts.contains(B32X2));
assert!(vts.contains(B32X4));
assert!(vts.contains(B16X4XN));
assert_eq!(vts.example().to_string(), "b32x4");
let vts = ValueTypeSet {
@@ -909,6 +958,7 @@ mod tests {
floats: BitSet8::from_range(0, 0),
bools: BitSet8::from_range(0, 0),
refs: BitSet8::from_range(0, 0),
dynamic_lanes: BitSet16::from_range(0, 8),
};
assert!(vts.contains(I32));
assert!(vts.contains(I32X4));

View File

@@ -5,6 +5,7 @@ mod builder;
pub mod condcodes;
pub mod constant;
pub mod dfg;
pub mod dynamic_type;
pub mod entities;
mod extfunc;
mod extname;
@@ -33,9 +34,10 @@ pub use crate::ir::builder::{
};
pub use crate::ir::constant::{ConstantData, ConstantPool};
pub use crate::ir::dfg::{DataFlowGraph, ValueDef};
pub use crate::ir::dynamic_type::{DynamicTypeData, DynamicTypes};
pub use crate::ir::entities::{
Block, Constant, FuncRef, GlobalValue, Heap, Immediate, Inst, JumpTable, SigRef, StackSlot,
Table, Value,
Block, Constant, DynamicStackSlot, DynamicType, FuncRef, GlobalValue, Heap, Immediate, Inst,
JumpTable, SigRef, StackSlot, Table, Value,
};
pub use crate::ir::extfunc::{
AbiParam, ArgumentExtension, ArgumentPurpose, ExtFuncData, Signature,
@@ -53,7 +55,9 @@ pub use crate::ir::libcall::{get_probestack_funcref, LibCall};
pub use crate::ir::memflags::{Endianness, MemFlags};
pub use crate::ir::progpoint::{ExpandedProgramPoint, ProgramOrder, ProgramPoint};
pub use crate::ir::sourceloc::SourceLoc;
pub use crate::ir::stackslot::{StackSlotData, StackSlotKind, StackSlots};
pub use crate::ir::stackslot::{
DynamicStackSlotData, DynamicStackSlots, StackSlotData, StackSlotKind, StackSlots,
};
pub use crate::ir::table::TableData;
pub use crate::ir::trapcode::TrapCode;
pub use crate::ir::types::Type;

View File

@@ -4,10 +4,18 @@
//!
use crate::entity::PrimaryMap;
use crate::ir::entities::{DynamicStackSlot, DynamicType};
use crate::ir::StackSlot;
use core::fmt;
use core::str::FromStr;
/// Imports only needed for testing.
#[allow(unused_imports)]
use crate::ir::{DynamicTypeData, GlobalValueData};
#[allow(unused_imports)]
use crate::ir::types::*;
#[cfg(feature = "enable-serde")]
use serde::{Deserialize, Serialize};
@@ -25,6 +33,9 @@ pub enum StackSlotKind {
/// An explicit stack slot. This is a chunk of stack memory for use by the `stack_load`
/// and `stack_store` instructions.
ExplicitSlot,
/// An explicit stack slot for dynamic vector types. This is a chunk of stack memory
/// for use by the `dynamic_stack_load` and `dynamic_stack_store` instructions.
ExplicitDynamicSlot,
}
impl FromStr for StackSlotKind {
@@ -34,6 +45,7 @@ impl FromStr for StackSlotKind {
use self::StackSlotKind::*;
match s {
"explicit_slot" => Ok(ExplicitSlot),
"explicit_dynamic_slot" => Ok(ExplicitDynamicSlot),
_ => Err(()),
}
}
@@ -44,6 +56,7 @@ impl fmt::Display for StackSlotKind {
use self::StackSlotKind::*;
f.write_str(match *self {
ExplicitSlot => "explicit_slot",
ExplicitDynamicSlot => "explicit_dynamic_slot",
})
}
}
@@ -68,11 +81,15 @@ impl StackSlotData {
/// Get the alignment in bytes of this stack slot given the stack pointer alignment.
pub fn alignment(&self, max_align: StackSize) -> StackSize {
debug_assert!(max_align.is_power_of_two());
// We want to find the largest power of two that divides both `self.size` and `max_align`.
// That is the same as isolating the rightmost bit in `x`.
let x = self.size | max_align;
// C.f. Hacker's delight.
x & x.wrapping_neg()
if self.kind == StackSlotKind::ExplicitDynamicSlot {
max_align
} else {
// We want to find the largest power of two that divides both `self.size` and `max_align`.
// That is the same as isolating the rightmost bit in `x`.
let x = self.size | max_align;
// C.f. Hacker's delight.
x & x.wrapping_neg()
}
}
}
@@ -82,9 +99,43 @@ impl fmt::Display for StackSlotData {
}
}
/// Contents of a dynamic stack slot.
#[derive(Clone, Debug, PartialEq, Eq)]
#[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
pub struct DynamicStackSlotData {
/// The kind of stack slot.
pub kind: StackSlotKind,
/// The type of this slot.
pub dyn_ty: DynamicType,
}
impl DynamicStackSlotData {
/// Create a dynamic stack slot of the specified kind and dynamic type.
pub fn new(kind: StackSlotKind, dyn_ty: DynamicType) -> Self {
assert!(kind == StackSlotKind::ExplicitDynamicSlot);
Self { kind, dyn_ty }
}
/// Get the alignment in bytes of this stack slot given the stack pointer alignment.
pub fn alignment(&self, max_align: StackSize) -> StackSize {
debug_assert!(max_align.is_power_of_two());
max_align
}
}
impl fmt::Display for DynamicStackSlotData {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{} {}", self.kind, self.dyn_ty)
}
}
/// All allocated stack slots.
pub type StackSlots = PrimaryMap<StackSlot, StackSlotData>;
/// All allocated dynamic stack slots.
pub type DynamicStackSlots = PrimaryMap<DynamicStackSlot, DynamicStackSlotData>;
#[cfg(test)]
mod tests {
use super::*;
@@ -95,16 +146,56 @@ mod tests {
fn stack_slot() {
let mut func = Function::new();
let ss0 = func.create_stack_slot(StackSlotData::new(StackSlotKind::ExplicitSlot, 4));
let ss1 = func.create_stack_slot(StackSlotData::new(StackSlotKind::ExplicitSlot, 8));
let ss0 = func.create_sized_stack_slot(StackSlotData::new(StackSlotKind::ExplicitSlot, 4));
let ss1 = func.create_sized_stack_slot(StackSlotData::new(StackSlotKind::ExplicitSlot, 8));
assert_eq!(ss0.to_string(), "ss0");
assert_eq!(ss1.to_string(), "ss1");
assert_eq!(func.stack_slots[ss0].size, 4);
assert_eq!(func.stack_slots[ss1].size, 8);
assert_eq!(func.sized_stack_slots[ss0].size, 4);
assert_eq!(func.sized_stack_slots[ss1].size, 8);
assert_eq!(func.stack_slots[ss0].to_string(), "explicit_slot 4");
assert_eq!(func.stack_slots[ss1].to_string(), "explicit_slot 8");
assert_eq!(func.sized_stack_slots[ss0].to_string(), "explicit_slot 4");
assert_eq!(func.sized_stack_slots[ss1].to_string(), "explicit_slot 8");
}
#[test]
fn dynamic_stack_slot() {
let mut func = Function::new();
let int_vector_ty = I32X4;
let fp_vector_ty = F64X2;
let scale0 = GlobalValueData::DynScaleTargetConst {
vector_type: int_vector_ty,
};
let scale1 = GlobalValueData::DynScaleTargetConst {
vector_type: fp_vector_ty,
};
let gv0 = func.create_global_value(scale0);
let gv1 = func.create_global_value(scale1);
let dtd0 = DynamicTypeData::new(int_vector_ty, gv0);
let dtd1 = DynamicTypeData::new(fp_vector_ty, gv1);
let dt0 = func.dfg.make_dynamic_ty(dtd0);
let dt1 = func.dfg.make_dynamic_ty(dtd1);
let dss0 = func.create_dynamic_stack_slot(DynamicStackSlotData::new(
StackSlotKind::ExplicitDynamicSlot,
dt0,
));
let dss1 = func.create_dynamic_stack_slot(DynamicStackSlotData::new(
StackSlotKind::ExplicitDynamicSlot,
dt1,
));
assert_eq!(dss0.to_string(), "dss0");
assert_eq!(dss1.to_string(), "dss1");
assert_eq!(
func.dynamic_stack_slots[dss0].to_string(),
"explicit_dynamic_slot dt0"
);
assert_eq!(
func.dynamic_stack_slots[dss1].to_string(),
"explicit_dynamic_slot dt1"
);
}
#[test]

View File

@@ -233,7 +233,12 @@ impl Type {
///
/// A vector type has 2 or more lanes.
pub fn is_vector(self) -> bool {
self.0 >= constants::VECTOR_BASE
self.0 >= constants::VECTOR_BASE && !self.is_dynamic_vector()
}
/// Is this a SIMD vector type with a runtime number of lanes?
pub fn is_dynamic_vector(self) -> bool {
self.0 >= constants::DYNAMIC_VECTOR_BASE
}
/// Is this a scalar boolean type?
@@ -288,19 +293,62 @@ impl Type {
///
/// A scalar type is the same as a SIMD vector type with one lane, so it returns 0.
pub fn log2_lane_count(self) -> u32 {
(self.0.saturating_sub(constants::LANE_BASE) >> 4) as u32
if self.is_dynamic_vector() {
0
} else {
(self.0.saturating_sub(constants::LANE_BASE) >> 4) as u32
}
}
/// Get log_2 of the minimum number of lanes in this SIMD vector type; this supports
/// both fixed and dynamic types.
pub fn log2_min_lane_count(self) -> u32 {
if self.is_dynamic_vector() {
(self
.0
.saturating_sub(constants::VECTOR_BASE + constants::LANE_BASE)
>> 4) as u32
} else {
self.log2_lane_count()
}
}
/// Get the number of lanes in this SIMD vector type.
///
/// A scalar type is the same as a SIMD vector type with one lane, so it returns 1.
pub fn lane_count(self) -> u32 {
1 << self.log2_lane_count()
if self.is_dynamic_vector() {
0
} else {
1 << self.log2_lane_count()
}
}
/// Get the total number of bits used to represent this type.
pub fn bits(self) -> u32 {
self.lane_bits() * self.lane_count()
if self.is_dynamic_vector() {
0
} else {
self.lane_bits() * self.lane_count()
}
}
/// Get the minimum number of lanes in this SIMD vector type; this supports both
/// fixed and dynamic types.
pub fn min_lane_count(self) -> u32 {
if self.is_dynamic_vector() {
1 << self.log2_min_lane_count()
} else {
1 << self.log2_lane_count()
}
}
/// Get the minimum number of bits used to represent this type.
pub fn min_bits(self) -> u32 {
if self.is_dynamic_vector() {
self.lane_bits() * self.min_lane_count()
} else {
self.bits()
}
}
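
Concretely, in the style of the tests below: a dynamic type reports
zero for its 'static' properties but exposes its minimum footprint.

    assert_eq!(I32X4XN.lane_count(), 0);     // static lane count is unknown
    assert_eq!(I32X4XN.bits(), 0);           // so the total bit-width is too
    assert_eq!(I32X4XN.min_lane_count(), 4); // but the minimum shape is i32x4
    assert_eq!(I32X4XN.min_bits(), 128);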
/// Get the number of bytes used to store this type in memory.
@@ -315,23 +363,46 @@ impl Type {
/// If this is already a SIMD vector type, this produces a SIMD vector type with `n *
/// self.lane_count()` lanes.
pub fn by(self, n: u32) -> Option<Self> {
if self.is_dynamic_vector() {
return None;
}
if self.lane_bits() == 0 || !n.is_power_of_two() {
return None;
}
let log2_lanes: u32 = n.trailing_zeros();
let new_type = u32::from(self.0) + (log2_lanes << 4);
if new_type < 0x100 {
if new_type < constants::DYNAMIC_VECTOR_BASE as u32
&& (new_type as u16) < constants::DYNAMIC_VECTOR_BASE
{
Some(Self(new_type as u16))
} else {
None
}
}
/// Convert a fixed vector type to a dynamic one.
pub fn vector_to_dynamic(self) -> Option<Self> {
assert!(self.is_vector());
if self.bits() > 256 {
return None;
}
let ty = Self(self.0 + constants::VECTOR_BASE);
assert!(ty.is_dynamic_vector());
Some(ty)
}
/// Convert a dynamic vector type to a fixed one.
pub fn dynamic_to_vector(self) -> Option<Self> {
assert!(self.is_dynamic_vector());
Some(Self(self.0 - constants::VECTOR_BASE))
}
/// Get a SIMD vector with half the number of lanes.
///
/// There is no `double_vector()` method. Use `t.by(2)` instead.
pub fn half_vector(self) -> Option<Self> {
if self.is_vector() {
if self.is_vector() && !self.is_dynamic_vector() {
Some(Self(self.0 - 0x10))
} else {
None
@@ -418,6 +489,8 @@ impl Display for Type {
write!(f, "f{}", self.lane_bits())
} else if self.is_vector() {
write!(f, "{}x{}", self.lane_type(), self.lane_count())
} else if self.is_dynamic_vector() {
write!(f, "{}x{}xN", self.lane_type(), self.min_lane_count())
} else if self.is_ref() {
write!(f, "r{}", self.lane_bits())
} else {
@@ -441,6 +514,8 @@ impl Debug for Type {
write!(f, "types::F{}", self.lane_bits())
} else if self.is_vector() {
write!(f, "{:?}X{}", self.lane_type(), self.lane_count())
} else if self.is_dynamic_vector() {
write!(f, "{:?}X{}XN", self.lane_type(), self.min_lane_count())
} else if self.is_ref() {
write!(f, "types::R{}", self.lane_bits())
} else {
@@ -568,6 +643,55 @@ mod tests {
assert_eq!(F64.by(8), Some(F64X8));
}
#[test]
fn dynamic_vectors() {
// Identification.
assert_eq!(I8X16XN.is_dynamic_vector(), true);
assert_eq!(B16X4XN.is_dynamic_vector(), true);
assert_eq!(F32X8XN.is_dynamic_vector(), true);
assert_eq!(F64X4XN.is_dynamic_vector(), true);
assert_eq!(I128X2XN.is_dynamic_vector(), true);
// Lane counts.
assert_eq!(I16X8XN.lane_count(), 0);
assert_eq!(I16X8XN.min_lane_count(), 8);
// Size
assert_eq!(B32X2XN.bits(), 0);
assert_eq!(B32X2XN.min_bits(), 64);
// Change lane counts
assert_eq!(F64X4XN.half_vector(), None);
assert_eq!(I8X8XN.by(2), None);
// Conversions to and from vectors.
assert_eq!(B8.by(8).unwrap().vector_to_dynamic(), Some(B8X8XN));
assert_eq!(I8.by(16).unwrap().vector_to_dynamic(), Some(I8X16XN));
assert_eq!(I16.by(8).unwrap().vector_to_dynamic(), Some(I16X8XN));
assert_eq!(B16.by(16).unwrap().vector_to_dynamic(), Some(B16X16XN));
assert_eq!(B32.by(2).unwrap().vector_to_dynamic(), Some(B32X2XN));
assert_eq!(B32.by(8).unwrap().vector_to_dynamic(), Some(B32X8XN));
assert_eq!(I32.by(4).unwrap().vector_to_dynamic(), Some(I32X4XN));
assert_eq!(F32.by(4).unwrap().vector_to_dynamic(), Some(F32X4XN));
assert_eq!(F64.by(2).unwrap().vector_to_dynamic(), Some(F64X2XN));
assert_eq!(I128.by(2).unwrap().vector_to_dynamic(), Some(I128X2XN));
assert_eq!(I128X2XN.dynamic_to_vector(), Some(I128X2));
assert_eq!(B64X2XN.dynamic_to_vector(), Some(B64X2));
assert_eq!(F32X4XN.dynamic_to_vector(), Some(F32X4));
assert_eq!(F64X4XN.dynamic_to_vector(), Some(F64X4));
assert_eq!(I32X2XN.dynamic_to_vector(), Some(I32X2));
assert_eq!(I32X8XN.dynamic_to_vector(), Some(I32X8));
assert_eq!(I16X16XN.dynamic_to_vector(), Some(I16X16));
assert_eq!(I8X32XN.dynamic_to_vector(), Some(I8X32));
assert_eq!(I8X64.vector_to_dynamic(), None);
assert_eq!(B16X32.vector_to_dynamic(), None);
assert_eq!(F32X16.vector_to_dynamic(), None);
assert_eq!(I64X8.vector_to_dynamic(), None);
assert_eq!(I128X4.vector_to_dynamic(), None);
}
#[test]
fn format_scalars() {
assert_eq!(IFLAGS.to_string(), "iflags");

View File

@@ -5,7 +5,7 @@ use crate::ir::types;
use crate::ir::types::*;
use crate::ir::MemFlags;
use crate::ir::Opcode;
use crate::ir::{ExternalName, LibCall};
use crate::ir::{ExternalName, LibCall, Signature};
use crate::isa;
use crate::isa::aarch64::{inst::EmitState, inst::*};
use crate::isa::unwind::UnwindInst;
@@ -155,6 +155,7 @@ fn saved_reg_stack_size(
} else {
vec_reg.len() & 1
};
// FIXME: SVE: ABI is different to Neon, so do we treat all vec regs as Z-regs?
let vec_save_bytes = (vec_reg.len() + vec_save_padding) * vec_reg_size;
(int_save_bytes, vec_save_bytes)
@@ -365,9 +366,15 @@ impl ABIMachineSpec for AArch64MachineDeps {
RegClass::Int => xreg(*next_reg),
RegClass::Float => vreg(*next_reg),
};
// Overlay Z-regs on V-regs for parameter passing.
let ty = if param.value_type.is_dynamic_vector() {
dynamic_to_fixed(param.value_type)
} else {
param.value_type
};
ret.push(ABIArg::reg(
reg.to_real_reg().unwrap(),
param.value_type,
ty,
param.extension,
param.purpose,
));
@@ -558,6 +565,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
}
fn gen_get_stack_addr(mem: StackAMode, into_reg: Writable<Reg>, _ty: Type) -> Inst {
// FIXME: Do something different for dynamic types?
let mem = mem.into();
Inst::LoadAddr { rd: into_reg, mem }
}
@@ -931,6 +939,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
fn gen_clobber_restore(
call_conv: isa::CallConv,
sig: &Signature,
flags: &settings::Flags,
clobbers: &[Writable<RealReg>],
fixed_frame_storage_size: u32,
@@ -938,7 +947,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
) -> SmallVec<[Inst; 16]> {
let mut insts = SmallVec::new();
let (clobbered_int, clobbered_vec) =
get_regs_restored_in_epilogue(call_conv, flags, clobbers);
get_regs_restored_in_epilogue(call_conv, flags, sig, clobbers);
// Free the fixed frame if necessary.
if fixed_frame_storage_size > 0 {
@@ -1146,11 +1155,12 @@ impl ABIMachineSpec for AArch64MachineDeps {
insts
}
fn get_number_of_spillslots_for_value(rc: RegClass) -> u32 {
fn get_number_of_spillslots_for_value(rc: RegClass, vector_size: u32) -> u32 {
assert_eq!(vector_size % 8, 0);
// We allocate in terms of 8-byte slots.
match rc {
RegClass::Int => 1,
RegClass::Float => 2,
RegClass::Float => vector_size / 8,
}
}
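
With Neon the caller passes a 16-byte vector size, so Float-class
values keep taking 16 / 8 = 2 eight-byte slots, matching the old
hard-coded value; a hypothetical 256-bit vector target would get 4:

    assert_eq!(AArch64MachineDeps::get_number_of_spillslots_for_value(RegClass::Float, 16), 2);
    assert_eq!(AArch64MachineDeps::get_number_of_spillslots_for_value(RegClass::Float, 32), 4);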
@@ -1195,12 +1205,15 @@ impl ABIMachineSpec for AArch64MachineDeps {
fn get_clobbered_callee_saves(
call_conv: isa::CallConv,
flags: &settings::Flags,
sig: &Signature,
regs: &[Writable<RealReg>],
) -> Vec<Writable<RealReg>> {
let mut regs: Vec<Writable<RealReg>> = regs
.iter()
.cloned()
.filter(|r| is_reg_saved_in_prologue(call_conv, flags.enable_pinned_reg(), r.to_reg()))
.filter(|r| {
is_reg_saved_in_prologue(call_conv, flags.enable_pinned_reg(), sig, r.to_reg())
})
.collect();
// Sort registers for deterministic code output. We can do an unstable
@@ -1235,7 +1248,12 @@ fn legal_type_for_machine(ty: Type) -> bool {
/// Is the given register saved in the prologue if clobbered, i.e., is it a
/// callee-save?
fn is_reg_saved_in_prologue(call_conv: isa::CallConv, enable_pinned_reg: bool, r: RealReg) -> bool {
fn is_reg_saved_in_prologue(
call_conv: isa::CallConv,
enable_pinned_reg: bool,
sig: &Signature,
r: RealReg,
) -> bool {
if call_conv.extends_baldrdash() {
match r.class() {
RegClass::Int => {
@@ -1249,6 +1267,14 @@ fn is_reg_saved_in_prologue(call_conv: isa::CallConv, enable_pinned_reg: bool, r
};
}
// FIXME: We need to inspect whether a function is returning Z or P regs too.
let save_z_regs = sig
.params
.iter()
.filter(|p| p.value_type.is_dynamic_vector())
.count()
!= 0;
match r.class() {
RegClass::Int => {
// x19 - x28 inclusive are callee-saves.
@@ -1262,8 +1288,17 @@ fn is_reg_saved_in_prologue(call_conv: isa::CallConv, enable_pinned_reg: bool, r
}
}
RegClass::Float => {
// v8 - v15 inclusive are callee-saves.
r.hw_enc() >= 8 && r.hw_enc() <= 15
// If a subroutine takes at least one argument in scalable vector registers
// or scalable predicate registers, or if it is a function that returns
// results in such registers, it must ensure that the entire contents of
// z8-z23 are preserved across the call. In other cases it need only
// preserve the low 64 bits of z8-z15.
if save_z_regs {
r.hw_enc() >= 8 && r.hw_enc() <= 23
} else {
// v8 - v15 inclusive are callee-saves.
r.hw_enc() >= 8 && r.hw_enc() <= 15
}
}
}
}
@@ -1274,12 +1309,13 @@ fn is_reg_saved_in_prologue(call_conv: isa::CallConv, enable_pinned_reg: bool, r
fn get_regs_restored_in_epilogue(
call_conv: isa::CallConv,
flags: &settings::Flags,
sig: &Signature,
regs: &[Writable<RealReg>],
) -> (Vec<Writable<RealReg>>, Vec<Writable<RealReg>>) {
let mut int_saves = vec![];
let mut vec_saves = vec![];
for &reg in regs {
if is_reg_saved_in_prologue(call_conv, flags.enable_pinned_reg(), reg.to_reg()) {
if is_reg_saved_in_prologue(call_conv, flags.enable_pinned_reg(), sig, reg.to_reg()) {
match reg.to_reg().class() {
RegClass::Int => int_saves.push(reg),
RegClass::Float => vec_saves.push(reg),

View File

@@ -919,6 +919,17 @@
(Size64x2)
))
(type DynamicVectorSize extern
(enum
(Size8x8xN)
(Size8x16xN)
(Size16x4xN)
(Size16x8xN)
(Size32x2xN)
(Size32x4xN)
(Size64x2xN)
))
;; Helper for calculating the `VectorSize` corresponding to a type
(decl vector_size (Type) VectorSize)
(rule (vector_size (multi_lane 8 8)) (VectorSize.Size8x8))
@@ -928,6 +939,13 @@
(rule (vector_size (multi_lane 32 2)) (VectorSize.Size32x2))
(rule (vector_size (multi_lane 32 4)) (VectorSize.Size32x4))
(rule (vector_size (multi_lane 64 2)) (VectorSize.Size64x2))
(rule (vector_size (dynamic_lane 8 8)) (VectorSize.Size8x8))
(rule (vector_size (dynamic_lane 8 16)) (VectorSize.Size8x16))
(rule (vector_size (dynamic_lane 16 4)) (VectorSize.Size16x4))
(rule (vector_size (dynamic_lane 16 8)) (VectorSize.Size16x8))
(rule (vector_size (dynamic_lane 32 2)) (VectorSize.Size32x2))
(rule (vector_size (dynamic_lane 32 4)) (VectorSize.Size32x4))
(rule (vector_size (dynamic_lane 64 2)) (VectorSize.Size64x2))
;; A floating-point unit (FPU) operation with one arg.
(type FPUOp1

View File

@@ -706,12 +706,9 @@ impl VectorSize {
/// Get the scalar operand size that corresponds to a lane of a vector with a certain size.
pub fn lane_size(&self) -> ScalarSize {
match self {
VectorSize::Size8x8 => ScalarSize::Size8,
VectorSize::Size8x16 => ScalarSize::Size8,
VectorSize::Size16x4 => ScalarSize::Size16,
VectorSize::Size16x8 => ScalarSize::Size16,
VectorSize::Size32x2 => ScalarSize::Size32,
VectorSize::Size32x4 => ScalarSize::Size32,
VectorSize::Size8x8 | VectorSize::Size8x16 => ScalarSize::Size8,
VectorSize::Size16x4 | VectorSize::Size16x8 => ScalarSize::Size16,
VectorSize::Size32x2 | VectorSize::Size32x4 => ScalarSize::Size32,
VectorSize::Size64x2 => ScalarSize::Size64,
}
}
@@ -743,3 +740,18 @@ impl VectorSize {
(q, size)
}
}
pub(crate) fn dynamic_to_fixed(ty: Type) -> Type {
match ty {
I8X8XN => I8X8,
I8X16XN => I8X16,
I16X4XN => I16X4,
I16X8XN => I16X8,
I32X2XN => I32X2,
I32X4XN => I32X4,
I64X2XN => I64X2,
F32X4XN => F32X4,
F64X2XN => F64X2,
_ => unreachable!("unhandled type: {}", ty),
}
}
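
A quick illustration of the overlaying used for parameter passing
above (the helper is `pub(crate)`, so only callable inside the
backend):

    assert_eq!(dynamic_to_fixed(I32X4XN), I32X4);
    assert_eq!(dynamic_to_fixed(F64X2XN), F64X2);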

View File

@@ -89,12 +89,12 @@ pub fn mem_finalize(
//=============================================================================
// Instructions and subcomponents: emission
fn machreg_to_gpr(m: Reg) -> u32 {
pub(crate) fn machreg_to_gpr(m: Reg) -> u32 {
assert_eq!(m.class(), RegClass::Int);
u32::try_from(m.to_real_reg().unwrap().hw_enc() & 31).unwrap()
}
fn machreg_to_vec(m: Reg) -> u32 {
pub(crate) fn machreg_to_vec(m: Reg) -> u32 {
assert_eq!(m.class(), RegClass::Float);
u32::try_from(m.to_real_reg().unwrap().hw_enc()).unwrap()
}
@@ -2259,7 +2259,7 @@ impl MachInstEmit for Inst {
VectorSize::Size16x8 => 0b00010,
VectorSize::Size32x4 => 0b00100,
VectorSize::Size64x2 => 0b01000,
_ => unimplemented!(),
_ => unimplemented!("Unexpected VectorSize: {:?}", size),
};
sink.put4(
0b010_01110000_00000_000011_00000_00000

View File

@@ -1194,6 +1194,7 @@ impl MachInst for Inst {
assert!(ty.bits() <= 128);
Ok((&[RegClass::Float], &[I8X16]))
}
_ if ty.is_dynamic_vector() => Ok((&[RegClass::Float], &[I8X16])),
IFLAGS | FFLAGS => Ok((&[RegClass::Int], &[I64])),
_ => Err(CodegenError::Unsupported(format!(
"Unexpected SSA-value type: {}",

View File

@@ -332,12 +332,9 @@ pub fn show_vreg_element(reg: Reg, idx: u8, size: VectorSize) -> String {
assert_eq!(RegClass::Float, reg.class());
let s = show_reg(reg);
let suffix = match size {
VectorSize::Size8x8 => ".b",
VectorSize::Size8x16 => ".b",
VectorSize::Size16x4 => ".h",
VectorSize::Size16x8 => ".h",
VectorSize::Size32x2 => ".s",
VectorSize::Size32x4 => ".s",
VectorSize::Size8x8 | VectorSize::Size8x16 => ".b",
VectorSize::Size16x4 | VectorSize::Size16x8 => ".h",
VectorSize::Size32x2 | VectorSize::Size32x4 => ".s",
VectorSize::Size64x2 => ".d",
};
format!("{}{}[{}]", s, suffix, idx)

View File

@@ -117,7 +117,7 @@ mod tests {
pos.ins().return_(&[]);
if let Some(stack_slot) = stack_slot {
func.stack_slots.push(stack_slot);
func.sized_stack_slots.push(stack_slot);
}
func

View File

@@ -0,0 +1,8 @@
;; Move helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(decl fpu_move_128 (Reg) Reg)
(rule (fpu_move_128 src)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.FpuMove128 dst src))))
(writable_reg_to_reg dst)))

View File

@@ -81,6 +81,9 @@
(rule (lower (has_type ty @ (multi_lane _ _) (iadd x y)))
(add_vec x y (vector_size ty)))
(rule (lower (has_type ty @ (dynamic_lane _ _) (iadd x y)))
(value_reg (vec_rrr (VecALUOp.Add) (put_in_reg x) (put_in_reg y) (vector_size ty))))
;; `i128`
(rule (lower (has_type $I128 (iadd x y)))
(let
@@ -157,6 +160,8 @@
;; vectors
(rule (lower (has_type ty @ (multi_lane _ _) (isub x y)))
(sub_vec x y (vector_size ty)))
(rule (lower (has_type ty @ (dynamic_lane _ _) (isub x y)))
(value_reg (sub_vec (put_in_reg x) (put_in_reg y) (vector_size ty))))
;; `i128`
(rule (lower (has_type $I128 (isub x y)))
@@ -244,6 +249,10 @@
(rule (lower (has_type (ty_vec128 ty @ (not_i64x2)) (imul x y)))
(mul x y (vector_size ty)))
;; Case for 'dynamic' i8x16, i16x8, and i32x4.
(rule (lower (has_type ty @ (dynamic_lane _ _) (imul x y)))
(value_reg (vec_rrr (VecALUOp.Mul) (put_in_reg x) (put_in_reg y) (vector_size ty))))
;; Special lowering for i64x2.
;;
;; This I64X2 multiplication is performed with several 32-bit

View File

@@ -0,0 +1,30 @@
;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty @ (dynamic_lane _ _) (iadd x y)))
(value_reg (vec_rrr (VecALUOp.Add) (put_in_reg x) (put_in_reg y) (vector_size ty))))
;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty @ (dynamic_lane _ _) (isub x y)))
(value_reg (vec_rrr (VecALUOp.Sub) (put_in_reg x) (put_in_reg y) (vector_size ty))))
;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type (lane_fits_in_32 ty @ (dynamic_lane _ _)) (imul x y)))
(value_reg (vec_rrr (VecALUOp.Mul) (put_in_reg x) (put_in_reg y) (vector_size ty))))
;;;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty @ (dynamic_lane _ _) (fadd x y)))
(value_reg (vec_rrr (VecALUOp.Fadd) (put_in_reg x) (put_in_reg y) (vector_size ty))))
;;;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty @ (dynamic_lane _ _) (fsub x y)))
(value_reg (vec_rrr (VecALUOp.Fsub) (put_in_reg x) (put_in_reg y) (vector_size ty))))
;;; Rules for `dynamic_stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (dynamic_stack_addr stack_slot))
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (abi_dynamic_stackslot_addr dst stack_slot))))
(value_reg dst)))
;;; Rules for `extract_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (extract_vector x 0))
(value_reg (fpu_move_128 (put_in_reg x))))

View File

@@ -124,7 +124,10 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
insn,
&inputs[..],
outputs[0],
|ctx, dst, elem_ty, mem| {
|ctx, dst, mut elem_ty, mem| {
if elem_ty.is_dynamic_vector() {
elem_ty = dynamic_to_fixed(elem_ty);
}
let rd = dst.only_reg().unwrap();
let is_float = ty_has_float_or_vec_representation(elem_ty);
ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
@@ -177,7 +180,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::Store | Opcode::Istore8 | Opcode::Istore16 | Opcode::Istore32 => {
let off = ctx.data(insn).load_store_offset().unwrap();
let elem_ty = match op {
let mut elem_ty = match op {
Opcode::Istore8 => I8,
Opcode::Istore16 => I16,
Opcode::Istore32 => I32,
@@ -200,6 +203,9 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
flags,
});
} else {
if elem_ty.is_dynamic_vector() {
elem_ty = dynamic_to_fixed(elem_ty);
}
let rd = dst.only_reg().unwrap();
let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
ctx.emit(match (ty_bits(elem_ty), is_float) {
@@ -231,12 +237,15 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
};
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let offset: i32 = offset.into();
let inst = ctx
.abi()
.stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), rd);
assert!(ctx.abi().sized_stackslot_offsets().is_valid(stack_slot));
let inst =
ctx.abi()
.sized_stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), rd);
ctx.emit(inst);
}
Opcode::DynamicStackAddr => implemented_in_isle(ctx),
Opcode::AtomicRmw => implemented_in_isle(ctx),
Opcode::AtomicCas => implemented_in_isle(ctx),
@@ -249,7 +258,10 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::Fence {});
}
Opcode::StackLoad | Opcode::StackStore => {
Opcode::StackLoad
| Opcode::StackStore
| Opcode::DynamicStackStore
| Opcode::DynamicStackLoad => {
panic!("Direct stack memory access not supported; should not be used by Wasm");
}
@@ -684,7 +696,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let idx = *imm;
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let size = VectorSize::from_ty(ctx.input_ty(insn, 0));
let input_ty = ctx.input_ty(insn, 0);
let size = VectorSize::from_ty(input_ty);
let ty = ty.unwrap();
if ty_has_int_representation(ty) {
@@ -730,7 +743,14 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::Splat => {
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let size = VectorSize::from_ty(ty.unwrap());
let ty = ty.unwrap();
// TODO: Handle SVE Dup.
let ty = if ty.is_dynamic_vector() {
dynamic_to_fixed(ty)
} else {
ty
};
let size = VectorSize::from_ty(ty);
if let Some((_, insn)) = maybe_input_insn_multi(
ctx,
@@ -1284,7 +1304,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if !ty.is_vector() {
if !ty.is_vector() && !ty.is_dynamic_vector() {
let fpu_op = match op {
Opcode::Fadd => FPUOp2::Add,
Opcode::Fsub => FPUOp2::Sub,
@@ -1336,7 +1356,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
debug_assert!(lane_type == F32 || lane_type == F64);
if ty.is_vector() {
if ty.is_vector() || ty.is_dynamic_vector() {
let size = VectorSize::from_ty(ty);
// pmin(a,b) => bitsel(b, a, cmpgt(a, b))
@@ -2015,7 +2035,15 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
.map_or(true, |insn| {
const_param_to_u128(ctx, insn).expect("Invalid immediate bytes") != 0
});
let op = match (op, ty.unwrap()) {
let ty = ty.unwrap();
let ty = if ty.is_dynamic_vector() {
ty.dynamic_to_vector()
.unwrap_or_else(|| panic!("Unsupported dynamic type: {}?", ty))
} else {
ty
};
let op = match (op, ty) {
(Opcode::Snarrow, I8X16) => VecRRNarrowOp::Sqxtn16,
(Opcode::Snarrow, I16X8) => VecRRNarrowOp::Sqxtn32,
(Opcode::Snarrow, I32X4) => VecRRNarrowOp::Sqxtn64,
@@ -2057,7 +2085,14 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::SwidenLow | Opcode::SwidenHigh | Opcode::UwidenLow | Opcode::UwidenHigh => {
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let (t, high_half) = match (ty.unwrap(), op) {
let ty = ty.unwrap();
let ty = if ty.is_dynamic_vector() {
ty.dynamic_to_vector()
.unwrap_or_else(|| panic!("Unsupported dynamic type: {}?", ty))
} else {
ty
};
let (t, high_half) = match (ty, op) {
(I16X8, Opcode::SwidenLow) => (VecExtendOp::Sxtl8, false),
(I16X8, Opcode::SwidenHigh) => (VecExtendOp::Sxtl8, true),
(I16X8, Opcode::UwidenLow) => (VecExtendOp::Uxtl8, false),
@@ -2182,6 +2217,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
});
}
Opcode::ExtractVector => implemented_in_isle(ctx),
Opcode::ConstAddr | Opcode::Vconcat | Opcode::Vsplit | Opcode::IfcmpSp => {
return Err(CodegenError::Unsupported(format!(
"Unimplemented lowering: {}",

View File

@@ -1,7 +1,7 @@
//! ARM 64-bit Instruction Set Architecture.
use crate::ir::condcodes::IntCC;
use crate::ir::Function;
use crate::ir::{Function, Type};
use crate::isa::aarch64::settings as aarch64_settings;
use crate::isa::{Builder as IsaBuilder, TargetIsa};
use crate::machinst::{
@@ -57,7 +57,7 @@ impl AArch64Backend {
flags: shared_settings::Flags,
) -> CodegenResult<(VCode<inst::Inst>, regalloc2::Output)> {
let emit_info = EmitInfo::new(flags.clone());
let abi = Box::new(abi::AArch64ABICallee::new(func, flags, self.isa_flags())?);
let abi = Box::new(abi::AArch64ABICallee::new(func, self)?);
compile::compile::<AArch64Backend>(func, self, abi, &self.machine_env, emit_info)
}
}
@@ -76,7 +76,8 @@ impl TargetIsa for AArch64Backend {
let frame_size = emit_result.frame_size;
let value_labels_ranges = emit_result.value_labels_ranges;
let buffer = emit_result.buffer.finish();
let stackslot_offsets = emit_result.stackslot_offsets;
let sized_stackslot_offsets = emit_result.sized_stackslot_offsets;
let dynamic_stackslot_offsets = emit_result.dynamic_stackslot_offsets;
if let Some(disasm) = emit_result.disasm.as_ref() {
log::debug!("disassembly:\n{}", disasm);
@@ -87,7 +88,8 @@ impl TargetIsa for AArch64Backend {
frame_size,
disasm: emit_result.disasm,
value_labels_ranges,
stackslot_offsets,
sized_stackslot_offsets,
dynamic_stackslot_offsets,
bb_starts: emit_result.bb_offsets,
bb_edges: emit_result.bb_edges,
})
@@ -109,6 +111,10 @@ impl TargetIsa for AArch64Backend {
self.isa_flags.iter().collect()
}
fn dynamic_vector_bytes(&self, _dyn_ty: Type) -> u32 {
16
}
fn unsigned_add_overflow_condition(&self) -> IntCC {
// Unsigned `>=`; this corresponds to the carry flag set on aarch64, which happens on
// overflow of an add.

View File

@@ -196,7 +196,7 @@ pub struct TargetFrontendConfig {
impl TargetFrontendConfig {
/// Get the pointer type of this target.
pub fn pointer_type(self) -> ir::Type {
ir::Type::int(u16::from(self.pointer_bits())).unwrap()
ir::Type::int(self.pointer_bits() as u16).unwrap()
}
/// Get the width of pointers on this target, in units of bits.
@@ -226,6 +226,9 @@ pub trait TargetIsa: fmt::Display + Send + Sync {
/// Get the ISA-dependent flag values that were used to make this trait object.
fn isa_flags(&self) -> Vec<settings::Value>;
/// Get the ISA-dependent maximum vector register size, in bytes.
fn dynamic_vector_bytes(&self, dynamic_ty: ir::Type) -> u32;
/// Compile the given function.
fn compile_function(
&self,
@@ -311,7 +314,7 @@ impl<'a> dyn TargetIsa + 'a {
/// Get the pointer type of this ISA.
pub fn pointer_type(&self) -> ir::Type {
ir::Type::int(u16::from(self.pointer_bits())).unwrap()
ir::Type::int(self.pointer_bits() as u16).unwrap()
}
/// Get the width of pointers on this ISA.

View File

@@ -61,6 +61,7 @@ use crate::ir;
use crate::ir::condcodes::IntCC;
use crate::ir::types;
use crate::ir::MemFlags;
use crate::ir::Signature;
use crate::ir::Type;
use crate::isa;
use crate::isa::s390x::inst::*;
@@ -556,6 +557,7 @@ impl ABIMachineSpec for S390xMachineDeps {
fn gen_clobber_restore(
call_conv: isa::CallConv,
_: &Signature,
_: &settings::Flags,
clobbers: &[Writable<RealReg>],
fixed_frame_storage_size: u32,
@@ -633,7 +635,7 @@ impl ABIMachineSpec for S390xMachineDeps {
unimplemented!("StructArgs not implemented for S390X yet");
}
fn get_number_of_spillslots_for_value(rc: RegClass) -> u32 {
fn get_number_of_spillslots_for_value(rc: RegClass, _vector_scale: u32) -> u32 {
// We allocate in terms of 8-byte slots.
match rc {
RegClass::Int => 1,
@@ -665,6 +667,7 @@ impl ABIMachineSpec for S390xMachineDeps {
fn get_clobbered_callee_saves(
call_conv: isa::CallConv,
flags: &settings::Flags,
_sig: &Signature,
regs: &[Writable<RealReg>],
) -> Vec<Writable<RealReg>> {
assert!(
@@ -688,7 +691,7 @@ impl ABIMachineSpec for S390xMachineDeps {
_is_leaf: bool,
_stack_args_size: u32,
_num_clobbered_callee_saves: usize,
_fixed_frame_storage_size: u32,
_frame_storage_size: u32,
) -> bool {
// The call frame set-up is handled by gen_clobber_save().
false

View File

@@ -1158,9 +1158,6 @@
;; Helpers for stack-slot addresses ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(decl abi_stackslot_addr (WritableReg StackSlot Offset32) MInst)
(extern constructor abi_stackslot_addr abi_stackslot_addr)
(decl stack_addr_impl (Type StackSlot Offset32) Reg)
(rule (stack_addr_impl ty stack_slot offset)
(let ((dst WritableReg (temp_writable_reg ty))

View File

@@ -148,7 +148,7 @@ mod tests {
pos.ins().return_(&[]);
if let Some(stack_slot) = stack_slot {
func.stack_slots.push(stack_slot);
func.sized_stack_slots.push(stack_slot);
}
func
@@ -206,7 +206,7 @@ mod tests {
pos.ins().return_(&[]);
if let Some(stack_slot) = stack_slot {
func.stack_slots.push(stack_slot);
func.sized_stack_slots.push(stack_slot);
}
func

View File

@@ -2301,7 +2301,7 @@
(decl lower_call_ret_arg (ABISig) InstOutput)
(rule (lower_call_ret_arg (abi_no_ret_arg)) (output_none))
(rule (lower_call_ret_arg abi @ (abi_ret_arg (abi_arg_only_slot slot)))
(let ((ret_arg Reg (load_addr (memarg_stack_off (abi_stack_arg_space abi) 0)))
(let ((ret_arg Reg (load_addr (memarg_stack_off (abi_sized_stack_arg_space abi) 0)))
(_ Unit (copy_reg_to_arg_slot 0 slot ret_arg)))
(output_none)))
@@ -2309,7 +2309,7 @@
(decl lower_call_rets (ABISig Range InstOutputBuilder) InstOutput)
(rule (lower_call_rets abi (range_empty) builder) (output_builder_finish builder))
(rule (lower_call_rets abi (range_unwrap head tail) builder)
(let ((ret ValueRegs (copy_from_arg (abi_stack_arg_space abi) (abi_get_ret abi head)))
(let ((ret ValueRegs (copy_from_arg (abi_sized_stack_arg_space abi) (abi_get_ret abi head)))
(_ Unit (output_builder_push builder ret)))
(lower_call_rets abi tail builder)))

View File

@@ -197,7 +197,11 @@ impl LowerBackend for S390xBackend {
| Opcode::SqmulRoundSat
| Opcode::FvpromoteLow
| Opcode::Fvdemote
| Opcode::IaddPairwise => {
| Opcode::IaddPairwise
| Opcode::DynamicStackLoad
| Opcode::DynamicStackStore
| Opcode::DynamicStackAddr
| Opcode::ExtractVector => {
unreachable!(
"TODO: not yet implemented in ISLE: inst = `{}`, type = `{:?}`",
ctx.dfg().display_inst(ir_inst),

View File

@@ -16,7 +16,7 @@ use crate::settings::Flags;
use crate::{
ir::{
condcodes::*, immediates::*, types::*, AtomicRmwOp, Endianness, Inst, InstructionData,
MemFlags, Opcode, StackSlot, TrapCode, Value, ValueList,
MemFlags, Opcode, TrapCode, Value, ValueList,
},
isa::unwind::UnwindInst,
machinst::{InsnOutput, LowerCtx, VCodeConstant, VCodeConstantData},
@@ -77,7 +77,7 @@ where
}
fn abi_accumulate_outgoing_args_size(&mut self, abi: &ABISig) -> Unit {
let off = abi.stack_arg_space() + abi.stack_ret_space();
let off = abi.sized_stack_arg_space() + abi.sized_stack_ret_space();
self.lower_ctx
.abi()
.accumulate_outgoing_args_size(off as u32);
@@ -531,17 +531,6 @@ where
}
}
#[inline]
fn abi_stackslot_addr(
&mut self,
dst: WritableReg,
stack_slot: StackSlot,
offset: Offset32,
) -> MInst {
let offset = u32::try_from(i32::from(offset)).unwrap();
self.lower_ctx.abi().stackslot_addr(stack_slot, offset, dst)
}
#[inline]
fn inst_builder_new(&mut self) -> VecMInstBuilder {
Cell::new(Vec::<MInst>::new())

View File

@@ -1,7 +1,7 @@
//! IBM Z 64-bit Instruction Set Architecture.
use crate::ir::condcodes::IntCC;
use crate::ir::Function;
use crate::ir::{Function, Type};
use crate::isa::s390x::settings as s390x_settings;
#[cfg(feature = "unwind")]
use crate::isa::unwind::systemv::RegisterMappingError;
@@ -58,7 +58,7 @@ impl S390xBackend {
flags: shared_settings::Flags,
) -> CodegenResult<(VCode<inst::Inst>, regalloc2::Output)> {
let emit_info = EmitInfo::new(flags.clone(), self.isa_flags.clone());
let abi = Box::new(abi::S390xABICallee::new(func, flags, self.isa_flags())?);
let abi = Box::new(abi::S390xABICallee::new(func, self)?);
compile::compile::<S390xBackend>(func, self, abi, &self.machine_env, emit_info)
}
}
@@ -77,7 +77,8 @@ impl TargetIsa for S390xBackend {
let frame_size = emit_result.frame_size;
let value_labels_ranges = emit_result.value_labels_ranges;
let buffer = emit_result.buffer.finish();
let stackslot_offsets = emit_result.stackslot_offsets;
let sized_stackslot_offsets = emit_result.sized_stackslot_offsets;
let dynamic_stackslot_offsets = emit_result.dynamic_stackslot_offsets;
if let Some(disasm) = emit_result.disasm.as_ref() {
log::debug!("disassembly:\n{}", disasm);
@@ -88,7 +89,8 @@ impl TargetIsa for S390xBackend {
frame_size,
disasm: emit_result.disasm,
value_labels_ranges,
stackslot_offsets,
sized_stackslot_offsets,
dynamic_stackslot_offsets,
bb_starts: emit_result.bb_offsets,
bb_edges: emit_result.bb_edges,
})
@@ -110,6 +112,10 @@ impl TargetIsa for S390xBackend {
self.isa_flags.iter().collect()
}
fn dynamic_vector_bytes(&self, _dyn_ty: Type) -> u32 {
16
}
fn unsigned_add_overflow_condition(&self) -> IntCC {
// The ADD LOGICAL family of instructions set the condition code
// differently from normal comparisons, in a way that cannot be

View File

@@ -1,7 +1,7 @@
//! Implementation of the standard x64 ABI.
use crate::ir::types::*;
use crate::ir::{self, types, ExternalName, LibCall, MemFlags, Opcode, TrapCode, Type};
use crate::ir::{self, types, ExternalName, LibCall, MemFlags, Opcode, Signature, TrapCode, Type};
use crate::isa;
use crate::isa::{unwind::UnwindInst, x64::inst::*, CallConv};
use crate::machinst::abi_impl::*;
@@ -573,6 +573,7 @@ impl ABIMachineSpec for X64ABIMachineSpec {
fn gen_clobber_restore(
call_conv: isa::CallConv,
sig: &Signature,
flags: &settings::Flags,
clobbers: &[Writable<RealReg>],
fixed_frame_storage_size: u32,
@@ -580,7 +581,8 @@ impl ABIMachineSpec for X64ABIMachineSpec {
) -> SmallVec<[Self::I; 16]> {
let mut insts = SmallVec::new();
let clobbered_callee_saves = Self::get_clobbered_callee_saves(call_conv, flags, clobbers);
let clobbered_callee_saves =
Self::get_clobbered_callee_saves(call_conv, flags, sig, clobbers);
let stack_size = fixed_frame_storage_size + compute_clobber_size(&clobbered_callee_saves);
// Restore regs by loading from offsets of RSP. RSP will be
@@ -722,11 +724,11 @@ impl ABIMachineSpec for X64ABIMachineSpec {
insts
}
fn get_number_of_spillslots_for_value(rc: RegClass) -> u32 {
fn get_number_of_spillslots_for_value(rc: RegClass, vector_scale: u32) -> u32 {
// We allocate in terms of 8-byte slots.
match rc {
RegClass::Int => 1,
RegClass::Float => 2,
RegClass::Float => vector_scale / 8,
}
}
@@ -771,6 +773,7 @@ impl ABIMachineSpec for X64ABIMachineSpec {
fn get_clobbered_callee_saves(
call_conv: CallConv,
flags: &settings::Flags,
_sig: &Signature,
regs: &[Writable<RealReg>],
) -> Vec<Writable<RealReg>> {
let mut regs: Vec<Writable<RealReg>> = match call_conv {
@@ -805,7 +808,7 @@ impl ABIMachineSpec for X64ABIMachineSpec {
_is_leaf: bool,
_stack_args_size: u32,
_num_clobbered_callee_saves: usize,
_fixed_frame_storage_size: u32,
_frame_storage_size: u32,
) -> bool {
true
}

View File

@@ -144,7 +144,7 @@ mod tests {
pos.ins().return_(&[]);
if let Some(stack_slot) = stack_slot {
func.stack_slots.push(stack_slot);
func.sized_stack_slots.push(stack_slot);
}
func

View File

@@ -2169,6 +2169,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
});
}
Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),
Opcode::StackAddr => {
let (stack_slot, offset) = match *ctx.data(insn) {
InstructionData::StackLoad {
@@ -2180,9 +2182,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
};
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let offset: i32 = offset.into();
let inst = ctx
.abi()
.stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), dst);
let inst =
ctx.abi()
.sized_stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), dst);
ctx.emit(inst);
}
@@ -2908,7 +2910,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// Unimplemented opcodes below. These are not currently used by Wasm
// lowering or other known embeddings, but should be either supported or
// removed eventually.
// removed eventually
Opcode::ExtractVector => {
unimplemented!("ExtractVector not supported");
}
Opcode::Cls => unimplemented!("Cls not supported"),
Opcode::Fma => unimplemented!("Fma not supported"),
@@ -2965,7 +2971,10 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
panic!("ALU+imm and ALU+carry ops should not appear here!");
}
Opcode::StackLoad | Opcode::StackStore => {
Opcode::StackLoad
| Opcode::StackStore
| Opcode::DynamicStackStore
| Opcode::DynamicStackLoad => {
panic!("Direct stack memory access not supported; should have been legalized");
}

View File

@@ -3,7 +3,7 @@
use self::inst::EmitInfo;
use super::TargetIsa;
use crate::ir::{condcodes::IntCC, Function};
use crate::ir::{condcodes::IntCC, Function, Type};
#[cfg(feature = "unwind")]
use crate::isa::unwind::systemv;
use crate::isa::x64::{inst::regs::create_reg_env_systemv, settings as x64_settings};
@@ -53,7 +53,7 @@ impl X64Backend {
// This performs lowering to VCode, register-allocates the code, computes
// block layout and finalizes branches. The result is ready for binary emission.
let emit_info = EmitInfo::new(flags.clone(), self.x64_flags.clone());
let abi = Box::new(abi::X64ABICallee::new(&func, flags, self.isa_flags())?);
let abi = Box::new(abi::X64ABICallee::new(&func, self)?);
compile::compile::<Self>(&func, self, abi, &self.reg_env, emit_info)
}
}
@@ -72,7 +72,8 @@ impl TargetIsa for X64Backend {
let frame_size = emit_result.frame_size;
let value_labels_ranges = emit_result.value_labels_ranges;
let buffer = emit_result.buffer.finish();
let stackslot_offsets = emit_result.stackslot_offsets;
let sized_stackslot_offsets = emit_result.sized_stackslot_offsets;
let dynamic_stackslot_offsets = emit_result.dynamic_stackslot_offsets;
if let Some(disasm) = emit_result.disasm.as_ref() {
log::debug!("disassembly:\n{}", disasm);
@@ -83,7 +84,8 @@ impl TargetIsa for X64Backend {
frame_size,
disasm: emit_result.disasm,
value_labels_ranges,
stackslot_offsets,
sized_stackslot_offsets,
dynamic_stackslot_offsets,
bb_starts: emit_result.bb_offsets,
bb_edges: emit_result.bb_edges,
})
@@ -97,6 +99,10 @@ impl TargetIsa for X64Backend {
self.x64_flags.iter().collect()
}
fn dynamic_vector_bytes(&self, _dyn_ty: Type) -> u32 {
16
}
fn name(&self) -> &'static str {
"x64"
}

View File

@@ -28,9 +28,23 @@ pub fn expand_global_value(
readonly,
} => load_addr(inst, func, base, offset, global_type, readonly, isa),
ir::GlobalValueData::Symbol { tls, .. } => symbol(inst, func, global_value, isa, tls),
ir::GlobalValueData::DynScaleTargetConst { vector_type } => {
const_vector_scale(inst, func, vector_type, isa)
}
}
}
fn const_vector_scale(inst: ir::Inst, func: &mut ir::Function, ty: ir::Type, isa: &dyn TargetIsa) {
assert!(ty.bytes() <= 16);
// Use a minimum of 128 bits for the base type.
let base_bytes = std::cmp::max(ty.bytes(), 16);
let scale = (isa.dynamic_vector_bytes(ty) / base_bytes) as i64;
assert!(scale > 0);
let pos = FuncCursor::new(func).at_inst(inst);
pos.func.dfg.replace(inst).iconst(isa.pointer_type(), scale);
}
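Concretely: the assert guarantees ty.bytes() <= 16, so base_bytes is always 16 today, and a backend reporting 16 dynamic vector bytes (as the s390x and x64 backends in this patch do) lowers the scale to the constant 1. A standalone sketch of the same arithmetic, with a hypothetical 512-bit implementation for contrast:

fn scale_for(dynamic_vector_bytes: u32, base_ty_bytes: u32) -> i64 {
    assert!(base_ty_bytes <= 16);
    // Use a minimum of 128 bits (16 bytes) for the base type, as above.
    let base_bytes = std::cmp::max(base_ty_bytes, 16);
    (dynamic_vector_bytes / base_bytes) as i64
}

fn main() {
    assert_eq!(scale_for(16, 16), 1); // one 128-bit register per dynamic vector
    assert_eq!(scale_for(64, 16), 4); // hypothetical 512-bit implementation
}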
/// Expand a `global_value` instruction for a vmctx global.
fn vmctx_addr(inst: ir::Inst, func: &mut ir::Function) {
// Get the value representing the `vmctx` argument.

View File

@@ -115,6 +115,41 @@ pub fn simple_legalize(func: &mut ir::Function, cfg: &mut ControlFlowGraph, isa:
mflags.set_aligned();
pos.func.dfg.replace(inst).store(mflags, arg, addr, 0);
}
InstructionData::DynamicStackLoad {
opcode: ir::Opcode::DynamicStackLoad,
dynamic_stack_slot,
} => {
let ty = pos.func.dfg.value_type(pos.func.dfg.first_result(inst));
assert!(ty.is_dynamic_vector());
let addr_ty = isa.pointer_type();
let mut pos = FuncCursor::new(pos.func).at_inst(inst);
pos.use_srcloc(inst);
let addr = pos.ins().dynamic_stack_addr(addr_ty, dynamic_stack_slot);
// Stack slots are required to be accessible and aligned.
let mflags = MemFlags::trusted();
pos.func.dfg.replace(inst).load(ty, mflags, addr, 0);
}
InstructionData::DynamicStackStore {
opcode: ir::Opcode::DynamicStackStore,
arg,
dynamic_stack_slot,
} => {
pos.use_srcloc(inst);
let addr_ty = isa.pointer_type();
let vector_ty = pos.func.dfg.value_type(arg);
assert!(vector_ty.is_dynamic_vector());
let addr = pos.ins().dynamic_stack_addr(addr_ty, dynamic_stack_slot);
let mut mflags = MemFlags::new();
// Stack slots are required to be accessible and aligned.
mflags.set_notrap();
mflags.set_aligned();
pos.func.dfg.replace(inst).store(mflags, arg, addr, 0);
}
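Both expansions mark the resulting access as non-trapping and aligned, since stack slots are always accessible; the load arm spells this MemFlags::trusted() while the store arm sets the two flags by hand. A small check that the two spellings agree:

use cranelift_codegen::ir::MemFlags;

fn main() {
    // trusted() is notrap + aligned, exactly what the store expansion builds.
    let mut mflags = MemFlags::new();
    mflags.set_notrap();
    mflags.set_aligned();
    assert_eq!(mflags, MemFlags::trusted());
}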
InstructionData::TableAddr {
opcode: ir::Opcode::TableAddr,
table,

View File

@@ -1,7 +1,7 @@
//! ABI definitions.
use crate::binemit::StackMap;
use crate::ir::{Signature, StackSlot};
use crate::ir::{DynamicStackSlot, Signature, StackSlot};
use crate::isa::CallConv;
use crate::machinst::*;
use crate::settings;
@@ -47,11 +47,17 @@ pub trait ABICallee {
/// Number of return values.
fn num_retvals(&self) -> usize;
/// Number of stack slots (not spill slots).
fn num_stackslots(&self) -> usize;
/// Number of sized stack slots (not spill slots).
fn num_sized_stackslots(&self) -> usize;
/// The offsets of all stack slots (not spill slots) for debuginfo purposes.
fn stackslot_offsets(&self) -> &PrimaryMap<StackSlot, u32>;
/// The offsets of all sized stack slots (not spill slots) for debuginfo purposes.
fn sized_stackslot_offsets(&self) -> &PrimaryMap<StackSlot, u32>;
/// The offsets of all dynamic stack slots (not spill slots) for debuginfo purposes.
fn dynamic_stackslot_offsets(&self) -> &PrimaryMap<DynamicStackSlot, u32>;
/// The size, in bytes, of the given dynamic type.
fn dynamic_type_size(&self, ty: Type) -> u32;
/// Generate an instruction which copies an argument to a destination
/// register.
@@ -101,8 +107,16 @@ pub trait ABICallee {
/// Update with the clobbered registers, post-regalloc.
fn set_clobbered(&mut self, clobbered: Vec<Writable<RealReg>>);
/// Get the address of a stackslot.
fn stackslot_addr(&self, slot: StackSlot, offset: u32, into_reg: Writable<Reg>) -> Self::I;
/// Get the address of a sized stackslot.
fn sized_stackslot_addr(
&self,
slot: StackSlot,
offset: u32,
into_reg: Writable<Reg>,
) -> Self::I;
/// Get the address of a dynamic stackslot.
fn dynamic_stackslot_addr(&self, slot: DynamicStackSlot, into_reg: Writable<Reg>) -> Self::I;
/// Load from a spillslot.
fn load_spillslot(

View File

@@ -126,7 +126,8 @@
use super::abi::*;
use crate::binemit::StackMap;
use crate::ir::types::*;
use crate::ir::{ArgumentExtension, ArgumentPurpose, StackSlot};
use crate::ir::{ArgumentExtension, ArgumentPurpose, DynamicStackSlot, Signature, StackSlot};
use crate::isa::TargetIsa;
use crate::machinst::*;
use crate::settings;
use crate::CodegenResult;
@@ -138,6 +139,8 @@ use std::convert::TryFrom;
use std::marker::PhantomData;
use std::mem;
use std::collections::HashMap;
/// A location for (part of) an argument or return value. These "storage slots"
/// are specified for each register-sized part of an argument.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -430,6 +433,7 @@ pub trait ABIMachineSpec {
fn get_clobbered_callee_saves(
call_conv: isa::CallConv,
flags: &settings::Flags,
sig: &Signature,
regs: &[Writable<RealReg>],
) -> Vec<Writable<RealReg>>;
@@ -465,6 +469,7 @@ pub trait ABIMachineSpec {
/// clobber-save sequence finished.
fn gen_clobber_restore(
call_conv: isa::CallConv,
sig: &Signature,
flags: &settings::Flags,
clobbers: &[Writable<RealReg>],
fixed_frame_storage_size: u32,
@@ -495,7 +500,7 @@ pub trait ABIMachineSpec {
) -> SmallVec<[Self::I; 8]>;
/// Get the number of spillslots required for the given register-class.
fn get_number_of_spillslots_for_value(rc: RegClass) -> u32;
fn get_number_of_spillslots_for_value(rc: RegClass, target_vector_bytes: u32) -> u32;
/// Get the current virtual-SP offset from an instruction-emission state.
fn get_virtual_sp_offset_from_state(s: &<Self::I as MachInstEmit>::State) -> i64;
@@ -528,9 +533,9 @@ pub struct ABISig {
/// pointer.
rets: Vec<ABIArg>,
/// Space on stack used to store arguments.
stack_arg_space: i64,
sized_stack_arg_space: i64,
/// Space on stack used to store return values.
stack_ret_space: i64,
sized_stack_ret_space: i64,
/// Index in `args` of the stack-return-value-area argument.
stack_ret_arg: Option<usize>,
/// Specific order for copying into arguments at callsites. We must be
@@ -550,15 +555,15 @@ impl ABISig {
// Compute args and retvals from signature. Handle retvals first,
// because we may need to add a return-area arg to the args.
let (rets, stack_ret_space, _) = M::compute_arg_locs(
let (rets, sized_stack_ret_space, _) = M::compute_arg_locs(
sig.call_conv,
flags,
&sig.returns,
ArgsOrRets::Rets,
/* extra ret-area ptr = */ false,
)?;
let need_stack_return_area = stack_ret_space > 0;
let (args, stack_arg_space, stack_ret_arg) = M::compute_arg_locs(
let need_stack_return_area = sized_stack_ret_space > 0;
let (args, sized_stack_arg_space, stack_ret_arg) = M::compute_arg_locs(
sig.call_conv,
flags,
&sig.params,
@@ -586,8 +591,8 @@ impl ABISig {
sig,
args,
rets,
stack_arg_space,
stack_ret_space,
sized_stack_arg_space,
sized_stack_ret_space,
stack_ret_arg,
copy_to_arg_order,
);
@@ -595,8 +600,8 @@ impl ABISig {
Ok(ABISig {
args,
rets,
stack_arg_space,
stack_ret_space,
sized_stack_arg_space,
sized_stack_ret_space,
stack_ret_arg,
copy_to_arg_order,
call_conv: sig.call_conv,
@@ -666,8 +671,8 @@ impl ABISig {
}
/// Get total stack space required for arguments.
pub fn stack_arg_space(&self) -> i64 {
self.stack_arg_space
pub fn sized_stack_arg_space(&self) -> i64 {
self.sized_stack_arg_space
}
/// Get the number of return values expected.
@@ -681,8 +686,8 @@ impl ABISig {
}
/// Get total stack space required for return values.
pub fn stack_ret_space(&self) -> i64 {
self.stack_ret_space
pub fn sized_stack_ret_space(&self) -> i64 {
self.sized_stack_ret_space
}
/// Get information specifying how to pass the implicit pointer
@@ -699,15 +704,19 @@ pub struct ABICalleeImpl<M: ABIMachineSpec> {
ir_sig: ir::Signature,
/// Signature: arg and retval regs.
sig: ABISig,
/// Offsets to each stackslot.
stackslots: PrimaryMap<StackSlot, u32>,
/// Total stack size of all stackslots.
/// Sizes of the defined dynamic types.
dynamic_type_sizes: HashMap<Type, u32>,
/// Offsets to each dynamic stackslot.
dynamic_stackslots: PrimaryMap<DynamicStackSlot, u32>,
/// Offsets to each sized stackslot.
sized_stackslots: PrimaryMap<StackSlot, u32>,
/// Total stack size of all stackslots, both sized and dynamic.
stackslots_size: u32,
/// Stack size to be reserved for outgoing arguments.
outgoing_args_size: u32,
/// Clobbered registers, from regalloc.
clobbered: Vec<Writable<RealReg>>,
/// Total number of spillslots, from regalloc.
/// Total number of spillslots, including those for 'dynamic' types, from regalloc.
spillslots: Option<usize>,
/// Storage allocated for the fixed part of the stack frame. This is
/// usually the same as the total frame size below, except in the case
@@ -766,13 +775,10 @@ fn get_special_purpose_param_register(
impl<M: ABIMachineSpec> ABICalleeImpl<M> {
/// Create a new body ABI instance.
pub fn new(
f: &ir::Function,
flags: settings::Flags,
isa_flags: Vec<settings::Value>,
) -> CodegenResult<Self> {
pub fn new(f: &ir::Function, isa: &dyn TargetIsa) -> CodegenResult<Self> {
log::trace!("ABI: func signature {:?}", f.signature);
let flags = isa.flags().clone();
let ir_sig = ensure_struct_return_ptr_is_returned(&f.signature);
let sig = ABISig::from_func_sig::<M>(&ir_sig, &flags)?;
@@ -791,16 +797,41 @@ impl<M: ABIMachineSpec> ABICalleeImpl<M> {
call_conv
);
// Compute stackslot locations and total stackslot size.
let mut stack_offset: u32 = 0;
let mut stackslots = PrimaryMap::new();
for (stackslot, data) in f.stack_slots.iter() {
let off = stack_offset;
stack_offset += data.size;
// Compute sized stackslot locations and total stackslot size.
let mut sized_stack_offset: u32 = 0;
let mut sized_stackslots = PrimaryMap::new();
for (stackslot, data) in f.sized_stack_slots.iter() {
let off = sized_stack_offset;
sized_stack_offset += data.size;
let mask = M::word_bytes() - 1;
stack_offset = (stack_offset + mask) & !mask;
debug_assert_eq!(stackslot.as_u32() as usize, stackslots.len());
stackslots.push(off);
sized_stack_offset = (sized_stack_offset + mask) & !mask;
debug_assert_eq!(stackslot.as_u32() as usize, sized_stackslots.len());
sized_stackslots.push(off);
}
// Compute dynamic stackslot locations and total stackslot size.
let mut dynamic_stackslots = PrimaryMap::new();
let mut dynamic_stack_offset: u32 = sized_stack_offset;
for (stackslot, data) in f.dynamic_stack_slots.iter() {
debug_assert_eq!(stackslot.as_u32() as usize, dynamic_stackslots.len());
let off = dynamic_stack_offset;
let ty = f
.get_concrete_dynamic_ty(data.dyn_ty)
.unwrap_or_else(|| panic!("invalid dynamic vector type: {}", data.dyn_ty));
dynamic_stack_offset += isa.dynamic_vector_bytes(ty);
let mask = M::word_bytes() - 1;
dynamic_stack_offset = (dynamic_stack_offset + mask) & !mask;
dynamic_stackslots.push(off);
}
let stackslots_size = dynamic_stack_offset;
let mut dynamic_type_sizes = HashMap::with_capacity(f.dfg.dynamic_types.len());
for (dyn_ty, _data) in f.dfg.dynamic_types.iter() {
let ty = f
.get_concrete_dynamic_ty(dyn_ty)
.unwrap_or_else(|| panic!("invalid dynamic vector type: {}", dyn_ty));
let size = isa.dynamic_vector_bytes(ty);
dynamic_type_sizes.insert(ty, size);
}
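The frame layout this computes packs the sized slots first and appends the dynamic slots after them, rounding every offset up to the word size; stackslots_size then covers both regions. A minimal sketch of that layout, assuming an 8-byte word:

fn layout(sized: &[u32], dynamic: &[u32], word_bytes: u32) -> (Vec<u32>, Vec<u32>, u32) {
    let mask = word_bytes - 1;
    let mut off = 0u32;
    let mut sized_offs = Vec::new();
    for size in sized {
        sized_offs.push(off);
        off = (off + size + mask) & !mask; // word-align after each slot
    }
    let mut dyn_offs = Vec::new();
    for size in dynamic {
        dyn_offs.push(off);
        off = (off + size + mask) & !mask;
    }
    (sized_offs, dyn_offs, off) // off is the total stackslots_size
}

fn main() {
    // Two sized slots (4 and 8 bytes) followed by one 16-byte dynamic slot.
    let (sized, dynamic, total) = layout(&[4, 8], &[16], 8);
    assert_eq!(sized, vec![0, 8]);
    assert_eq!(dynamic, vec![16]);
    assert_eq!(total, 32);
}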
// Figure out what instructions, if any, will be needed to check the
@@ -827,8 +858,10 @@ impl<M: ABIMachineSpec> ABICalleeImpl<M> {
Ok(Self {
ir_sig,
sig,
stackslots,
stackslots_size: stack_offset,
dynamic_stackslots,
dynamic_type_sizes,
sized_stackslots,
stackslots_size,
outgoing_args_size: 0,
clobbered: vec![],
spillslots: None,
@@ -837,7 +870,7 @@ impl<M: ABIMachineSpec> ABICalleeImpl<M> {
ret_area_ptr: None,
call_conv,
flags,
isa_flags,
isa_flags: isa.isa_flags(),
is_leaf: f.is_leaf(),
stack_limit,
probestack_min_frame,
@@ -1060,12 +1093,16 @@ impl<M: ABIMachineSpec> ABICallee for ABICalleeImpl<M> {
self.sig.rets.len()
}
fn num_stackslots(&self) -> usize {
self.stackslots.len()
fn num_sized_stackslots(&self) -> usize {
self.sized_stackslots.len()
}
fn stackslot_offsets(&self) -> &PrimaryMap<StackSlot, u32> {
&self.stackslots
fn sized_stackslot_offsets(&self) -> &PrimaryMap<StackSlot, u32> {
&self.sized_stackslots
}
fn dynamic_stackslot_offsets(&self) -> &PrimaryMap<DynamicStackSlot, u32> {
&self.dynamic_stackslots
}
fn gen_copy_arg_to_regs(
@@ -1256,15 +1293,34 @@ impl<M: ABIMachineSpec> ABICallee for ABICalleeImpl<M> {
self.clobbered = clobbered;
}
/// Produce an instruction that computes a stackslot address.
fn stackslot_addr(&self, slot: StackSlot, offset: u32, into_reg: Writable<Reg>) -> Self::I {
/// Produce an instruction that computes a sized stackslot address.
fn sized_stackslot_addr(
&self,
slot: StackSlot,
offset: u32,
into_reg: Writable<Reg>,
) -> Self::I {
// Offset from beginning of stackslot area, which is at nominal SP (see
// [StackAMode::NominalSPOffset] for more details on nominal SP tracking).
let stack_off = self.stackslots[slot] as i64;
let stack_off = self.sized_stackslots[slot] as i64;
let sp_off: i64 = stack_off + (offset as i64);
M::gen_get_stack_addr(StackAMode::NominalSPOffset(sp_off, I8), into_reg, I8)
}
/// Produce an instruction that computes a dynamic stackslot address.
fn dynamic_stackslot_addr(&self, slot: DynamicStackSlot, into_reg: Writable<Reg>) -> Self::I {
let stack_off = self.dynamic_stackslots[slot] as i64;
M::gen_get_stack_addr(
StackAMode::NominalSPOffset(stack_off, I64X2XN),
into_reg,
I64X2XN,
)
}
fn dynamic_type_size(&self, ty: Type) -> u32 {
self.dynamic_type_sizes[&ty]
}
/// Load from a spillslot.
fn load_spillslot(
&self,
@@ -1339,8 +1395,12 @@ impl<M: ABIMachineSpec> ABICallee for ABICalleeImpl<M> {
}
let mask = M::stack_align(self.call_conv) - 1;
let total_stacksize = (total_stacksize + mask) & !mask; // 16-align the stack.
let clobbered_callee_saves =
M::get_clobbered_callee_saves(self.call_conv, &self.flags, &self.clobbered);
let clobbered_callee_saves = M::get_clobbered_callee_saves(
self.call_conv,
&self.flags,
self.signature(),
&self.clobbered,
);
let mut insts = smallvec![];
if !self.call_conv.extends_baldrdash() {
@@ -1408,6 +1468,7 @@ impl<M: ABIMachineSpec> ABICallee for ABICalleeImpl<M> {
// Restore clobbered registers.
insts.extend(M::gen_clobber_restore(
self.call_conv,
self.signature(),
&self.flags,
&self.clobbered,
self.fixed_frame_storage_size,
@@ -1441,11 +1502,21 @@ impl<M: ABIMachineSpec> ABICallee for ABICalleeImpl<M> {
}
fn stack_args_size(&self) -> u32 {
self.sig.stack_arg_space as u32
self.sig.sized_stack_arg_space as u32
}
fn get_spillslot_size(&self, rc: RegClass) -> u32 {
M::get_number_of_spillslots_for_value(rc)
let max = if self.dynamic_type_sizes.is_empty() {
16
} else {
*self.dynamic_type_sizes.values().max().unwrap()
};
M::get_number_of_spillslots_for_value(rc, max)
}
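Since regalloc may spill any value into any slot, every spillslot is sized for the widest dynamic type defined in the function, falling back to 16 bytes when there are none. A sketch of that choice (string keys stand in for real Type values):

use std::collections::HashMap;

fn max_dynamic_bytes(dynamic_type_sizes: &HashMap<&str, u32>) -> u32 {
    dynamic_type_sizes.values().copied().max().unwrap_or(16)
}

fn main() {
    let mut sizes = HashMap::new();
    assert_eq!(max_dynamic_bytes(&sizes), 16); // no dynamic types: 128-bit default
    sizes.insert("i32x4xN", 16);
    sizes.insert("i64x2xN", 32); // hypothetical 256-bit implementation
    assert_eq!(max_dynamic_bytes(&sizes), 32);
}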
fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg) -> Self::I {
@@ -1586,17 +1657,17 @@ impl<M: ABIMachineSpec> ABICaller for ABICallerImpl<M> {
}
fn accumulate_outgoing_args_size<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C) {
let off = self.sig.stack_arg_space + self.sig.stack_ret_space;
let off = self.sig.sized_stack_arg_space + self.sig.sized_stack_ret_space;
ctx.abi().accumulate_outgoing_args_size(off as u32);
}
fn emit_stack_pre_adjust<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C) {
let off = self.sig.stack_arg_space + self.sig.stack_ret_space;
let off = self.sig.sized_stack_arg_space + self.sig.sized_stack_ret_space;
adjust_stack_and_nominal_sp::<M, C>(ctx, off as i32, /* is_sub = */ true)
}
fn emit_stack_post_adjust<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C) {
let off = self.sig.stack_arg_space + self.sig.stack_ret_space;
let off = self.sig.sized_stack_arg_space + self.sig.sized_stack_ret_space;
adjust_stack_and_nominal_sp::<M, C>(ctx, off as i32, /* is_sub = */ false)
}
@@ -1720,7 +1791,7 @@ impl<M: ABIMachineSpec> ABICaller for ABICallerImpl<M> {
ctx.emit(M::gen_move(*into_reg, Reg::from(reg), ty));
}
&ABIArgSlot::Stack { offset, ty, .. } => {
let ret_area_base = self.sig.stack_arg_space;
let ret_area_base = self.sig.sized_stack_arg_space;
ctx.emit(M::gen_load_stack(
StackAMode::SPOffset(offset + ret_area_base, ty),
*into_reg,
@@ -1744,7 +1815,7 @@ impl<M: ABIMachineSpec> ABICaller for ABICallerImpl<M> {
let word_type = M::word_type();
if let Some(i) = self.sig.stack_ret_arg {
let rd = ctx.alloc_tmp(word_type).only_reg().unwrap();
let ret_area_base = self.sig.stack_arg_space;
let ret_area_base = self.sig.sized_stack_arg_space;
ctx.emit(M::gen_get_stack_addr(
StackAMode::SPOffset(ret_area_base, I8),
rd,

View File

@@ -6,7 +6,9 @@ use smallvec::SmallVec;
use std::cell::Cell;
pub use super::MachLabel;
pub use crate::ir::{ArgumentExtension, ExternalName, FuncRef, GlobalValue, SigRef};
pub use crate::ir::{
ArgumentExtension, DynamicStackSlot, ExternalName, FuncRef, GlobalValue, SigRef, StackSlot,
};
pub use crate::isa::unwind::UnwindInst;
pub use crate::machinst::{ABIArg, ABIArgSlot, ABISig, RealReg, Reg, RelocDistance, Writable};
@@ -243,7 +245,18 @@ macro_rules! isle_prelude_methods {
#[inline]
fn fits_in_32(&mut self, ty: Type) -> Option<Type> {
if ty.bits() <= 32 {
if ty.bits() <= 32 && !ty.is_dynamic_vector() {
Some(ty)
} else {
None
}
}
#[inline]
fn lane_fits_in_32(&mut self, ty: Type) -> Option<Type> {
if !ty.is_vector() && !ty.is_dynamic_vector() {
None
} else if ty.lane_type().bits() <= 32 {
Some(ty)
} else {
None
@@ -252,7 +265,7 @@ macro_rules! isle_prelude_methods {
#[inline]
fn fits_in_64(&mut self, ty: Type) -> Option<Type> {
if ty.bits() <= 64 {
if ty.bits() <= 64 && !ty.is_dynamic_vector() {
Some(ty)
} else {
None
@@ -418,6 +431,36 @@ macro_rules! isle_prelude_methods {
}
}
#[inline]
fn dynamic_lane(&mut self, ty: Type) -> Option<(u32, u32)> {
if ty.is_dynamic_vector() {
Some((ty.lane_bits(), ty.min_lane_count()))
} else {
None
}
}
#[inline]
fn dynamic_int_lane(&mut self, ty: Type) -> Option<u32> {
if ty.is_dynamic_vector() && crate::machinst::ty_has_int_representation(ty.lane_type())
{
Some(ty.lane_bits())
} else {
None
}
}
#[inline]
fn dynamic_fp_lane(&mut self, ty: Type) -> Option<u32> {
if ty.is_dynamic_vector()
&& crate::machinst::ty_has_float_or_vec_representation(ty.lane_type())
{
Some(ty.lane_bits())
} else {
None
}
}
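These helpers back the dynamic_lane/dynamic_int_lane/dynamic_fp_lane extractors declared further down in the ISLE prelude. A quick check of what they would yield for one of the hard-coded dynamic types, relying on the lane_bits and min_lane_count accessors used above:

use cranelift_codegen::ir::types::I32X4XN;

fn main() {
    // I32X4XN is the dynamic counterpart of i32x4: 32-bit integer lanes with
    // a minimum lane count of 4, so dynamic_lane would extract (32, 4) and
    // dynamic_int_lane would extract 32.
    assert!(I32X4XN.is_dynamic_vector());
    assert_eq!(I32X4XN.lane_bits(), 32);
    assert_eq!(I32X4XN.min_lane_count(), 4);
}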
#[inline]
fn def_inst(&mut self, val: Value) -> Option<Inst> {
self.lower_ctx.dfg().value_def(val).inst()
@@ -635,12 +678,12 @@ macro_rules! isle_prelude_methods {
}
}
fn abi_stack_arg_space(&mut self, abi: &ABISig) -> i64 {
abi.stack_arg_space()
fn abi_sized_stack_arg_space(&mut self, abi: &ABISig) -> i64 {
abi.sized_stack_arg_space()
}
fn abi_stack_ret_space(&mut self, abi: &ABISig) -> i64 {
abi.stack_ret_space()
fn abi_sized_stack_ret_space(&mut self, abi: &ABISig) -> i64 {
abi.sized_stack_ret_space()
}
fn abi_arg_only_slot(&mut self, arg: &ABIArg) -> Option<ABIArgSlot> {
@@ -656,6 +699,31 @@ macro_rules! isle_prelude_methods {
}
}
fn abi_stackslot_addr(
&mut self,
dst: WritableReg,
stack_slot: StackSlot,
offset: Offset32,
) -> MInst {
let offset = u32::try_from(i32::from(offset)).unwrap();
self.lower_ctx
.abi()
.sized_stackslot_addr(stack_slot, offset, dst)
}
fn abi_dynamic_stackslot_addr(
&mut self,
dst: WritableReg,
stack_slot: DynamicStackSlot,
) -> MInst {
assert!(self
.lower_ctx
.abi()
.dynamic_stackslot_offsets()
.is_valid(stack_slot));
self.lower_ctx.abi().dynamic_stackslot_addr(stack_slot, dst)
}
fn real_reg_to_reg(&mut self, reg: RealReg) -> Reg {
Reg::from(reg)
}

View File

@@ -45,7 +45,7 @@
//! ```
use crate::binemit::{Addend, CodeInfo, CodeOffset, Reloc, StackMap};
use crate::ir::{SourceLoc, StackSlot, Type};
use crate::ir::{DynamicStackSlot, SourceLoc, StackSlot, Type};
use crate::result::CodegenResult;
use crate::settings::Flags;
use crate::value_label::ValueLabelsRanges;
@@ -282,7 +282,9 @@ pub struct MachCompileResult {
/// Debug info: value labels to registers/stackslots at code offsets.
pub value_labels_ranges: ValueLabelsRanges,
/// Debug info: sized stackslots to stack pointer offsets.
pub stackslot_offsets: PrimaryMap<StackSlot, u32>,
pub sized_stackslot_offsets: PrimaryMap<StackSlot, u32>,
/// Debug info: dynamic stackslots to stack pointer offsets.
pub dynamic_stackslot_offsets: PrimaryMap<DynamicStackSlot, u32>,
/// Basic-block layout info: block start offsets.
///
/// This info is generated only if the `machine_code_cfg_info`

View File

@@ -19,7 +19,9 @@
use crate::fx::FxHashMap;
use crate::fx::FxHashSet;
use crate::ir::{self, types, Constant, ConstantData, LabelValueLoc, SourceLoc, ValueLabel};
use crate::ir::{
self, types, Constant, ConstantData, DynamicStackSlot, LabelValueLoc, SourceLoc, ValueLabel,
};
use crate::machinst::*;
use crate::timing;
use crate::ValueLocRange;
@@ -207,8 +209,11 @@ pub struct EmitResult<I: VCodeInst> {
/// epilogue(s), and makes use of the regalloc results.
pub disasm: Option<String>,
/// Offsets of stackslots.
pub stackslot_offsets: PrimaryMap<StackSlot, u32>,
/// Offsets of sized stackslots.
pub sized_stackslot_offsets: PrimaryMap<StackSlot, u32>,
/// Offsets of dynamic stackslots.
pub dynamic_stackslot_offsets: PrimaryMap<DynamicStackSlot, u32>,
/// Value-labels information (debug metadata).
pub value_labels_ranges: ValueLabelsRanges,
@@ -1038,7 +1043,8 @@ impl<I: VCodeInst> VCode<I> {
inst_offsets,
func_body_len,
disasm: if want_disasm { Some(disasm) } else { None },
stackslot_offsets: self.abi.stackslot_offsets().clone(),
sized_stackslot_offsets: self.abi.sized_stackslot_offsets().clone(),
dynamic_stackslot_offsets: self.abi.dynamic_stackslot_offsets().clone(),
value_labels_ranges,
frame_size,
}

View File

@@ -256,6 +256,8 @@
(extern const $F32X4 Type)
(extern const $F64X2 Type)
(extern const $I32X4XN Type)
;; Get the bit width of a given type.
(decl pure ty_bits (Type) u8)
(extern constructor ty_bits ty_bits)
@@ -290,6 +292,10 @@
(decl fits_in_32 (Type) Type)
(extern extractor fits_in_32 fits_in_32)
;; An extractor that only matches types whose lane type can fit in 32 bits.
(decl lane_fits_in_32 (Type) Type)
(extern extractor lane_fits_in_32 lane_fits_in_32)
;; An extractor that only matches types that can fit in 64 bits.
(decl fits_in_64 (Type) Type)
(extern extractor fits_in_64 fits_in_64)
@@ -433,6 +439,21 @@
(decl multi_lane (u32 u32) Type)
(extern extractor multi_lane multi_lane)
;; Match a dynamic-lane type, extracting (# bits per lane, minimum # of
;; lanes) from the given type.
(decl dynamic_lane (u32 u32) Type)
(extern extractor dynamic_lane dynamic_lane)
;; Match a dynamic-lane integer type, extracting (# bits per lane) from the given
;; type.
(decl dynamic_int_lane (u32) Type)
(extern extractor dynamic_int_lane dynamic_int_lane)
;; Match a dynamic-lane floating point type, extracting (# bits per lane)
;; from the given type.
(decl dynamic_fp_lane (u32) Type)
(extern extractor dynamic_fp_lane dynamic_fp_lane)
;; Match the instruction that defines the given value, if any.
(decl def_inst (Inst) Value)
(extern extractor def_inst def_inst)
@@ -727,12 +748,20 @@
(extern extractor abi_no_ret_arg abi_no_ret_arg)
;; Size of the argument area.
(decl abi_stack_arg_space (ABISig) i64)
(extern constructor abi_stack_arg_space abi_stack_arg_space)
(decl abi_sized_stack_arg_space (ABISig) i64)
(extern constructor abi_sized_stack_arg_space abi_sized_stack_arg_space)
;; Size of the return-value area.
(decl abi_stack_ret_space (ABISig) i64)
(extern constructor abi_stack_ret_space abi_stack_ret_space)
(decl abi_sized_stack_ret_space (ABISig) i64)
(extern constructor abi_sized_stack_ret_space abi_sized_stack_ret_space)
;; Compute the address of a sized StackSlot.
(decl abi_stackslot_addr (WritableReg StackSlot Offset32) MInst)
(extern constructor abi_stackslot_addr abi_stackslot_addr)
;; Compute the address of a DynamicStackSlot.
(decl abi_dynamic_stackslot_addr (WritableReg DynamicStackSlot) MInst)
(extern constructor abi_dynamic_stackslot_addr abi_dynamic_stackslot_addr)
;; Extractor to detect the special case where an argument or
;; return value only requires a single slot to be passed.

View File

@@ -65,8 +65,8 @@ use crate::ir;
use crate::ir::entities::AnyEntity;
use crate::ir::instructions::{BranchInfo, CallInfo, InstructionFormat, ResolvedConstraint};
use crate::ir::{
types, ArgumentPurpose, Block, Constant, FuncRef, Function, GlobalValue, Inst, JumpTable,
Opcode, SigRef, StackSlot, Type, Value, ValueDef, ValueList,
types, ArgumentPurpose, Block, Constant, DynamicStackSlot, FuncRef, Function, GlobalValue,
Inst, JumpTable, Opcode, SigRef, StackSlot, Type, Value, ValueDef, ValueList,
};
use crate::isa::TargetIsa;
use crate::iterators::IteratorExtras;
@@ -681,6 +681,14 @@ impl<'a> Verifier<'a> {
StackLoad { stack_slot, .. } | StackStore { stack_slot, .. } => {
self.verify_stack_slot(inst, stack_slot, errors)?;
}
DynamicStackLoad {
dynamic_stack_slot, ..
}
| DynamicStackStore {
dynamic_stack_slot, ..
} => {
self.verify_dynamic_stack_slot(inst, dynamic_stack_slot, errors)?;
}
UnaryGlobalValue { global_value, .. } => {
self.verify_global_value(inst, global_value, errors)?;
}
@@ -819,7 +827,7 @@ impl<'a> Verifier<'a> {
ss: StackSlot,
errors: &mut VerifierErrors,
) -> VerifierStepResult<()> {
if !self.func.stack_slots.is_valid(ss) {
if !self.func.sized_stack_slots.is_valid(ss) {
errors.nonfatal((
inst,
self.context(inst),
@@ -830,6 +838,23 @@ impl<'a> Verifier<'a> {
}
}
fn verify_dynamic_stack_slot(
&self,
inst: Inst,
ss: DynamicStackSlot,
errors: &mut VerifierErrors,
) -> VerifierStepResult<()> {
if !self.func.dynamic_stack_slots.is_valid(ss) {
errors.nonfatal((
inst,
self.context(inst),
format!("invalid dynamic stack slot {}", ss),
))
} else {
Ok(())
}
}
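The verifier check is plain entity-map validity: a DynamicStackSlot reference is well-formed only if the function's PrimaryMap allocated it. A self-contained sketch with cranelift-entity (the DynSlot key type is illustrative):

use cranelift_entity::{entity_impl, PrimaryMap};

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct DynSlot(u32);
entity_impl!(DynSlot);

fn main() {
    let mut slots: PrimaryMap<DynSlot, u32> = PrimaryMap::new();
    let dss0 = slots.push(0); // payload is a placeholder offset
    assert!(slots.is_valid(dss0));
}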
fn verify_global_value(
&self,
inst: Inst,

View File

@@ -41,7 +41,12 @@ pub trait FuncWriter {
fn super_preamble(&mut self, w: &mut dyn Write, func: &Function) -> Result<bool, fmt::Error> {
let mut any = false;
for (ss, slot) in func.stack_slots.iter() {
for (ss, slot) in func.dynamic_stack_slots.iter() {
any = true;
self.write_entity_definition(w, func, ss.into(), slot)?;
}
for (ss, slot) in func.sized_stack_slots.iter() {
any = true;
self.write_entity_definition(w, func, ss.into(), slot)?;
}
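Dynamic slots are written ahead of the sized ones, so a function using both lists them first in its preamble. A sketch of wiring one up, assuming the constructors this patch introduces elsewhere (DynamicTypeData::new, create_dynamic_stack_slot, and the ExplicitDynamicSlot kind); the printed syntax in the final comment is likewise an assumption:

use cranelift_codegen::ir::types::I32X4;
use cranelift_codegen::ir::{
    DynamicStackSlotData, DynamicTypeData, Function, GlobalValueData, StackSlotKind,
};

fn main() {
    let mut func = Function::new();
    // A target-defined scaling factor for i32x4, handled by the
    // DynScaleTargetConst legalization shown earlier.
    let scale = func.create_global_value(GlobalValueData::DynScaleTargetConst {
        vector_type: I32X4,
    });
    let dyn_ty = func.dfg.make_dynamic_ty(DynamicTypeData::new(I32X4, scale));
    func.create_dynamic_stack_slot(DynamicStackSlotData::new(
        StackSlotKind::ExplicitDynamicSlot,
        dyn_ty,
    ));
    // The preamble would then print something like:
    //   dss0 = explicit_dynamic_slot dt0
    println!("{}", func);
}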
@@ -493,6 +498,14 @@ pub fn write_operands(w: &mut dyn Write, dfg: &DataFlowGraph, inst: Inst) -> fmt
offset,
..
} => write!(w, " {}, {}{}", arg, stack_slot, offset),
DynamicStackLoad {
dynamic_stack_slot, ..
} => write!(w, " {}", dynamic_stack_slot),
DynamicStackStore {
arg,
dynamic_stack_slot,
..
} => write!(w, " {}, {}", arg, dynamic_stack_slot),
HeapAddr { heap, arg, imm, .. } => write!(w, " {}, {}, {}", heap, arg, imm),
TableAddr { table, arg, .. } => write!(w, " {}, {}", table, arg),
Load {
@@ -570,7 +583,7 @@ mod tests {
f.name = ExternalName::testcase("foo");
assert_eq!(f.to_string(), "function %foo() fast {\n}\n");
f.create_stack_slot(StackSlotData::new(StackSlotKind::ExplicitSlot, 4));
f.create_sized_stack_slot(StackSlotData::new(StackSlotKind::ExplicitSlot, 4));
assert_eq!(
f.to_string(),
"function %foo() fast {\n ss0 = explicit_slot 4\n}\n"