[RFC] Dynamic Vector Support (#4200)

Introduce a new concept in the IR that allows a producer to create
dynamic vector types. An IR function can now contain global values
that represent a dynamic scaling factor for a given fixed-width
vector type. A dynamic type is then created by 'multiplying' the
corresponding global value with a fixed-width type. These new types
can be used just like the existing types, and the type system has a
set of hard-coded dynamic types, such as I32X4XN, onto which the
user-defined types map. The dynamic types are also used explicitly
to create dynamic stack slots which, unlike their existing
counterparts, have no compile-time fixed size. New IR instructions
are added to access these new stack entities.

Currently, during codegen, the dynamic scaling factor has to be
lowered to a constant, so the dynamic slots, like spill slots, do
eventually have a compile-time-known size.

The current lowering for aarch64 just targets Neon, using a dynamic
scale of 1.

Copyright (c) 2022, Arm Limited.
Author: Sam Parker
Date: 2022-07-07 20:54:39 +01:00
Committed by: GitHub
Parent: 9ae060a12a
Commit: 9c43749dfe
69 changed files with 2422 additions and 294 deletions

View File

@@ -5,7 +5,7 @@ use crate::ir::types;
use crate::ir::types::*;
use crate::ir::MemFlags;
use crate::ir::Opcode;
use crate::ir::{ExternalName, LibCall};
use crate::ir::{ExternalName, LibCall, Signature};
use crate::isa;
use crate::isa::aarch64::{inst::EmitState, inst::*};
use crate::isa::unwind::UnwindInst;
@@ -155,6 +155,7 @@ fn saved_reg_stack_size(
} else {
vec_reg.len() & 1
};
// FIXME: SVE: ABI is different to Neon, so do we treat all vec regs as Z-regs?
let vec_save_bytes = (vec_reg.len() + vec_save_padding) * vec_reg_size;
(int_save_bytes, vec_save_bytes)
@@ -365,9 +366,15 @@ impl ABIMachineSpec for AArch64MachineDeps {
RegClass::Int => xreg(*next_reg),
RegClass::Float => vreg(*next_reg),
};
// Overlay Z-regs on V-regs for parameter passing.
let ty = if param.value_type.is_dynamic_vector() {
dynamic_to_fixed(param.value_type)
} else {
param.value_type
};
ret.push(ABIArg::reg(
reg.to_real_reg().unwrap(),
param.value_type,
ty,
param.extension,
param.purpose,
));
@@ -558,6 +565,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
}
fn gen_get_stack_addr(mem: StackAMode, into_reg: Writable<Reg>, _ty: Type) -> Inst {
// FIXME: Do something different for dynamic types?
let mem = mem.into();
Inst::LoadAddr { rd: into_reg, mem }
}
@@ -931,6 +939,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
fn gen_clobber_restore(
call_conv: isa::CallConv,
sig: &Signature,
flags: &settings::Flags,
clobbers: &[Writable<RealReg>],
fixed_frame_storage_size: u32,
@@ -938,7 +947,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
) -> SmallVec<[Inst; 16]> {
let mut insts = SmallVec::new();
let (clobbered_int, clobbered_vec) =
get_regs_restored_in_epilogue(call_conv, flags, clobbers);
get_regs_restored_in_epilogue(call_conv, flags, sig, clobbers);
// Free the fixed frame if necessary.
if fixed_frame_storage_size > 0 {
@@ -1146,11 +1155,12 @@ impl ABIMachineSpec for AArch64MachineDeps {
insts
}
fn get_number_of_spillslots_for_value(rc: RegClass) -> u32 {
fn get_number_of_spillslots_for_value(rc: RegClass, vector_size: u32) -> u32 {
assert_eq!(vector_size % 8, 0);
// We allocate in terms of 8-byte slots.
match rc {
RegClass::Int => 1,
RegClass::Float => 2,
RegClass::Float => vector_size / 8,
}
}
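A quick sanity check of the arithmetic above (a standalone sketch, not the backend code): spill slots are allocated in 8-byte units, so Neon's 16-byte vectors still yield two slots for the Float class, matching the old hard-coded value, while a hypothetical 256-bit target would yield four:

```rust
/// Standalone sketch of the spill-slot computation above; not backend code.
fn spillslots_for_float_class(vector_size_bytes: u32) -> u32 {
    assert_eq!(vector_size_bytes % 8, 0);
    vector_size_bytes / 8 // slots are allocated in 8-byte units
}

fn main() {
    assert_eq!(spillslots_for_float_class(16), 2); // Neon: same as the old constant
    assert_eq!(spillslots_for_float_class(32), 4); // hypothetical 256-bit vectors
}
```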
@@ -1195,12 +1205,15 @@ impl ABIMachineSpec for AArch64MachineDeps {
fn get_clobbered_callee_saves(
call_conv: isa::CallConv,
flags: &settings::Flags,
sig: &Signature,
regs: &[Writable<RealReg>],
) -> Vec<Writable<RealReg>> {
let mut regs: Vec<Writable<RealReg>> = regs
.iter()
.cloned()
.filter(|r| is_reg_saved_in_prologue(call_conv, flags.enable_pinned_reg(), r.to_reg()))
.filter(|r| {
is_reg_saved_in_prologue(call_conv, flags.enable_pinned_reg(), sig, r.to_reg())
})
.collect();
// Sort registers for deterministic code output. We can do an unstable
@@ -1235,7 +1248,12 @@ fn legal_type_for_machine(ty: Type) -> bool {
/// Is the given register saved in the prologue if clobbered, i.e., is it a
/// callee-save?
fn is_reg_saved_in_prologue(call_conv: isa::CallConv, enable_pinned_reg: bool, r: RealReg) -> bool {
fn is_reg_saved_in_prologue(
call_conv: isa::CallConv,
enable_pinned_reg: bool,
sig: &Signature,
r: RealReg,
) -> bool {
if call_conv.extends_baldrdash() {
match r.class() {
RegClass::Int => {
@@ -1249,6 +1267,14 @@ fn is_reg_saved_in_prologue(call_conv: isa::CallConv, enable_pinned_reg: bool, r
};
}
// FIXME: We need to inspect whether a function is returning Z or P regs too.
let save_z_regs = sig
.params
.iter()
.filter(|p| p.value_type.is_dynamic_vector())
.count()
!= 0;
match r.class() {
RegClass::Int => {
// x19 - x28 inclusive are callee-saves.
@@ -1262,8 +1288,17 @@ fn is_reg_saved_in_prologue(call_conv: isa::CallConv, enable_pinned_reg: bool, r
}
}
RegClass::Float => {
// v8 - v15 inclusive are callee-saves.
r.hw_enc() >= 8 && r.hw_enc() <= 15
// If a subroutine takes at least one argument in scalable vector registers
// or scalable predicate registers, or if it is a function that returns
// results in such registers, it must ensure that the entire contents of
// z8-z23 are preserved across the call. In other cases it need only
// preserve the low 64 bits of z8-z15.
if save_z_regs {
r.hw_enc() >= 8 && r.hw_enc() <= 23
} else {
// v8 - v15 inclusive are callee-saves.
r.hw_enc() >= 8 && r.hw_enc() <= 15
}
}
}
}
@@ -1274,12 +1309,13 @@ fn is_reg_saved_in_prologue(call_conv: isa::CallConv, enable_pinned_reg: bool, r
fn get_regs_restored_in_epilogue(
call_conv: isa::CallConv,
flags: &settings::Flags,
sig: &Signature,
regs: &[Writable<RealReg>],
) -> (Vec<Writable<RealReg>>, Vec<Writable<RealReg>>) {
let mut int_saves = vec![];
let mut vec_saves = vec![];
for &reg in regs {
if is_reg_saved_in_prologue(call_conv, flags.enable_pinned_reg(), reg.to_reg()) {
if is_reg_saved_in_prologue(call_conv, flags.enable_pinned_reg(), sig, reg.to_reg()) {
match reg.to_reg().class() {
RegClass::Int => int_saves.push(reg),
RegClass::Float => vec_saves.push(reg),
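The practical effect of the `save_z_regs` check above, per the AAPCS64 rule quoted in the diff: when a function takes (or, per the FIXME, should also be when it returns) dynamic vectors, v8-v23 are treated as callee-saved instead of just v8-v15. A minimal sketch of the register-number test:

```rust
/// Sketch of the vector callee-save test above, register numbers only.
fn vec_reg_is_callee_saved(hw_enc: u8, save_z_regs: bool) -> bool {
    if save_z_regs {
        (8..=23).contains(&hw_enc) // z8-z23 must be preserved in full
    } else {
        (8..=15).contains(&hw_enc) // only v8-v15 are callee-saves
    }
}

fn main() {
    assert!(!vec_reg_is_callee_saved(20, false)); // v20 is normally caller-saved
    assert!(vec_reg_is_callee_saved(20, true)); // saved when Z-regs cross the call
}
```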

View File

@@ -919,6 +919,17 @@
(Size64x2)
))
(type DynamicVectorSize extern
(enum
(Size8x8xN)
(Size8x16xN)
(Size16x4xN)
(Size16x8xN)
(Size32x2xN)
(Size32x4xN)
(Size64x2xN)
))
;; Helper for calculating the `VectorSize` corresponding to a type
(decl vector_size (Type) VectorSize)
(rule (vector_size (multi_lane 8 8)) (VectorSize.Size8x8))
@@ -928,6 +939,13 @@
(rule (vector_size (multi_lane 32 2)) (VectorSize.Size32x2))
(rule (vector_size (multi_lane 32 4)) (VectorSize.Size32x4))
(rule (vector_size (multi_lane 64 2)) (VectorSize.Size64x2))
(rule (vector_size (dynamic_lane 8 8)) (VectorSize.Size8x8))
(rule (vector_size (dynamic_lane 8 16)) (VectorSize.Size8x16))
(rule (vector_size (dynamic_lane 16 4)) (VectorSize.Size16x4))
(rule (vector_size (dynamic_lane 16 8)) (VectorSize.Size16x8))
(rule (vector_size (dynamic_lane 32 2)) (VectorSize.Size32x2))
(rule (vector_size (dynamic_lane 32 4)) (VectorSize.Size32x4))
(rule (vector_size (dynamic_lane 64 2)) (VectorSize.Size64x2))
;; A floating-point unit (FPU) operation with one arg.
(type FPUOp1

View File

@@ -706,12 +706,9 @@ impl VectorSize {
/// Get the scalar operand size that corresponds to a lane of a vector with a certain size.
pub fn lane_size(&self) -> ScalarSize {
match self {
VectorSize::Size8x8 => ScalarSize::Size8,
VectorSize::Size8x16 => ScalarSize::Size8,
VectorSize::Size16x4 => ScalarSize::Size16,
VectorSize::Size16x8 => ScalarSize::Size16,
VectorSize::Size32x2 => ScalarSize::Size32,
VectorSize::Size32x4 => ScalarSize::Size32,
VectorSize::Size8x8 | VectorSize::Size8x16 => ScalarSize::Size8,
VectorSize::Size16x4 | VectorSize::Size16x8 => ScalarSize::Size16,
VectorSize::Size32x2 | VectorSize::Size32x4 => ScalarSize::Size32,
VectorSize::Size64x2 => ScalarSize::Size64,
}
}
@@ -743,3 +740,18 @@ impl VectorSize {
(q, size)
}
}
pub(crate) fn dynamic_to_fixed(ty: Type) -> Type {
match ty {
I8X8XN => I8X8,
I8X16XN => I8X16,
I16X4XN => I16X4,
I16X8XN => I16X8,
I32X2XN => I32X2,
I32X4XN => I32X4,
I64X2XN => I64X2,
F32X4XN => F32X4,
F64X2XN => F64X2,
_ => unreachable!("unhandled type: {}", ty),
}
}
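This helper underpins a pattern that recurs throughout the lowering changes below: before instruction selection picks a register shape, a dynamic type is flattened to its fixed Neon equivalent. A self-contained sketch with stand-in types:

```rust
/// Stand-in types for illustration; the real code uses cranelift's
/// `Type` with the `dynamic_to_fixed` helper defined above.
#[derive(Clone, Copy, Debug, PartialEq)]
enum Ty {
    I32X4,
    I32X4XN,
}

fn is_dynamic_vector(ty: Ty) -> bool {
    matches!(ty, Ty::I32X4XN)
}

fn dynamic_to_fixed(ty: Ty) -> Ty {
    match ty {
        Ty::I32X4XN => Ty::I32X4,
        other => unreachable!("unhandled type: {:?}", other),
    }
}

fn main() {
    // Flatten a dynamic type to its fixed Neon equivalent before picking
    // a register shape, as the lowering changes below do repeatedly.
    let mut ty = Ty::I32X4XN;
    if is_dynamic_vector(ty) {
        ty = dynamic_to_fixed(ty);
    }
    assert_eq!(ty, Ty::I32X4);
}
```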

View File

@@ -89,12 +89,12 @@ pub fn mem_finalize(
//=============================================================================
// Instructions and subcomponents: emission
fn machreg_to_gpr(m: Reg) -> u32 {
pub(crate) fn machreg_to_gpr(m: Reg) -> u32 {
assert_eq!(m.class(), RegClass::Int);
u32::try_from(m.to_real_reg().unwrap().hw_enc() & 31).unwrap()
}
fn machreg_to_vec(m: Reg) -> u32 {
pub(crate) fn machreg_to_vec(m: Reg) -> u32 {
assert_eq!(m.class(), RegClass::Float);
u32::try_from(m.to_real_reg().unwrap().hw_enc()).unwrap()
}
@@ -2259,7 +2259,7 @@ impl MachInstEmit for Inst {
VectorSize::Size16x8 => 0b00010,
VectorSize::Size32x4 => 0b00100,
VectorSize::Size64x2 => 0b01000,
_ => unimplemented!(),
_ => unimplemented!("Unexpected VectorSize: {:?}", size),
};
sink.put4(
0b010_01110000_00000_000011_00000_00000

View File

@@ -1194,6 +1194,7 @@ impl MachInst for Inst {
assert!(ty.bits() <= 128);
Ok((&[RegClass::Float], &[I8X16]))
}
_ if ty.is_dynamic_vector() => Ok((&[RegClass::Float], &[I8X16])),
IFLAGS | FFLAGS => Ok((&[RegClass::Int], &[I64])),
_ => Err(CodegenError::Unsupported(format!(
"Unexpected SSA-value type: {}",

View File

@@ -332,12 +332,9 @@ pub fn show_vreg_element(reg: Reg, idx: u8, size: VectorSize) -> String {
assert_eq!(RegClass::Float, reg.class());
let s = show_reg(reg);
let suffix = match size {
VectorSize::Size8x8 => ".b",
VectorSize::Size8x16 => ".b",
VectorSize::Size16x4 => ".h",
VectorSize::Size16x8 => ".h",
VectorSize::Size32x2 => ".s",
VectorSize::Size32x4 => ".s",
VectorSize::Size8x8 | VectorSize::Size8x16 => ".b",
VectorSize::Size16x4 | VectorSize::Size16x8 => ".h",
VectorSize::Size32x2 | VectorSize::Size32x4 => ".s",
VectorSize::Size64x2 => ".d",
};
format!("{}{}[{}]", s, suffix, idx)

View File

@@ -117,7 +117,7 @@ mod tests {
pos.ins().return_(&[]);
if let Some(stack_slot) = stack_slot {
func.stack_slots.push(stack_slot);
func.sized_stack_slots.push(stack_slot);
}
func

View File

@@ -0,0 +1,8 @@
;; Move helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(decl fpu_move_128 (Reg) Reg)
(rule (fpu_move_128 src)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.FpuMove128 dst src))))
(writable_reg_to_reg dst)))

View File

@@ -81,6 +81,9 @@
(rule (lower (has_type ty @ (multi_lane _ _) (iadd x y)))
(add_vec x y (vector_size ty)))
(rule (lower (has_type ty @ (dynamic_lane _ _) (iadd x y)))
(value_reg (vec_rrr (VecALUOp.Add) (put_in_reg x) (put_in_reg y) (vector_size ty))))
;; `i128`
(rule (lower (has_type $I128 (iadd x y)))
(let
@@ -157,6 +160,8 @@
;; vectors
(rule (lower (has_type ty @ (multi_lane _ _) (isub x y)))
(sub_vec x y (vector_size ty)))
(rule (lower (has_type ty @ (dynamic_lane _ _) (isub x y)))
(value_reg (sub_vec (put_in_reg x) (put_in_reg y) (vector_size ty))))
;; `i128`
(rule (lower (has_type $I128 (isub x y)))
@@ -244,6 +249,10 @@
(rule (lower (has_type (ty_vec128 ty @ (not_i64x2)) (imul x y)))
(mul x y (vector_size ty)))
;; Case for 'dynamic' i8x16, i16x8, and i32x4.
(rule (lower (has_type ty @ (dynamic_lane _ _) (imul x y)))
(value_reg (vec_rrr (VecALUOp.Mul) (put_in_reg x) (put_in_reg y) (vector_size ty))))
;; Special lowering for i64x2.
;;
;; This I64X2 multiplication is performed with several 32-bit

View File

@@ -0,0 +1,30 @@
;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty @ (dynamic_lane _ _) (iadd x y)))
(value_reg (vec_rrr (VecALUOp.Add) (put_in_reg x) (put_in_reg y) (vector_size ty))))
;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty @ (dynamic_lane _ _) (isub x y)))
(value_reg (vec_rrr (VecALUOp.Sub) (put_in_reg x) (put_in_reg y) (vector_size ty))))
;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type (lane_fits_in_32 ty @ (dynamic_lane _ _)) (imul x y)))
(value_reg (vec_rrr (VecALUOp.Mul) (put_in_reg x) (put_in_reg y) (vector_size ty))))
;;;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty @ (dynamic_lane _ _) (fadd x y)))
(value_reg (vec_rrr (VecALUOp.Fadd) (put_in_reg x) (put_in_reg y) (vector_size ty))))
;;;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty @ (dynamic_lane _ _) (fsub x y)))
(value_reg (vec_rrr (VecALUOp.Fsub) (put_in_reg x) (put_in_reg y) (vector_size ty))))
;;; Rules for `dynamic_stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (dynamic_stack_addr stack_slot))
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (abi_dynamic_stackslot_addr dst stack_slot))))
(value_reg dst)))
;;; Rules for `extract_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (extract_vector x 0))
(value_reg (fpu_move_128 (put_in_reg x))))

View File

@@ -124,7 +124,10 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
insn,
&inputs[..],
outputs[0],
|ctx, dst, elem_ty, mem| {
|ctx, dst, mut elem_ty, mem| {
if elem_ty.is_dynamic_vector() {
elem_ty = dynamic_to_fixed(elem_ty);
}
let rd = dst.only_reg().unwrap();
let is_float = ty_has_float_or_vec_representation(elem_ty);
ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
@@ -177,7 +180,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::Store | Opcode::Istore8 | Opcode::Istore16 | Opcode::Istore32 => {
let off = ctx.data(insn).load_store_offset().unwrap();
let elem_ty = match op {
let mut elem_ty = match op {
Opcode::Istore8 => I8,
Opcode::Istore16 => I16,
Opcode::Istore32 => I32,
@@ -200,6 +203,9 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
flags,
});
} else {
if elem_ty.is_dynamic_vector() {
elem_ty = dynamic_to_fixed(elem_ty);
}
let rd = dst.only_reg().unwrap();
let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
ctx.emit(match (ty_bits(elem_ty), is_float) {
@@ -231,12 +237,15 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
};
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let offset: i32 = offset.into();
let inst = ctx
.abi()
.stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), rd);
assert!(ctx.abi().sized_stackslot_offsets().is_valid(stack_slot));
let inst =
ctx.abi()
.sized_stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), rd);
ctx.emit(inst);
}
Opcode::DynamicStackAddr => implemented_in_isle(ctx),
Opcode::AtomicRmw => implemented_in_isle(ctx),
Opcode::AtomicCas => implemented_in_isle(ctx),
@@ -249,7 +258,10 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::Fence {});
}
Opcode::StackLoad | Opcode::StackStore => {
Opcode::StackLoad
| Opcode::StackStore
| Opcode::DynamicStackStore
| Opcode::DynamicStackLoad => {
panic!("Direct stack memory access not supported; should not be used by Wasm");
}
@@ -684,7 +696,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let idx = *imm;
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let size = VectorSize::from_ty(ctx.input_ty(insn, 0));
let input_ty = ctx.input_ty(insn, 0);
let size = VectorSize::from_ty(input_ty);
let ty = ty.unwrap();
if ty_has_int_representation(ty) {
@@ -730,7 +743,14 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::Splat => {
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let size = VectorSize::from_ty(ty.unwrap());
let ty = ty.unwrap();
// TODO: Handle SVE Dup.
let ty = if ty.is_dynamic_vector() {
dynamic_to_fixed(ty)
} else {
ty
};
let size = VectorSize::from_ty(ty);
if let Some((_, insn)) = maybe_input_insn_multi(
ctx,
@@ -1284,7 +1304,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if !ty.is_vector() {
if !ty.is_vector() && !ty.is_dynamic_vector() {
let fpu_op = match op {
Opcode::Fadd => FPUOp2::Add,
Opcode::Fsub => FPUOp2::Sub,
@@ -1336,7 +1356,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
debug_assert!(lane_type == F32 || lane_type == F64);
if ty.is_vector() {
if ty.is_vector() || ty.is_dynamic_vector() {
let size = VectorSize::from_ty(ty);
// pmin(a,b) => bitsel(b, a, cmpgt(a, b))
@@ -2015,7 +2035,15 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
.map_or(true, |insn| {
const_param_to_u128(ctx, insn).expect("Invalid immediate bytes") != 0
});
let op = match (op, ty.unwrap()) {
let ty = ty.unwrap();
let ty = if ty.is_dynamic_vector() {
ty.dynamic_to_vector()
.unwrap_or_else(|| panic!("Unsupported dynamic type: {}?", ty))
} else {
ty
};
let op = match (op, ty) {
(Opcode::Snarrow, I8X16) => VecRRNarrowOp::Sqxtn16,
(Opcode::Snarrow, I16X8) => VecRRNarrowOp::Sqxtn32,
(Opcode::Snarrow, I32X4) => VecRRNarrowOp::Sqxtn64,
@@ -2057,7 +2085,14 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::SwidenLow | Opcode::SwidenHigh | Opcode::UwidenLow | Opcode::UwidenHigh => {
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let (t, high_half) = match (ty.unwrap(), op) {
let ty = ty.unwrap();
let ty = if ty.is_dynamic_vector() {
ty.dynamic_to_vector()
.unwrap_or_else(|| panic!("Unsupported dynamic type: {}?", ty))
} else {
ty
};
let (t, high_half) = match (ty, op) {
(I16X8, Opcode::SwidenLow) => (VecExtendOp::Sxtl8, false),
(I16X8, Opcode::SwidenHigh) => (VecExtendOp::Sxtl8, true),
(I16X8, Opcode::UwidenLow) => (VecExtendOp::Uxtl8, false),
@@ -2182,6 +2217,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
});
}
Opcode::ExtractVector => implemented_in_isle(ctx),
Opcode::ConstAddr | Opcode::Vconcat | Opcode::Vsplit | Opcode::IfcmpSp => {
return Err(CodegenError::Unsupported(format!(
"Unimplemented lowering: {}",

View File

@@ -1,7 +1,7 @@
//! ARM 64-bit Instruction Set Architecture.
use crate::ir::condcodes::IntCC;
use crate::ir::Function;
use crate::ir::{Function, Type};
use crate::isa::aarch64::settings as aarch64_settings;
use crate::isa::{Builder as IsaBuilder, TargetIsa};
use crate::machinst::{
@@ -57,7 +57,7 @@ impl AArch64Backend {
flags: shared_settings::Flags,
) -> CodegenResult<(VCode<inst::Inst>, regalloc2::Output)> {
let emit_info = EmitInfo::new(flags.clone());
let abi = Box::new(abi::AArch64ABICallee::new(func, flags, self.isa_flags())?);
let abi = Box::new(abi::AArch64ABICallee::new(func, self)?);
compile::compile::<AArch64Backend>(func, self, abi, &self.machine_env, emit_info)
}
}
@@ -76,7 +76,8 @@ impl TargetIsa for AArch64Backend {
let frame_size = emit_result.frame_size;
let value_labels_ranges = emit_result.value_labels_ranges;
let buffer = emit_result.buffer.finish();
let stackslot_offsets = emit_result.stackslot_offsets;
let sized_stackslot_offsets = emit_result.sized_stackslot_offsets;
let dynamic_stackslot_offsets = emit_result.dynamic_stackslot_offsets;
if let Some(disasm) = emit_result.disasm.as_ref() {
log::debug!("disassembly:\n{}", disasm);
@@ -87,7 +88,8 @@ impl TargetIsa for AArch64Backend {
frame_size,
disasm: emit_result.disasm,
value_labels_ranges,
stackslot_offsets,
sized_stackslot_offsets,
dynamic_stackslot_offsets,
bb_starts: emit_result.bb_offsets,
bb_edges: emit_result.bb_edges,
})
@@ -109,6 +111,10 @@ impl TargetIsa for AArch64Backend {
self.isa_flags.iter().collect()
}
fn dynamic_vector_bytes(&self, _dyn_ty: Type) -> u32 {
16
}
fn unsigned_add_overflow_condition(&self) -> IntCC {
// Unsigned `>=`; this corresponds to the carry flag set on aarch64, which happens on
// overflow of an add.

View File

@@ -196,7 +196,7 @@ pub struct TargetFrontendConfig {
impl TargetFrontendConfig {
/// Get the pointer type of this target.
pub fn pointer_type(self) -> ir::Type {
ir::Type::int(u16::from(self.pointer_bits())).unwrap()
ir::Type::int(self.pointer_bits() as u16).unwrap()
}
/// Get the width of pointers on this target, in units of bits.
@@ -226,6 +226,9 @@ pub trait TargetIsa: fmt::Display + Send + Sync {
/// Get the ISA-dependent flag values that were used to make this trait object.
fn isa_flags(&self) -> Vec<settings::Value>;
/// Get the ISA-dependent maximum vector register size, in bytes.
fn dynamic_vector_bytes(&self, dynamic_ty: ir::Type) -> u32;
/// Compile the given function.
fn compile_function(
&self,
@@ -311,7 +314,7 @@ impl<'a> dyn TargetIsa + 'a {
/// Get the pointer type of this ISA.
pub fn pointer_type(&self) -> ir::Type {
ir::Type::int(u16::from(self.pointer_bits())).unwrap()
ir::Type::int(self.pointer_bits() as u16).unwrap()
}
/// Get the width of pointers on this ISA.

View File

@@ -61,6 +61,7 @@ use crate::ir;
use crate::ir::condcodes::IntCC;
use crate::ir::types;
use crate::ir::MemFlags;
use crate::ir::Signature;
use crate::ir::Type;
use crate::isa;
use crate::isa::s390x::inst::*;
@@ -556,6 +557,7 @@ impl ABIMachineSpec for S390xMachineDeps {
fn gen_clobber_restore(
call_conv: isa::CallConv,
_: &Signature,
_: &settings::Flags,
clobbers: &[Writable<RealReg>],
fixed_frame_storage_size: u32,
@@ -633,7 +635,7 @@ impl ABIMachineSpec for S390xMachineDeps {
unimplemented!("StructArgs not implemented for S390X yet");
}
fn get_number_of_spillslots_for_value(rc: RegClass) -> u32 {
fn get_number_of_spillslots_for_value(rc: RegClass, _vector_scale: u32) -> u32 {
// We allocate in terms of 8-byte slots.
match rc {
RegClass::Int => 1,
@@ -665,6 +667,7 @@ impl ABIMachineSpec for S390xMachineDeps {
fn get_clobbered_callee_saves(
call_conv: isa::CallConv,
flags: &settings::Flags,
_sig: &Signature,
regs: &[Writable<RealReg>],
) -> Vec<Writable<RealReg>> {
assert!(
@@ -688,7 +691,7 @@ impl ABIMachineSpec for S390xMachineDeps {
_is_leaf: bool,
_stack_args_size: u32,
_num_clobbered_callee_saves: usize,
_fixed_frame_storage_size: u32,
_frame_storage_size: u32,
) -> bool {
// The call frame set-up is handled by gen_clobber_save().
false

View File

@@ -1158,9 +1158,6 @@
;; Helpers for stack-slot addresses ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(decl abi_stackslot_addr (WritableReg StackSlot Offset32) MInst)
(extern constructor abi_stackslot_addr abi_stackslot_addr)
(decl stack_addr_impl (Type StackSlot Offset32) Reg)
(rule (stack_addr_impl ty stack_slot offset)
(let ((dst WritableReg (temp_writable_reg ty))

View File

@@ -148,7 +148,7 @@ mod tests {
pos.ins().return_(&[]);
if let Some(stack_slot) = stack_slot {
func.stack_slots.push(stack_slot);
func.sized_stack_slots.push(stack_slot);
}
func
@@ -206,7 +206,7 @@ mod tests {
pos.ins().return_(&[]);
if let Some(stack_slot) = stack_slot {
func.stack_slots.push(stack_slot);
func.sized_stack_slots.push(stack_slot);
}
func

View File

@@ -2301,7 +2301,7 @@
(decl lower_call_ret_arg (ABISig) InstOutput)
(rule (lower_call_ret_arg (abi_no_ret_arg)) (output_none))
(rule (lower_call_ret_arg abi @ (abi_ret_arg (abi_arg_only_slot slot)))
(let ((ret_arg Reg (load_addr (memarg_stack_off (abi_stack_arg_space abi) 0)))
(let ((ret_arg Reg (load_addr (memarg_stack_off (abi_sized_stack_arg_space abi) 0)))
(_ Unit (copy_reg_to_arg_slot 0 slot ret_arg)))
(output_none)))
@@ -2309,7 +2309,7 @@
(decl lower_call_rets (ABISig Range InstOutputBuilder) InstOutput)
(rule (lower_call_rets abi (range_empty) builder) (output_builder_finish builder))
(rule (lower_call_rets abi (range_unwrap head tail) builder)
(let ((ret ValueRegs (copy_from_arg (abi_stack_arg_space abi) (abi_get_ret abi head)))
(let ((ret ValueRegs (copy_from_arg (abi_sized_stack_arg_space abi) (abi_get_ret abi head)))
(_ Unit (output_builder_push builder ret)))
(lower_call_rets abi tail builder)))

View File

@@ -197,7 +197,11 @@ impl LowerBackend for S390xBackend {
| Opcode::SqmulRoundSat
| Opcode::FvpromoteLow
| Opcode::Fvdemote
| Opcode::IaddPairwise => {
| Opcode::IaddPairwise
| Opcode::DynamicStackLoad
| Opcode::DynamicStackStore
| Opcode::DynamicStackAddr
| Opcode::ExtractVector => {
unreachable!(
"TODO: not yet implemented in ISLE: inst = `{}`, type = `{:?}`",
ctx.dfg().display_inst(ir_inst),

View File

@@ -16,7 +16,7 @@ use crate::settings::Flags;
use crate::{
ir::{
condcodes::*, immediates::*, types::*, AtomicRmwOp, Endianness, Inst, InstructionData,
MemFlags, Opcode, StackSlot, TrapCode, Value, ValueList,
MemFlags, Opcode, TrapCode, Value, ValueList,
},
isa::unwind::UnwindInst,
machinst::{InsnOutput, LowerCtx, VCodeConstant, VCodeConstantData},
@@ -77,7 +77,7 @@ where
}
fn abi_accumulate_outgoing_args_size(&mut self, abi: &ABISig) -> Unit {
let off = abi.stack_arg_space() + abi.stack_ret_space();
let off = abi.sized_stack_arg_space() + abi.sized_stack_ret_space();
self.lower_ctx
.abi()
.accumulate_outgoing_args_size(off as u32);
@@ -531,17 +531,6 @@ where
}
}
#[inline]
fn abi_stackslot_addr(
&mut self,
dst: WritableReg,
stack_slot: StackSlot,
offset: Offset32,
) -> MInst {
let offset = u32::try_from(i32::from(offset)).unwrap();
self.lower_ctx.abi().stackslot_addr(stack_slot, offset, dst)
}
#[inline]
fn inst_builder_new(&mut self) -> VecMInstBuilder {
Cell::new(Vec::<MInst>::new())

View File

@@ -1,7 +1,7 @@
//! IBM Z 64-bit Instruction Set Architecture.
use crate::ir::condcodes::IntCC;
use crate::ir::Function;
use crate::ir::{Function, Type};
use crate::isa::s390x::settings as s390x_settings;
#[cfg(feature = "unwind")]
use crate::isa::unwind::systemv::RegisterMappingError;
@@ -58,7 +58,7 @@ impl S390xBackend {
flags: shared_settings::Flags,
) -> CodegenResult<(VCode<inst::Inst>, regalloc2::Output)> {
let emit_info = EmitInfo::new(flags.clone(), self.isa_flags.clone());
let abi = Box::new(abi::S390xABICallee::new(func, flags, self.isa_flags())?);
let abi = Box::new(abi::S390xABICallee::new(func, self)?);
compile::compile::<S390xBackend>(func, self, abi, &self.machine_env, emit_info)
}
}
@@ -77,7 +77,8 @@ impl TargetIsa for S390xBackend {
let frame_size = emit_result.frame_size;
let value_labels_ranges = emit_result.value_labels_ranges;
let buffer = emit_result.buffer.finish();
let stackslot_offsets = emit_result.stackslot_offsets;
let sized_stackslot_offsets = emit_result.sized_stackslot_offsets;
let dynamic_stackslot_offsets = emit_result.dynamic_stackslot_offsets;
if let Some(disasm) = emit_result.disasm.as_ref() {
log::debug!("disassembly:\n{}", disasm);
@@ -88,7 +89,8 @@ impl TargetIsa for S390xBackend {
frame_size,
disasm: emit_result.disasm,
value_labels_ranges,
stackslot_offsets,
sized_stackslot_offsets,
dynamic_stackslot_offsets,
bb_starts: emit_result.bb_offsets,
bb_edges: emit_result.bb_edges,
})
@@ -110,6 +112,10 @@ impl TargetIsa for S390xBackend {
self.isa_flags.iter().collect()
}
fn dynamic_vector_bytes(&self, _dyn_ty: Type) -> u32 {
16
}
fn unsigned_add_overflow_condition(&self) -> IntCC {
// The ADD LOGICAL family of instructions set the condition code
// differently from normal comparisons, in a way that cannot be

View File

@@ -1,7 +1,7 @@
//! Implementation of the standard x64 ABI.
use crate::ir::types::*;
use crate::ir::{self, types, ExternalName, LibCall, MemFlags, Opcode, TrapCode, Type};
use crate::ir::{self, types, ExternalName, LibCall, MemFlags, Opcode, Signature, TrapCode, Type};
use crate::isa;
use crate::isa::{unwind::UnwindInst, x64::inst::*, CallConv};
use crate::machinst::abi_impl::*;
@@ -573,6 +573,7 @@ impl ABIMachineSpec for X64ABIMachineSpec {
fn gen_clobber_restore(
call_conv: isa::CallConv,
sig: &Signature,
flags: &settings::Flags,
clobbers: &[Writable<RealReg>],
fixed_frame_storage_size: u32,
@@ -580,7 +581,8 @@ impl ABIMachineSpec for X64ABIMachineSpec {
) -> SmallVec<[Self::I; 16]> {
let mut insts = SmallVec::new();
let clobbered_callee_saves = Self::get_clobbered_callee_saves(call_conv, flags, clobbers);
let clobbered_callee_saves =
Self::get_clobbered_callee_saves(call_conv, flags, sig, clobbers);
let stack_size = fixed_frame_storage_size + compute_clobber_size(&clobbered_callee_saves);
// Restore regs by loading from offsets of RSP. RSP will be
@@ -722,11 +724,11 @@ impl ABIMachineSpec for X64ABIMachineSpec {
insts
}
fn get_number_of_spillslots_for_value(rc: RegClass) -> u32 {
fn get_number_of_spillslots_for_value(rc: RegClass, vector_scale: u32) -> u32 {
// We allocate in terms of 8-byte slots.
match rc {
RegClass::Int => 1,
RegClass::Float => 2,
RegClass::Float => vector_scale / 8,
}
}
@@ -771,6 +773,7 @@ impl ABIMachineSpec for X64ABIMachineSpec {
fn get_clobbered_callee_saves(
call_conv: CallConv,
flags: &settings::Flags,
_sig: &Signature,
regs: &[Writable<RealReg>],
) -> Vec<Writable<RealReg>> {
let mut regs: Vec<Writable<RealReg>> = match call_conv {
@@ -805,7 +808,7 @@ impl ABIMachineSpec for X64ABIMachineSpec {
_is_leaf: bool,
_stack_args_size: u32,
_num_clobbered_callee_saves: usize,
_fixed_frame_storage_size: u32,
_frame_storage_size: u32,
) -> bool {
true
}

View File

@@ -144,7 +144,7 @@ mod tests {
pos.ins().return_(&[]);
if let Some(stack_slot) = stack_slot {
func.stack_slots.push(stack_slot);
func.sized_stack_slots.push(stack_slot);
}
func

View File

@@ -2169,6 +2169,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
});
}
Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),
Opcode::StackAddr => {
let (stack_slot, offset) = match *ctx.data(insn) {
InstructionData::StackLoad {
@@ -2180,9 +2182,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
};
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let offset: i32 = offset.into();
let inst = ctx
.abi()
.stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), dst);
let inst =
ctx.abi()
.sized_stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), dst);
ctx.emit(inst);
}
@@ -2908,7 +2910,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// Unimplemented opcodes below. These are not currently used by Wasm
// lowering or other known embeddings, but should be either supported or
// removed eventually.
// removed eventually
Opcode::ExtractVector => {
unimplemented!("ExtractVector not supported");
}
Opcode::Cls => unimplemented!("Cls not supported"),
Opcode::Fma => unimplemented!("Fma not supported"),
@@ -2965,7 +2971,10 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
panic!("ALU+imm and ALU+carry ops should not appear here!");
}
Opcode::StackLoad | Opcode::StackStore => {
Opcode::StackLoad
| Opcode::StackStore
| Opcode::DynamicStackStore
| Opcode::DynamicStackLoad => {
panic!("Direct stack memory access not supported; should have been legalized");
}

View File

@@ -3,7 +3,7 @@
use self::inst::EmitInfo;
use super::TargetIsa;
use crate::ir::{condcodes::IntCC, Function};
use crate::ir::{condcodes::IntCC, Function, Type};
#[cfg(feature = "unwind")]
use crate::isa::unwind::systemv;
use crate::isa::x64::{inst::regs::create_reg_env_systemv, settings as x64_settings};
@@ -53,7 +53,7 @@ impl X64Backend {
// This performs lowering to VCode, register-allocates the code, computes
// block layout and finalizes branches. The result is ready for binary emission.
let emit_info = EmitInfo::new(flags.clone(), self.x64_flags.clone());
let abi = Box::new(abi::X64ABICallee::new(&func, flags, self.isa_flags())?);
let abi = Box::new(abi::X64ABICallee::new(&func, self)?);
compile::compile::<Self>(&func, self, abi, &self.reg_env, emit_info)
}
}
@@ -72,7 +72,8 @@ impl TargetIsa for X64Backend {
let frame_size = emit_result.frame_size;
let value_labels_ranges = emit_result.value_labels_ranges;
let buffer = emit_result.buffer.finish();
let stackslot_offsets = emit_result.stackslot_offsets;
let sized_stackslot_offsets = emit_result.sized_stackslot_offsets;
let dynamic_stackslot_offsets = emit_result.dynamic_stackslot_offsets;
if let Some(disasm) = emit_result.disasm.as_ref() {
log::debug!("disassembly:\n{}", disasm);
@@ -83,7 +84,8 @@ impl TargetIsa for X64Backend {
frame_size,
disasm: emit_result.disasm,
value_labels_ranges,
stackslot_offsets,
sized_stackslot_offsets,
dynamic_stackslot_offsets,
bb_starts: emit_result.bb_offsets,
bb_edges: emit_result.bb_edges,
})
@@ -97,6 +99,10 @@ impl TargetIsa for X64Backend {
self.x64_flags.iter().collect()
}
fn dynamic_vector_bytes(&self, _dyn_ty: Type) -> u32 {
16
}
fn name(&self) -> &'static str {
"x64"
}