diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs
index 85ecb0c2a9..bc7eb94f0d 100644
--- a/cranelift/codegen/src/isa/x86/abi.rs
+++ b/cranelift/codegen/src/isa/x86/abi.rs
@@ -6,7 +6,6 @@ use super::settings as isa_settings;
 use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion};
 use crate::cursor::{Cursor, CursorPosition, EncCursor};
 use crate::ir;
-use crate::ir::entities::StackSlot;
 use crate::ir::immediates::Imm64;
 use crate::ir::stackslot::{StackOffset, StackSize};
 use crate::ir::types;
@@ -19,7 +18,6 @@ use crate::regalloc::RegisterSet;
 use crate::result::CodegenResult;
 use crate::stack_layout::layout_stack;
 use alloc::borrow::Cow;
-use alloc::vec::Vec;
 use core::i32;
 use target_lexicon::{PointerWidth, Triple};
 
@@ -44,7 +42,7 @@ static RET_GPRS_WIN_FASTCALL_X64: [RU; 1] = [RU::rax];
 ///
 /// [2] https://blogs.msdn.microsoft.com/oldnewthing/20110302-00/?p=11333 "Although the x64 calling
 /// convention reserves spill space for parameters, you don’t have to use them as such"
-const WIN_SHADOW_STACK_SPACE: i32 = 32;
+const WIN_SHADOW_STACK_SPACE: StackSize = 32;
 
 /// Stack alignment requirement for functions.
 ///
@@ -87,7 +85,7 @@ impl Args {
             WIN_SHADOW_STACK_SPACE
         } else {
             0
-        } as u32;
+        };
 
         Self {
             pointer_bytes: bits / 8,
@@ -501,7 +499,7 @@ fn baldrdash_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) ->
     let word_size = StackSize::from(isa.pointer_bytes());
     let shadow_store_size = if func.signature.call_conv.extends_windows_fastcall() {
-        WIN_SHADOW_STACK_SPACE as u32
+        WIN_SHADOW_STACK_SPACE
     } else {
         0
     };
@@ -525,50 +523,60 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
         panic!("TODO: windows-fastcall: x86-32 not implemented yet");
     }
 
-    let csrs = callee_saved_regs_used(isa, func);
-
     // The reserved stack area is composed of:
-    //   return address + frame pointer + all callee-saved registers + shadow space
+    //   return address + frame pointer + all callee-saved registers
     //
     // Pushing the return address is an implicit function of the `call`
     // instruction. Each of the others we will then push explicitly. Then we
     // will adjust the stack pointer to make room for the rest of the required
     // space for this frame.
-    let word_size = isa.pointer_bytes() as usize;
-    let num_fprs = csrs.iter(FPR).len();
-    let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32;
+    let csrs = callee_saved_regs_used(isa, func);
+    let gpsr_stack_size = ((csrs.iter(GPR).len() + 2) * isa.pointer_bytes() as usize) as u32;
+    let fpsr_stack_size = (csrs.iter(FPR).len() * types::F64X2.bytes() as usize) as u32;
+    let mut csr_stack_size = gpsr_stack_size + fpsr_stack_size;
 
-    // Only create an FPR stack slot if we're going to save FPRs.
-    let fpr_slot = if num_fprs > 0 {
-        // Create a stack slot for FPRs to be preserved in. This is an `ExplicitSlot` because it
-        // seems to most closely map to it as a `StackSlotKind`: FPR preserve/restore should be
-        // through `stack_load` and `stack_store` (see later comment about issue #1198). Even
-        // though in a certain light FPR preserve/restore is "spilling" an argument, regalloc
-        // implies that `SpillSlot` may be eligible for certain optimizations, and we know with
-        // certainty that this space may not be reused in the function, nor moved around.
-        Some(func.create_stack_slot(ir::StackSlotData {
-            kind: ir::StackSlotKind::ExplicitSlot,
-            size: (num_fprs * types::F64X2.bytes() as usize) as u32,
-            offset: None,
-        }))
-    } else {
-        None
-    };
+    // FPRs must be saved with 16-byte alignment; because they follow the GPRs on the stack, align if needed
+    if fpsr_stack_size > 0 {
+        csr_stack_size += gpsr_stack_size & isa.pointer_bytes() as u32;
+    }
 
-    // TODO: eventually use the 32 bytes (shadow store) as spill slot. This currently doesn't work
-    // since cranelift does not support spill slots before incoming args
     func.create_stack_slot(ir::StackSlotData {
         kind: ir::StackSlotKind::IncomingArg,
-        size: csr_stack_size as u32,
-        offset: Some(-(WIN_SHADOW_STACK_SPACE + csr_stack_size)),
+        size: csr_stack_size,
+        offset: Some(-(csr_stack_size as StackOffset)),
     });
 
     let is_leaf = func.is_leaf();
+
+    // If not a leaf function, allocate an explicit stack slot at the end of the space for the callee's shadow space
+    if !is_leaf {
+        // TODO: eventually use the caller-provided shadow store as spill slot space when laying out the stack
+        func.create_stack_slot(ir::StackSlotData {
+            kind: ir::StackSlotKind::ExplicitSlot,
+            size: WIN_SHADOW_STACK_SPACE,
+            offset: None,
+        });
+    }
+
     let total_stack_size = layout_stack(&mut func.stack_slots, is_leaf, STACK_ALIGNMENT)? as i32;
-    let local_stack_size = i64::from(total_stack_size - csr_stack_size);
+
+    // Subtract the GPR saved register size from the local size because pushes are used for the saves
+    let local_stack_size = i64::from(total_stack_size - gpsr_stack_size as i32);
 
     // Add CSRs to function signature
     let reg_type = isa.pointer_type();
+    let sp_arg_index = if fpsr_stack_size > 0 {
+        let sp_arg = ir::AbiParam::special_reg(
+            reg_type,
+            ir::ArgumentPurpose::CalleeSaved,
+            RU::rsp as RegUnit,
+        );
+        let index = func.signature.params.len();
+        func.signature.params.push(sp_arg);
+        Some(index)
+    } else {
+        None
+    };
     let fp_arg = ir::AbiParam::special_reg(
         reg_type,
         ir::ArgumentPurpose::FramePointer,
@@ -601,7 +609,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
         local_stack_size,
         reg_type,
         &csrs,
-        fpr_slot.as_ref(),
+        sp_arg_index.is_some(),
        isa,
    );
 
@@ -612,7 +620,8 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
         local_stack_size,
         reg_type,
         &csrs,
-        fpr_slot.as_ref(),
+        sp_arg_index,
+        isa,
     );
 
     Ok(())
@@ -649,14 +658,20 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
 
     // Add CSRs to function signature
     let reg_type = ir::Type::int(u16::from(pointer_width.bits())).unwrap();
-    if isa.pointer_bits() == 32 {
+    // On X86-32 all parameters, including vmctx, are passed on stack, and we need
+    // to extract vmctx from the stack before we can save the frame pointer.
+    let sp_arg_index = if isa.pointer_bits() == 32 {
         let sp_arg = ir::AbiParam::special_reg(
             reg_type,
             ir::ArgumentPurpose::CalleeSaved,
             RU::rsp as RegUnit,
         );
+        let index = func.signature.params.len();
         func.signature.params.push(sp_arg);
-    }
+        Some(index)
+    } else {
+        None
+    };
     let fp_arg = ir::AbiParam::special_reg(
         reg_type,
         ir::ArgumentPurpose::FramePointer,
@@ -674,11 +689,25 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C
     // Set up the cursor and insert the prologue
     let entry_block = func.layout.entry_block().expect("missing entry block");
     let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block);
-    insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, None, isa);
+    insert_common_prologue(
+        &mut pos,
+        local_stack_size,
+        reg_type,
+        &csrs,
+        sp_arg_index.is_some(),
+        isa,
+    );
 
     // Reset the cursor and insert the epilogue
     let mut pos = pos.at_position(CursorPosition::Nowhere);
-    insert_common_epilogues(&mut pos, local_stack_size, reg_type, &csrs, None);
+    insert_common_epilogues(
+        &mut pos,
+        local_stack_size,
+        reg_type,
+        &csrs,
+        sp_arg_index,
+        isa,
+    );
 
     Ok(())
 }
@@ -690,12 +719,10 @@ fn insert_common_prologue(
     stack_size: i64,
     reg_type: ir::types::Type,
     csrs: &RegisterSet,
-    fpr_slot: Option<&StackSlot>,
+    has_sp_param: bool,
     isa: &dyn TargetIsa,
 ) {
-    // On X86-32 all parameters, including vmctx, are passed on stack, and we need
-    // to extract vmctx from the stack before we can save the frame pointer.
-    let sp = if isa.pointer_bits() == 32 {
+    let sp = if has_sp_param {
         let block = pos.current_block().expect("missing block under cursor");
         let sp = pos.func.dfg.append_block_param(block, reg_type);
         pos.func.locations[sp] = ir::ValueLoc::Reg(RU::rsp as RegUnit);
@@ -799,38 +826,27 @@ fn insert_common_prologue(
         }
     }
 
-    // Now that RSP is prepared for the function, we can use stack slots:
+    // With the stack pointer adjusted, save any callee-saved floating point registers via offset
+    // FPR saves are at the highest addresses of the local frame allocation, immediately following the GPR pushes
     let mut last_fpr_save = None;
-    if let Some(fpr_slot) = fpr_slot {
-        debug_assert!(csrs.iter(FPR).len() != 0);
-        // `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base
-        // address. We are well after postopt could run, so load the CSR region base once here,
-        // instead of hoping that the addr/store will be combined later.
-        // See also: https://github.com/bytecodealliance/wasmtime/pull/1198
-        let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0);
+    for (i, reg) in csrs.iter(FPR).enumerate() {
+        // Append param to entry block
+        let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
 
-        // Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at
-        // function entry.
-        pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16);
+        // Since regalloc has already run, we must assign a location.
+        pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
 
-        let mut fpr_offset = 0;
+        // Offset to where the register is saved relative to RSP, accounting for FPR save alignment
+        let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
+            + (stack_size & isa.pointer_bytes() as i64);
 
-        for reg in csrs.iter(FPR) {
-            // Append param to entry Block
-            let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2);
-
-            // Since regalloc has already run, we must assign a location.
-            pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg);
-
-            last_fpr_save =
-                Some(
-                    pos.ins()
-                        .store(ir::MemFlags::trusted(), csr_arg, stack_addr, fpr_offset),
-                );
-
-            fpr_offset += types::F64X2.bytes() as i32;
-        }
+        last_fpr_save = Some(pos.ins().store(
+            ir::MemFlags::trusted(),
+            csr_arg,
+            sp.expect("FPR save requires SP param"),
+            (stack_size - offset) as i32,
+        ));
     }
 
     pos.func.prologue_end = Some(
@@ -966,13 +982,14 @@ fn insert_common_epilogues(
     stack_size: i64,
     reg_type: ir::types::Type,
     csrs: &RegisterSet,
-    fpr_slot: Option<&StackSlot>,
+    sp_arg_index: Option<usize>,
+    isa: &dyn TargetIsa,
 ) {
     while let Some(block) = pos.next_block() {
         pos.goto_last_inst(block);
         if let Some(inst) = pos.current_inst() {
             if pos.func.dfg[inst].opcode().is_return() {
-                insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, fpr_slot);
+                insert_common_epilogue(inst, stack_size, pos, reg_type, csrs, sp_arg_index, isa);
             }
         }
     }
@@ -986,56 +1003,9 @@ fn insert_common_epilogue(
     pos: &mut EncCursor,
     reg_type: ir::types::Type,
     csrs: &RegisterSet,
-    fpr_slot: Option<&StackSlot>,
+    sp_arg_index: Option<usize>,
+    isa: &dyn TargetIsa,
 ) {
-    // Even though instructions to restore FPRs are inserted first, we have to append them after
-    // restored GPRs to satisfy parameter order in the return.
-    let mut restored_fpr_values = Vec::new();
-
-    // Restore FPRs before we move RSP and invalidate stack slots.
-    let mut first_fpr_load = None;
-    if let Some(fpr_slot) = fpr_slot {
-        debug_assert!(csrs.iter(FPR).len() != 0);
-
-        // `stack_load` is not directly encodable in x86_64 at the moment, so we'll need a base
-        // address. We are well after postopt could run, so load the CSR region base once here,
-        // instead of hoping that the addr/store will be combined later.
-        //
-        // See also: https://github.com/bytecodealliance/wasmtime/pull/1198
-        let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0);
-
-        first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
-
-        // Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at
-        // function exit.
-        pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16);
-
-        let mut fpr_offset = 0;
-
-        for reg in csrs.iter(FPR) {
-            let value = pos.ins().load(
-                types::F64X2,
-                ir::MemFlags::trusted(),
-                stack_addr,
-                fpr_offset,
-            );
-            fpr_offset += types::F64X2.bytes() as i32;
-
-            // Unlike GPRs before, we don't need to step back after reach restoration because FPR
-            // restoration is order-insensitive. Furthermore: we want GPR restoration to begin
-            // after FPR restoration, so that stack adjustments occur after we're done relying on
-            // StackSlot validity.
-            pos.func.locations[value] = ir::ValueLoc::Reg(reg);
-            restored_fpr_values.push(value);
-        }
-    }
-
-    let mut sp_adjust_inst = None;
-    if stack_size > 0 {
-        sp_adjust_inst = Some(pos.ins().adjust_sp_up_imm(Imm64::new(stack_size)));
-    }
-
     // Insert the pop of the frame pointer
     let fp_pop = pos.ins().x86_pop(reg_type);
     let fp_pop_inst = pos.prev_inst().unwrap();
@@ -1046,13 +1016,47 @@ fn insert_common_epilogue(
     let mut first_csr_pop_inst = None;
     for reg in csrs.iter(GPR) {
         let csr_pop = pos.ins().x86_pop(reg_type);
-        first_csr_pop_inst = Some(pos.prev_inst().unwrap());
+        first_csr_pop_inst = pos.prev_inst();
+        assert!(first_csr_pop_inst.is_some());
         pos.func.locations[csr_pop] = ir::ValueLoc::Reg(reg);
         pos.func.dfg.append_inst_arg(inst, csr_pop);
     }
 
-    for value in restored_fpr_values.into_iter() {
-        pos.func.dfg.append_inst_arg(inst, value);
+    // Insert the adjustment of SP
+    let mut sp_adjust_inst = None;
+    if stack_size > 0 {
+        pos.ins().adjust_sp_up_imm(Imm64::new(stack_size));
+        sp_adjust_inst = pos.prev_inst();
+        assert!(sp_adjust_inst.is_some());
+    }
+
+    let mut first_fpr_load = None;
+    if let Some(index) = sp_arg_index {
+        let sp = pos
+            .func
+            .dfg
+            .block_params(pos.func.layout.entry_block().unwrap())[index];
+
+        // Insert the FPR loads (unlike the GPRs, which are stack pops, these are in-order loads)
+        for (i, reg) in csrs.iter(FPR).enumerate() {
+            // Offset to where the register is saved relative to RSP, accounting for FPR save alignment
+            let offset = ((i + 1) * types::F64X2.bytes() as usize) as i64
+                + (stack_size & isa.pointer_bytes() as i64);
+
+            let value = pos.ins().load(
+                types::F64X2,
+                ir::MemFlags::trusted(),
+                sp,
+                (stack_size - offset) as i32,
+            );
+
+            first_fpr_load.get_or_insert(pos.current_inst().expect("current inst"));
+
+            pos.func.locations[value] = ir::ValueLoc::Reg(reg);
+            pos.func.dfg.append_inst_arg(inst, value);
+        }
+    } else {
+        assert!(csrs.iter(FPR).len() == 0);
     }
 
     pos.func.epilogues_start.push(
diff --git a/cranelift/codegen/src/isa/x86/unwind/winx64.rs b/cranelift/codegen/src/isa/x86/unwind/winx64.rs
index 60aff23f19..4a9af95be9 100644
--- a/cranelift/codegen/src/isa/x86/unwind/winx64.rs
+++ b/cranelift/codegen/src/isa/x86/unwind/winx64.rs
@@ -28,22 +28,7 @@ pub(crate) fn create_unwind_info(
     let mut prologue_size = 0;
     let mut unwind_codes = Vec::new();
     let mut found_end = false;
-
-    // Have we saved at least one FPR? if so, we might have to check additional constraints.
-    let mut saved_fpr = false;
-
-    // In addition to the min offset for a callee-save, we need to know the offset from the
-    // frame base to the stack pointer, so that we can record an unwind offset that spans only
-    // to the end of callee-save space.
-    let mut static_frame_allocation_size = 0u32;
-
-    // For the time being, FPR preservation is split into a stack_addr and later store/load.
-    // Store the register used for stack store and ensure it is the same register with no
-    // intervening changes to the frame size.
-    let mut callee_save_region_reg = None;
-    // Also record the callee-save region's offset from RSP, because it must be added to FPR
-    // save offsets to compute an offset from the frame base.
-    let mut callee_save_offset = None;
+    let mut xmm_save_count: u8 = 0;
 
     for (offset, inst, size) in func.inst_offsets(entry_block, &isa.encoding_info()) {
         // x64 ABI prologues cannot exceed 255 bytes in length
@@ -60,8 +45,6 @@ pub(crate) fn create_unwind_info(
             InstructionData::Unary { opcode, arg } => {
                 match opcode {
                     Opcode::X86Push => {
-                        static_frame_allocation_size += 8;
-
                         unwind_codes.push(UnwindCode::PushRegister {
                             offset: unwind_offset,
                             reg: GPR.index_of(func.locations[arg].unwrap_reg()) as u8,
@@ -70,7 +53,6 @@ pub(crate) fn create_unwind_info(
                     Opcode::AdjustSpDown => {
                         let stack_size =
                             stack_size.expect("expected a previous stack size instruction");
-                        static_frame_allocation_size += stack_size;
 
                         // This is used when calling a stack check function
                         // We need to track the assignment to RAX which has the size of the stack
@@ -85,10 +67,6 @@ pub(crate) fn create_unwind_info(
             InstructionData::CopySpecial { src, dst, .. } => {
                 if let Some(frame_register) = frame_register {
                     if src == (RU::rsp as RegUnit) && dst == frame_register {
-                        // Constructing an rbp-based stack frame, so the static frame
-                        // allocation restarts at 0 from here.
-                        static_frame_allocation_size = 0;
-
                         unwind_codes.push(UnwindCode::SetFramePointer {
                             offset: unwind_offset,
                             sp_offset: 0,
@@ -113,7 +91,7 @@ pub(crate) fn create_unwind_info(
                         let imm: i64 = imm.into();
                         assert!(imm <= core::u32::MAX as i64);
 
-                        static_frame_allocation_size += imm as u32;
+                        stack_size = Some(imm as u32);
 
                         unwind_codes.push(UnwindCode::StackAlloc {
                             offset: unwind_offset,
@@ -123,52 +101,27 @@ pub(crate) fn create_unwind_info(
                     _ => {}
                 }
             }
-            InstructionData::StackLoad {
-                opcode: Opcode::StackAddr,
-                stack_slot,
-                offset: _,
-            } => {
-                let result = func.dfg.inst_results(inst).get(0).unwrap();
-                if let ValueLoc::Reg(frame_reg) = func.locations[*result] {
-                    callee_save_region_reg = Some(frame_reg);
-
-                    // Figure out the offset in the call frame that `frame_reg` will have.
-                    let frame_size = func
-                        .stack_slots
-                        .layout_info
-                        .expect("func's stack slots have layout info if stack operations exist")
-                        .frame_size;
-                    // Because we're well after the prologue has been constructed, stack slots
-                    // must have been laid out...
-                    let slot_offset = func.stack_slots[stack_slot]
-                        .offset
-                        .expect("callee-save slot has an offset computed");
-                    let frame_offset = frame_size as i32 + slot_offset;
-
-                    callee_save_offset = Some(frame_offset as u32);
-                }
-            }
             InstructionData::Store {
                 opcode: Opcode::Store,
                 args: [arg1, arg2],
-                flags: _flags,
                 offset,
+                ..
             } => {
-                if let (ValueLoc::Reg(ru), ValueLoc::Reg(base_ru)) =
+                if let (ValueLoc::Reg(src), ValueLoc::Reg(dst)) =
                     (func.locations[arg1], func.locations[arg2])
                 {
-                    if Some(base_ru) == callee_save_region_reg {
-                        let offset_int: i32 = offset.into();
-                        assert!(offset_int >= 0, "negative fpr offset would store outside the stack frame, and is almost certainly an error");
-                        let offset_int: u32 = offset_int as u32 + callee_save_offset.expect("FPR presevation requires an FPR save region, which has some stack offset");
-                        if FPR.contains(ru) {
-                            saved_fpr = true;
-                            unwind_codes.push(UnwindCode::SaveXmm {
-                                offset: unwind_offset,
-                                reg: ru as u8,
-                                stack_offset: offset_int,
-                            });
-                        }
+                    // If this is a save of an FPR, record an unwind operation
+                    // Note: the stack_offset here is relative to an adjusted SP
+                    // This will be fixed up later to be based on the frame pointer offset
+                    if dst == (RU::rsp as RegUnit) && FPR.contains(src) {
+                        let offset: i32 = offset.into();
+                        unwind_codes.push(UnwindCode::SaveXmm {
+                            offset: unwind_offset,
+                            reg: src as u8,
+                            stack_offset: offset as u32,
+                        });
+
+                        xmm_save_count += 1;
                     }
                 }
             }
@@ -183,41 +136,41 @@ pub(crate) fn create_unwind_info(
 
     assert!(found_end);
 
-    if saved_fpr {
-        if static_frame_allocation_size > 240 && saved_fpr {
-            warn!("stack frame is too large ({} bytes) to use with Windows x64 SEH when preserving FPRs. \
-                This is a Cranelift implementation limit, see \
-                https://github.com/bytecodealliance/wasmtime/issues/1475",
-                static_frame_allocation_size);
-            return Err(CodegenError::ImplLimitExceeded);
+    let mut frame_register_offset = 0;
+    if xmm_save_count > 0 {
+        // If there are XMM saves, determine the number of 16-byte slots used for all CSRs (including GPRs)
+        // The "frame register offset" will point at the last slot used (i.e. the last saved FPR)
+        // Assumption: each FPR is stored at a lower address than the previous one
+        let mut last_stack_offset = None;
+        let mut fpr_save_count: u8 = 0;
+        let mut gpr_push_count: u8 = 0;
+        for code in unwind_codes.iter_mut() {
+            match code {
+                UnwindCode::SaveXmm { stack_offset, .. } => {
+                    if let Some(last) = last_stack_offset {
+                        assert!(last > *stack_offset);
+                    }
+                    last_stack_offset = Some(*stack_offset);
+                    fpr_save_count += 1;
+                    *stack_offset = (xmm_save_count - fpr_save_count) as u32 * 16;
+                }
+                UnwindCode::PushRegister { .. } => {
+                    gpr_push_count += 1;
+                }
+                _ => {}
+            }
         }
-        // Only test static frame size is 16-byte aligned when an FPR is saved to avoid
-        // panicking when alignment is elided because no FPRs are saved and no child calls are
-        // made.
-        assert!(
-            static_frame_allocation_size % 16 == 0,
-            "static frame allocation must be a multiple of 16"
-        );
-    }
+        assert_eq!(fpr_save_count, xmm_save_count);
 
-    // Hack to avoid panicking unnecessarily. Because Cranelift generates prologues with RBP at
-    // one end of the call frame, and RSP at the other, required offsets are arbitrarily large.
-    // Windows x64 SEH only allows this offset be up to 240 bytes, however, meaning large
-    // frames are inexpressible, and we cannot actually compile the function. In case there are
-    // no preserved FPRs, we can lie without error and claim the offset to RBP is 0 - nothing
-    // will actually check it. This, then, avoids panics when compiling functions with large
-    // call frames.
-    let reported_frame_offset = if saved_fpr {
-        (static_frame_allocation_size / 16) as u8
-    } else {
-        0
-    };
+        // Account for alignment space when there's an odd number of GPR pushes
+        frame_register_offset = fpr_save_count + ((gpr_push_count + 1) / 2);
+    }
 
     Ok(Some(UnwindInfo {
         flags: 0, // this assumes cranelift functions have no SEH handlers
         prologue_size: prologue_size as u8,
         frame_register: frame_register.map(|r| GPR.index_of(r) as u8),
-        frame_register_offset: reported_frame_offset,
+        frame_register_offset,
         unwind_codes,
     }))
 }
@@ -284,7 +237,7 @@ mod tests {
                     },
                     UnwindCode::StackAlloc {
                         offset: 9,
-                        size: 64 + 32
+                        size: 64
                     }
                 ]
             }
@@ -303,7 +256,7 @@ mod tests {
             0x03, // Unwind code count (1 for stack alloc, 1 for save frame reg, 1 for push reg)
             0x05, // Frame register + offset (RBP with 0 offset)
             0x09, // Prolog offset
-            0xB2, // Operation 2 (small stack alloc), size = 0xB slots (e.g. (0xB * 8) + 8 = 96 (64 + 32) bytes)
+            0x72, // Operation 2 (small stack alloc), size = 0x7 slots (e.g. (0x7 * 8) + 8 = 64 bytes)
             0x05, // Prolog offset
             0x03, // Operation 3 (save frame register), stack pointer offset = 0
             0x02, // Prolog offset
@@ -349,7 +302,7 @@ mod tests {
                     },
                     UnwindCode::StackAlloc {
                         offset: 27,
-                        size: 10000 + 32
+                        size: 10000
                     }
                 ]
             }
@@ -369,8 +322,8 @@ mod tests {
             0x05, // Frame register + offset (RBP with 0 offset)
             0x1B, // Prolog offset
             0x01, // Operation 1 (large stack alloc), size is scaled 16-bits (info = 0)
-            0xE6, // Low size byte
-            0x04, // High size byte (e.g. 0x04E6 * 8 = 100032 (10000 + 32) bytes)
+            0xE2, // Low size byte
+            0x04, // High size byte (e.g. 0x04E2 * 8 = 10000 bytes)
             0x05, // Prolog offset
             0x03, // Operation 3 (save frame register), stack pointer offset = 0
             0x02, // Prolog offset
@@ -414,7 +367,7 @@ mod tests {
                     },
                     UnwindCode::StackAlloc {
                         offset: 27,
-                        size: 1000000 + 32
+                        size: 1000000
                     }
                 ]
             }
@@ -434,10 +387,10 @@ mod tests {
             0x05, // Frame register + offset (RBP with 0 offset)
             0x1B, // Prolog offset
             0x11, // Operation 1 (large stack alloc), size is unscaled 32-bits (info = 1)
-            0x60, // Byte 1 of size
+            0x40, // Byte 1 of size
             0x42, // Byte 2 of size
             0x0F, // Byte 3 of size
-            0x00, // Byte 4 of size (size is 0xF4260 = 1000032 (1000000 + 32) bytes)
+            0x00, // Byte 4 of size (size is 0xF4240 = 1000000 bytes)
             0x05, // Prolog offset
             0x03, // Operation 3 (save frame register), stack pointer offset = 0
             0x02, // Prolog offset
diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif
index bf77c0baef..8e8d356479 100644
--- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif
+++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif
@@ -8,29 +8,57 @@ function %one_arg(i64) windows_fastcall {
 block0(v0: i64):
     return
 }
-; check: function %one_arg(i64 [%rcx], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
-; nextln: ss0 = incoming_arg 16, offset -48
+; check: function %one_arg(i64 [%rcx], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: i64 [%rbp]):
+; nextln: x86_push v1
+; nextln: copy_special %rsp -> %rbp
+; nextln: v2 = x86_pop.i64
+; nextln: return v2
+; nextln: }
 
 ; check if we still use registers for 4 arguments
 function %four_args(i64, i64, i64, i64) windows_fastcall {
 block0(v0: i64, v1: i64, v2: i64, v3: i64):
     return
 }
-; check: function %four_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; check: function %four_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: i64 [%rdx], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: v5 = x86_pop.i64
+; nextln: return v5
+; nextln: }
 
 ; check if float arguments are passed through XMM registers
 function %four_float_args(f64, f64, f64, f64) windows_fastcall {
 block0(v0: f64, v1: f64, v2: f64, v3: f64):
     return
 }
-; check: function %four_float_args(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; check: function %four_float_args(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: v5 = x86_pop.i64
+; nextln: return v5
+; nextln: }
 
 ; check if we use stack space for > 4 arguments
 function %five_args(i64, i64, i64, i64, i64) windows_fastcall {
 block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64):
     return
 }
-; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 8, offset 32
+; nextln: ss1 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: i64 [%rdx], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [ss0], v5: i64 [%rbp]):
+; nextln: x86_push v5
+; nextln: copy_special %rsp -> %rbp
+; nextln: v6 = x86_pop.i64
+; nextln: return v6
+; nextln: }
 
 ; check that we preserve xmm6 and above if we're using them locally
 function %float_callee_saves(f64, f64, f64, f64) windows_fastcall {
@@ -40,38 +68,51 @@ block0(v0: f64, v1: f64, v2: f64, v3: f64):
     [-, %xmm7] v5 = fadd v0, v1
     return
 }
-; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall {
-; nextln: ss0 = explicit_slot 32, offset -80
-; nextln: ss1 = incoming_arg 16, offset -48
-; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]):
-; nextln: x86_push v6
-; nextln: copy_special %rsp -> %rbp
-; nextln: adjust_sp_down_imm 64
-; nextln: v7 = stack_addr.i64 ss0
-; nextln: store notrap aligned v8, v7
-; nextln: store notrap aligned v9, v7+16
-; check: v10 = stack_addr.i64 ss0
-; nextln: v11 = load.f64x2 notrap aligned v10
-; nextln: v12 = load.f64x2 notrap aligned v10+16
-; nextln: adjust_sp_up_imm 64
-; nextln: v13 = x86_pop.i64
-; nextln: v13, v11, v12
+; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 csr [%rsp], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall {
+; nextln: ss0 = incoming_arg 48, offset -48
+; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rsp], v7: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]):
+; nextln: x86_push v7
+; nextln: copy_special %rsp -> %rbp
+; nextln: adjust_sp_down_imm 32
+; nextln: store notrap aligned v8, v6+16
+; nextln: store notrap aligned v9, v6
+; nextln: v11 = load.f64x2 notrap aligned v6+16
+; nextln: v12 = load.f64x2 notrap aligned v6
+; nextln: adjust_sp_up_imm 32
+; nextln: v10 = x86_pop.i64
+; nextln: return v10, v11, v12
+; nextln: }
 
 function %mixed_int_float(i64, f64, i64, f32) windows_fastcall {
 block0(v0: i64, v1: f64, v2: i64, v3: f32):
     return
 }
-; check: function %mixed_int_float(i64 [%rcx], f64 [%xmm1], i64 [%r8], f32 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; check: function %mixed_int_float(i64 [%rcx], f64 [%xmm1], i64 [%r8], f32 [%xmm3], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: i64 [%rcx], v1: f64 [%xmm1], v2: i64 [%r8], v3: f32 [%xmm3], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: v5 = x86_pop.i64
+; nextln: return v5
+; nextln: }
 
 function %ret_val_float(f32, f64, i64, i64) -> f64 windows_fastcall {
 block0(v0: f32, v1: f64, v2: i64, v3: i64):
     return v1
 }
-; check: function %ret_val_float(f32 [%xmm0], f64 [%xmm1], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> f64 [%xmm0], i64 fp [%rbp] windows_fastcall {
+; check: function %ret_val_float(f32 [%xmm0], f64 [%xmm1], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> f64 [%xmm0], i64 fp [%rbp] windows_fastcall {
+; nextln: ss0 = incoming_arg 16, offset -16
+; check: block0(v0: f32 [%xmm0], v1: f64 [%xmm1], v2: i64 [%r8], v3: i64 [%r9], v4: i64 [%rbp]):
+; nextln: x86_push v4
+; nextln: copy_special %rsp -> %rbp
+; nextln: regmove v1, %xmm1 -> %xmm0
+; nextln: v5 = x86_pop.i64
+; nextln: return v1, v5
+; nextln: }
 
 function %internal_stack_arg_function_call(i64) -> i64 windows_fastcall {
-    fn0 = %foo(i64, i64, i64, i64) -> i64
-    fn1 = %foo2(i64, i64, i64, i64) -> i64
+    fn0 = %foo(i64, i64, i64, i64) -> i64 windows_fastcall
+    fn1 = %foo2(i64, i64, i64, i64) -> i64 windows_fastcall
 block0(v0: i64):
     v1 = load.i64 v0+0
     v2 = load.i64 v0+8
@@ -94,3 +135,100 @@ block0(v0: i64):
     store.i64 v9, v0+72
     return v10
 }
+; check: function %internal_stack_a(i64 [%rcx], i64 fp [%rbp], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15]) -> i64 [%rax], i64 fp [%rbp], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15] windows_fastcall {
+; nextln: ss0 = spill_slot 8, offset -56
+; nextln: ss1 = spill_slot 8, offset -64
+; nextln: ss2 = spill_slot 8, offset -72
+; nextln: ss3 = spill_slot 8, offset -80
+; nextln: ss4 = spill_slot 8, offset -88
+; nextln: ss5 = spill_slot 8, offset -96
+; nextln: ss6 = spill_slot 8, offset -104
+; nextln: ss7 = spill_slot 8, offset -112
+; nextln: ss8 = spill_slot 8, offset -120
+; nextln: ss9 = spill_slot 8, offset -128
+; nextln: ss10 = incoming_arg 48, offset -48
+; nextln: ss11 = explicit_slot 32, offset -160
+; nextln: sig0 = (i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9]) -> i64 [%rax] windows_fastcall
+; nextln: sig1 = (i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9]) -> i64 [%rax] windows_fastcall
+; nextln: fn0 = %foo sig0
+; nextln: fn1 = %foo2 sig1
+; check: block0(v11: i64 [%rcx], v52: i64 [%rbp], v53: i64 [%r12], v54: i64 [%r13], v55: i64 [%r14], v56: i64 [%r15]):
+; nextln: x86_push v52
+; nextln: copy_special %rsp -> %rbp
+; nextln: x86_push v53
+; nextln: x86_push v54
+; nextln: x86_push v55
+; nextln: x86_push v56
+; nextln: adjust_sp_down_imm 112
+; nextln: v0 = spill v11
+; nextln: v12 = copy_to_ssa.i64 %rcx
+; nextln: v13 = load.i64 v12
+; nextln: v1 = spill v13
+; nextln: v14 = fill_nop v0
+; nextln: v15 = load.i64 v14+8
+; nextln: v2 = spill v15
+; nextln: v16 = fill_nop v0
+; nextln: v17 = load.i64 v16+16
+; nextln: v3 = spill v17
+; nextln: v18 = fill_nop v0
+; nextln: v19 = load.i64 v18+24
+; nextln: v4 = spill v19
+; nextln: v20 = fill_nop v0
+; nextln: v21 = load.i64 v20+32
+; nextln: v5 = spill v21
+; nextln: v22 = fill_nop v0
+; nextln: v23 = load.i64 v22+40
+; nextln: v6 = spill v23
+; nextln: v24 = fill_nop v0
+; nextln: v25 = load.i64 v24+48
+; nextln: v7 = spill v25
+; nextln: v26 = fill_nop v0
+; nextln: v27 = load.i64 v26+56
+; nextln: v8 = spill v27
+; nextln: v28 = fill_nop v0
+; nextln: v29 = load.i64 v28+64
+; nextln: v9 = spill v29
+; nextln: v30 = fill v1
+; nextln: v31 = fill v2
+; nextln: v32 = fill v3
+; nextln: v33 = fill v4
+; nextln: regmove v30, %r15 -> %rcx
+; nextln: regmove v31, %r14 -> %rdx
+; nextln: regmove v32, %r13 -> %r8
+; nextln: regmove v33, %r12 -> %r9
+; nextln: v10 = call fn0(v30, v31, v32, v33)
+; nextln: v34 = fill v1
+; nextln: v35 = fill v0
+; nextln: store v34, v35+8
+; nextln: v36 = fill v2
+; nextln: v37 = fill_nop v0
+; nextln: store v36, v37+16
+; nextln: v38 = fill v3
+; nextln: v39 = fill_nop v0
+; nextln: store v38, v39+24
+; nextln: v40 = fill v4
+; nextln: v41 = fill_nop v0
+; nextln: store v40, v41+32
+; nextln: v42 = fill v5
+; nextln: v43 = fill_nop v0
+; nextln: store v42, v43+40
+; nextln: v44 = fill v6
+; nextln: v45 = fill_nop v0
+; nextln: store v44, v45+48
+; nextln: v46 = fill v7
+; nextln: v47 = fill_nop v0
+; nextln: store v46, v47+56
+; nextln: v48 = fill v8
+; nextln: v49 = fill_nop v0
+; nextln: store v48, v49+64
+; nextln: v50 = fill v9
+; nextln: v51 = fill_nop v0
+; nextln: store v50, v51+72
+; nextln: adjust_sp_up_imm 112
+; nextln: v61 = x86_pop.i64
+; nextln: v60 = x86_pop.i64
+; nextln: v59 = x86_pop.i64
+; nextln: v58 = x86_pop.i64
+; nextln: v57 = x86_pop.i64
+; nextln: return v10, v57, v58, v59, v60, v61
+; nextln: }
\ No newline at end of file
diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif
index 6997238bfb..4e5d4f18f3 100644
--- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif
+++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif
@@ -3,13 +3,35 @@ set opt_level=speed_and_size
 set is_pic
 target x86_64 haswell
 
-; check the unwind information with a function with no args
-function %no_args() windows_fastcall {
+; check the unwind information with a leaf function with no args
+function %no_args_leaf() windows_fastcall {
 block0:
     return
 }
 ; sameln: version: 1
 ; nextln: flags: 0
+; nextln: prologue size: 4
+; nextln: frame register: 5
+; nextln: frame register offset: 0
+; nextln: unwind codes: 2
+; nextln:
+; nextln: offset: 1
+; nextln: op: PushNonvolatileRegister
+; nextln: info: 5
+; nextln:
+; nextln: offset: 4
+; nextln: op: SetFramePointer
+; nextln: info: 0
+
+; check the unwind information with a non-leaf function with no args
+function %no_args() windows_fastcall {
+    fn0 = %foo()
+block0:
+    call fn0()
+    return
+}
+; sameln: version: 1
+; nextln: flags: 0
 ; nextln: prologue size: 8
 ; nextln: frame register: 5
 ; nextln: frame register offset: 0
@@ -51,7 +73,7 @@ block0:
 ; nextln: offset: 17
 ; nextln: op: LargeStackAlloc
 ; nextln: info: 0
-; nextln: value: 12504 (u16)
+; nextln: value: 12500 (u16)
 
 ; check a function with large-sized stack alloc
 function %large_stack() windows_fastcall {
@@ -77,7 +99,7 @@ block0:
 ; nextln: offset: 17
 ; nextln: op: LargeStackAlloc
 ; nextln: info: 1
-; nextln: value: 524320 (u32)
+; nextln: value: 524288 (u32)
 
 function %fpr_with_function_call(i64, i64) windows_fastcall {
     fn0 = %foo(f64, f64, i64, i64, i64) windows_fastcall;
@@ -113,9 +135,9 @@ block0(v0: i64, v1: i64):
 ;
 ; sameln: version: 1
 ; nextln: flags: 0
-; nextln: prologue size: 25
+; nextln: prologue size: 22
 ; nextln: frame register: 5
-; nextln: frame register offset: 12
+; nextln: frame register offset: 2
 ; nextln: unwind codes: 5
 ; nextln:
 ; nextln: offset: 1
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 5
 ; nextln:
 ; nextln: offset: 4
 ; nextln: op: SetFramePointer
 ; nextln: info: 0
 ; nextln:
 ; nextln: offset: 17
 ; nextln: op: LargeStackAlloc
 ; nextln: info: 0
 ; nextln: value: 23 (u16)
 ; nextln:
-; nextln: offset: 25
+; nextln: offset: 22
 ; nextln: op: SaveXmm128
 ; nextln: info: 15
-; nextln: value: 3 (u16)
+; nextln: value: 0 (u16)
 
 ; check a function that has CSRs
 function %lots_of_registers(i64, i64) windows_fastcall {
@@ -191,9 +213,9 @@ block0(v0: i64, v1: i64):
 }
 ; sameln: version: 1
 ; nextln: flags: 0
-; nextln: prologue size: 41
+; nextln: prologue size: 35
 ; nextln: frame register: 5
-; nextln: frame register offset: 10
+; nextln: frame register offset: 7
 ; nextln: unwind codes: 13
 ; nextln:
 ; nextln: offset: 1
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 5
 ; nextln:
 ; nextln: offset: 4
 ; nextln: op: SetFramePointer
 ; nextln: info: 0
 ; nextln:
 ; nextln: offset: 5
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 3
 ; nextln:
 ; nextln: offset: 6
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 6
 ; nextln:
 ; nextln: offset: 7
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 7
 ; nextln:
 ; nextln: offset: 9
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 12
 ; nextln:
 ; nextln: offset: 11
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 13
 ; nextln:
 ; nextln: offset: 13
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 14
 ; nextln:
 ; nextln: offset: 15
 ; nextln: op: PushNonvolatileRegister
 ; nextln: info: 15
 ; nextln:
 ; nextln: offset: 19
 ; nextln: op: SmallStackAlloc
-; nextln: info: 12
+; nextln: info: 8
 ; nextln:
-; nextln: offset: 31
+; nextln: offset: 24
 ; nextln: op: SaveXmm128
 ; nextln: info: 6
-; nextln: value: 0 (u16)
+; nextln: value: 2 (u16)
 ; nextln:
-; nextln: offset: 36
+; nextln: offset: 29
 ; nextln: op: SaveXmm128
 ; nextln: info: 7
 ; nextln: value: 1 (u16)
 ; nextln:
-; nextln: offset: 41
+; nextln: offset: 35
 ; nextln: op: SaveXmm128
 ; nextln: info: 8
-; nextln: value: 2 (u16)
+; nextln: value: 0 (u16)
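
For readers following the offset arithmetic in this patch, here is a minimal standalone sketch of the two computations it introduces: the SP-relative offset at which each callee-saved FPR is stored, and the SEH "frame register offset" reported in the unwind info. The helper names and the driver below are illustrative only, not part of the Cranelift API; the constants mirror types::F64X2.bytes() (16) and isa.pointer_bytes() (8) on x86-64.

// SP-relative offset at which the `index`-th callee-saved FPR is stored.
// Restates the arithmetic in `insert_common_prologue` above; not actual Cranelift API.
fn fpr_save_offset(stack_size: i64, index: usize) -> i64 {
    const XMM_BYTES: i64 = 16; // types::F64X2.bytes()
    const WORD_BYTES: i64 = 8; // isa.pointer_bytes() on x86-64
    // Saves occupy the highest addresses of the local allocation; the
    // `stack_size & 8` term skips the alignment word inserted when an odd
    // number of 8-byte GPR pushes precedes the 16-byte-aligned FPR area.
    let from_top = (index as i64 + 1) * XMM_BYTES + (stack_size & WORD_BYTES);
    stack_size - from_top
}

// SEH "frame register offset", measured in 16-byte slots, as computed in
// `create_unwind_info` above.
fn frame_register_offset(fpr_save_count: u8, gpr_push_count: u8) -> u8 {
    // One slot per saved XMM, plus the 8-byte GPR pushes (including RBP)
    // rounded up to whole 16-byte slots.
    fpr_save_count + (gpr_push_count + 1) / 2
}

fn main() {
    // Matches the %float_callee_saves expectation: a 32-byte local frame
    // holding xmm6/xmm7 stores them at [rsp+16] and [rsp+0].
    assert_eq!(fpr_save_offset(32, 0), 16);
    assert_eq!(fpr_save_offset(32, 1), 0);
    // Matches %lots_of_registers: 3 XMM saves and 8 GPR pushes give offset 7.
    assert_eq!(frame_register_offset(3, 8), 7);
    // Matches %fpr_with_function_call: 1 XMM save and the RBP push give offset 2.
    assert_eq!(frame_register_offset(1, 1), 2);
}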