Many multi-value returns (#1147)

* Add x86 encodings for `bint` converting to `i8` and `i16`

* Introduce tests for many multi-value returns

* Support arbitrary numbers of return values

This commit implements support for returning an arbitrary number of return
values from a function. During legalization we transform multi-value signatures
to take a struct return ("sret") return pointer, instead of returning its values
in registers. Callers allocate the sret space in their stack frame and pass a
pointer to it into the callee, and once the callee returns to them, they load
the return values back out of the sret stack slot. The callee's return
operations are legalized to store the return values through the given sret
pointer.

* Keep track of old, pre-legalized signatures

When legalizing a call or return for its new legalized signature, we may need to
look at the old signature in order to figure out how to legalize the call or
return.

* Add test for multi-value returns and `call_indirect`

* Encode bool -> int x86 instructions in a loop

* Rename `Signature::uses_sret` to `Signature::uses_struct_return_param`

* Rename `p` to `param`

* Add a clarifying comment in `num_registers_required`

* Rename `num_registers_required` to `num_return_registers_required`

* Re-add newline

* Handle already-assigned parameters in `num_return_registers_required`

* Document what some debug assertions are checking for

* Make "illegalizing" closure's control flow simpler

* Add unit tests and comments for our rounding-up-to-the-next-multiple-of-a-power-of-2 function

* Use `append_inst_arg` instead of doing the same thing manually

* Fix grammar in comment

* Add `Signature::uses_special_{param,return}` helper functions

* Inline the definition of `legalize_type_for_sret_load` for readability

* Move sret legalization debug assertions out into their own function

* Add `round_up_to_multiple_of_type_align` helper for readability

* Add a debug assertion that we aren't removing the wrong return value

* Rename `RetPtr` stack slots to `StructReturnSlot`

* Make `legalize_type_for_sret_store` more symmetrical to `legalized_type_for_sret`

* rustfmt

* Remove unnecessary loop labels

* Do not pre-assign offsets to struct return stack slots

Instead, let the existing frame layout algorithm decide where they should go.

* Expand "sret" into explicit "struct return" in doc comment

* typo: "than" -> "then" in comment

* Fold test's debug message into the assertion itself
This commit is contained in:
Nick Fitzgerald
2019-11-05 14:36:03 -08:00
committed by GitHub
parent 45fb377457
commit a49483408c
29 changed files with 3206 additions and 69 deletions

View File

@@ -1420,27 +1420,24 @@ pub(crate) fn define(
// or 1.
//
// Encode movzbq as movzbl, because it's equivalent and shorter.
e.enc32(
bint.bind(I32).bind(B1),
rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
);
e.enc64(
bint.bind(I64).bind(B1),
rec_urm_noflags.opcodes(&MOVZX_BYTE).rex(),
);
e.enc64(
bint.bind(I64).bind(B1),
rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
);
e.enc64(
bint.bind(I32).bind(B1),
rec_urm_noflags.opcodes(&MOVZX_BYTE).rex(),
);
e.enc64(
bint.bind(I32).bind(B1),
rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
);
for &to in &[I8, I16, I32, I64] {
for &from in &[B1, B8] {
e.enc64(
bint.bind(to).bind(from),
rec_urm_noflags.opcodes(&MOVZX_BYTE).rex(),
);
e.enc64(
bint.bind(to).bind(from),
rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
);
if to != I64 {
e.enc32(
bint.bind(to).bind(from),
rec_urm_noflags_abcd.opcodes(&MOVZX_BYTE),
);
}
}
}
// Numerical conversions.

View File

@@ -4,6 +4,7 @@
//! `TargetIsa::legalize_signature()` method.
use crate::ir::{AbiParam, ArgumentExtension, ArgumentLoc, Type};
use alloc::borrow::Cow;
use alloc::vec::Vec;
use core::cmp::Ordering;
@@ -86,7 +87,9 @@ pub trait ArgAssigner {
/// Legalize the arguments in `args` using the given argument assigner.
///
/// This function can be used for both arguments and return values.
pub fn legalize_args<AA: ArgAssigner>(args: &mut Vec<AbiParam>, aa: &mut AA) {
pub fn legalize_args<AA: ArgAssigner>(args: &[AbiParam], aa: &mut AA) -> Option<Vec<AbiParam>> {
let mut args = Cow::Borrowed(args);
// Iterate over the arguments.
// We may need to mutate the vector in place, so don't use a normal iterator, and clone the
// argument to avoid holding a reference.
@@ -102,20 +105,25 @@ pub fn legalize_args<AA: ArgAssigner>(args: &mut Vec<AbiParam>, aa: &mut AA) {
match aa.assign(&arg) {
// Assign argument to a location and move on to the next one.
ArgAction::Assign(loc) => {
args[argno].location = loc;
args.to_mut()[argno].location = loc;
argno += 1;
}
// Split this argument into two smaller ones. Then revisit both.
ArgAction::Convert(conv) => {
let value_type = conv.apply(arg.value_type);
let new_arg = AbiParam { value_type, ..arg };
args[argno].value_type = value_type;
args.to_mut()[argno].value_type = value_type;
if conv.is_split() {
args.insert(argno + 1, new_arg);
args.to_mut().insert(argno + 1, new_arg);
}
}
}
}
match args {
Cow::Borrowed(_) => None,
Cow::Owned(a) => Some(a),
}
}
/// Determine the right action to take when passing a `have` value type to a call signature where

View File

@@ -62,6 +62,9 @@ pub struct DataFlowGraph {
/// well as the external function references.
pub signatures: PrimaryMap<SigRef, Signature>,
/// The pre-legalization signature for each entry in `signatures`, if any.
pub old_signatures: SecondaryMap<SigRef, Option<Signature>>,
/// External function references. These are functions that can be called directly.
pub ext_funcs: PrimaryMap<FuncRef, ExtFuncData>,
@@ -85,6 +88,7 @@ impl DataFlowGraph {
value_lists: ValueListPool::new(),
values: PrimaryMap::new(),
signatures: PrimaryMap::new(),
old_signatures: SecondaryMap::new(),
ext_funcs: PrimaryMap::new(),
values_labels: None,
constants: ConstantPool::new(),

View File

@@ -55,6 +55,53 @@ impl Signature {
pub fn special_param_index(&self, purpose: ArgumentPurpose) -> Option<usize> {
self.params.iter().rposition(|arg| arg.purpose == purpose)
}
/// Find the index of a presumed unique special-purpose return value.
pub fn special_return_index(&self, purpose: ArgumentPurpose) -> Option<usize> {
    self.returns.iter().rposition(|arg| arg.purpose == purpose)
}
/// Does this signature have a parameter whose `ArgumentPurpose` is
/// `purpose`?
pub fn uses_special_param(&self, purpose: ArgumentPurpose) -> bool {
    self.params.iter().any(|param| param.purpose == purpose)
}
/// Does this signature have a return whose `ArgumentPurpose` is `purpose`?
pub fn uses_special_return(&self, purpose: ArgumentPurpose) -> bool {
    self.returns.iter().any(|ret| ret.purpose == purpose)
}
/// How many special parameters does this function have?
pub fn num_special_params(&self) -> usize {
    // Count every parameter that is not a plain `Normal` argument.
    self.params.iter().fold(0, |count, param| {
        if param.purpose == ArgumentPurpose::Normal {
            count
        } else {
            count + 1
        }
    })
}
/// How many special returns does this function have?
pub fn num_special_returns(&self) -> usize {
    // Count every return that is not a plain `Normal` return value.
    self.returns.iter().fold(0, |count, ret| {
        if ret.purpose == ArgumentPurpose::Normal {
            count
        } else {
            count + 1
        }
    })
}
/// Does this signature take a struct return pointer parameter?
pub fn uses_struct_return_param(&self) -> bool {
    self.uses_special_param(ArgumentPurpose::StructReturn)
}
/// Does this return more than one normal value? (Pre-struct return
/// legalization)
pub fn is_multi_return(&self) -> bool {
    // "More than one" means a second normal return exists, so advance past
    // the first match and check whether another follows; this short-circuits
    // instead of counting the whole list.
    let mut normal_returns = self
        .returns
        .iter()
        .filter(|ret| ret.purpose == ArgumentPurpose::Normal);
    normal_returns.next().is_some() && normal_returns.next().is_some()
}
}
/// Wrapper type capable of displaying a `Signature` with correct register names.

View File

@@ -34,6 +34,10 @@ pub struct Function {
/// Signature of this function.
pub signature: Signature,
/// The old signature of this function, before the most recent legalization,
/// if any.
pub old_signature: Option<Signature>,
/// Stack slots allocated in this function.
pub stack_slots: StackSlots,
@@ -96,6 +100,7 @@ impl Function {
Self {
name,
signature: sig,
old_signature: None,
stack_slots: StackSlots::new(),
global_values: PrimaryMap::new(),
heaps: PrimaryMap::new(),

View File

@@ -64,6 +64,15 @@ pub enum StackSlotKind {
/// stack slots are only valid while setting up a call.
OutgoingArg,
/// Space allocated in the caller's frame for the callee's return values
/// that are passed out via return pointer.
///
/// If there are more return values than registers available for the callee's calling
/// convention, or the return value is larger than the available registers' space, then we
/// allocate stack space in this frame and pass a pointer to the callee, which then writes its
/// return values into this space.
StructReturnSlot,
/// An emergency spill slot.
///
/// Emergency slots are allocated late when the register's constraint solver needs extra space
@@ -81,6 +90,7 @@ impl FromStr for StackSlotKind {
"spill_slot" => Ok(SpillSlot),
"incoming_arg" => Ok(IncomingArg),
"outgoing_arg" => Ok(OutgoingArg),
"sret_slot" => Ok(StructReturnSlot),
"emergency_slot" => Ok(EmergencySlot),
_ => Err(()),
}
@@ -95,6 +105,7 @@ impl fmt::Display for StackSlotKind {
SpillSlot => "spill_slot",
IncomingArg => "incoming_arg",
OutgoingArg => "outgoing_arg",
StructReturnSlot => "sret_slot",
EmergencySlot => "emergency_slot",
})
}

View File

@@ -6,6 +6,7 @@ use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion};
use crate::ir::{self, AbiParam, ArgumentExtension, ArgumentLoc, Type};
use crate::isa::RegClass;
use crate::regalloc::RegisterSet;
use alloc::borrow::Cow;
use core::i32;
use target_lexicon::Triple;
@@ -78,11 +79,13 @@ impl ArgAssigner for Args {
}
/// Legalize `sig`.
pub fn legalize_signature(sig: &mut ir::Signature, triple: &Triple, _current: bool) {
pub fn legalize_signature(sig: &mut Cow<ir::Signature>, triple: &Triple, _current: bool) {
let bits = triple.pointer_width().unwrap().bits();
let mut args = Args::new(bits);
legalize_args(&mut sig.params, &mut args);
if let Some(new_params) = legalize_args(&sig.params, &mut args) {
sig.to_mut().params = new_params;
}
}
/// Get register class for a type appearing in a legalized signature.

View File

@@ -15,6 +15,7 @@ use crate::isa::enc_tables::{self as shared_enc_tables, lookup_enclist, Encoding
use crate::isa::Builder as IsaBuilder;
use crate::isa::{EncInfo, RegClass, RegInfo, TargetIsa};
use crate::regalloc;
use alloc::borrow::Cow;
use alloc::boxed::Box;
use core::fmt;
use target_lexicon::{Architecture, Triple};
@@ -100,7 +101,7 @@ impl TargetIsa for Isa {
)
}
fn legalize_signature(&self, sig: &mut ir::Signature, current: bool) {
fn legalize_signature(&self, sig: &mut Cow<ir::Signature>, current: bool) {
abi::legalize_signature(sig, &self.triple, current)
}

View File

@@ -5,10 +5,11 @@ use crate::ir;
use crate::isa::RegClass;
use crate::regalloc::RegisterSet;
use crate::settings as shared_settings;
use alloc::borrow::Cow;
/// Legalize `sig`.
pub fn legalize_signature(
_sig: &mut ir::Signature,
_sig: &mut Cow<ir::Signature>,
_flags: &shared_settings::Flags,
_current: bool,
) {

View File

@@ -15,6 +15,7 @@ use crate::isa::enc_tables::{lookup_enclist, Encodings};
use crate::isa::Builder as IsaBuilder;
use crate::isa::{EncInfo, RegClass, RegInfo, TargetIsa};
use crate::regalloc;
use alloc::borrow::Cow;
use alloc::boxed::Box;
use core::fmt;
use target_lexicon::Triple;
@@ -88,7 +89,7 @@ impl TargetIsa for Isa {
)
}
fn legalize_signature(&self, sig: &mut ir::Signature, current: bool) {
fn legalize_signature(&self, sig: &mut Cow<ir::Signature>, current: bool) {
abi::legalize_signature(sig, &self.shared_flags, current)
}

View File

@@ -63,6 +63,7 @@ use crate::result::CodegenResult;
use crate::settings;
use crate::settings::SetResult;
use crate::timing;
use alloc::borrow::Cow;
use alloc::boxed::Box;
use alloc::vec::Vec;
use core::fmt;
@@ -315,7 +316,7 @@ pub trait TargetIsa: fmt::Display + Sync {
/// Arguments and return values for the caller's frame pointer and other callee-saved registers
/// should not be added by this function. These arguments are not added until after register
/// allocation.
fn legalize_signature(&self, sig: &mut ir::Signature, current: bool);
fn legalize_signature(&self, sig: &mut Cow<ir::Signature>, current: bool);
/// Get the register class that should be used to represent an ABI argument or return value of
/// type `ty`. This should be the top-level register class that contains the argument

View File

@@ -11,6 +11,7 @@ use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion};
use crate::ir::{self, AbiParam, ArgumentExtension, ArgumentLoc, ArgumentPurpose, Type};
use crate::isa::RegClass;
use crate::regalloc::RegisterSet;
use alloc::borrow::Cow;
use core::i32;
use target_lexicon::Triple;
@@ -88,7 +89,7 @@ impl ArgAssigner for Args {
/// Legalize `sig` for RISC-V.
pub fn legalize_signature(
sig: &mut ir::Signature,
sig: &mut Cow<ir::Signature>,
triple: &Triple,
isa_flags: &settings::Flags,
current: bool,
@@ -96,10 +97,14 @@ pub fn legalize_signature(
let bits = triple.pointer_width().unwrap().bits();
let mut args = Args::new(bits, isa_flags.enable_e());
legalize_args(&mut sig.params, &mut args);
if let Some(new_params) = legalize_args(&sig.params, &mut args) {
sig.to_mut().params = new_params;
}
let mut rets = Args::new(bits, isa_flags.enable_e());
legalize_args(&mut sig.returns, &mut rets);
if let Some(new_returns) = legalize_args(&sig.returns, &mut rets) {
sig.to_mut().returns = new_returns;
}
if current {
let ptr = Type::int(u16::from(bits)).unwrap();
@@ -110,8 +115,8 @@ pub fn legalize_signature(
// in any register, but a micro-architecture with a return address predictor will only
// recognize it as a return if the address is in `x1`.
let link = AbiParam::special_reg(ptr, ArgumentPurpose::Link, GPR.unit(1));
sig.params.push(link);
sig.returns.push(link);
sig.to_mut().params.push(link);
sig.to_mut().returns.push(link);
}
}

View File

@@ -15,6 +15,7 @@ use crate::isa::enc_tables::{self as shared_enc_tables, lookup_enclist, Encoding
use crate::isa::Builder as IsaBuilder;
use crate::isa::{EncInfo, RegClass, RegInfo, TargetIsa};
use crate::regalloc;
use alloc::borrow::Cow;
use alloc::boxed::Box;
use core::fmt;
use target_lexicon::{PointerWidth, Triple};
@@ -95,7 +96,7 @@ impl TargetIsa for Isa {
)
}
fn legalize_signature(&self, sig: &mut ir::Signature, current: bool) {
fn legalize_signature(&self, sig: &mut Cow<ir::Signature>, current: bool) {
abi::legalize_signature(sig, &self.triple, &self.isa_flags, current)
}

View File

@@ -17,6 +17,7 @@ use crate::isa::{CallConv, RegClass, RegUnit, TargetIsa};
use crate::regalloc::RegisterSet;
use crate::result::CodegenResult;
use crate::stack_layout::layout_stack;
use alloc::borrow::Cow;
use alloc::vec::Vec;
use core::i32;
use target_lexicon::{PointerWidth, Triple};
@@ -166,9 +167,117 @@ impl ArgAssigner for Args {
}
}
/// Get the number of general-purpose and floating-point registers required to
/// hold the given `AbiParam` returns.
///
/// Runs the regular `Args` assigner against a pretend-infinite register file
/// and counts how many GPRs/FPRs the returns would consume, without mutating
/// the real signature. Returns `(gprs_required, fprs_required)`.
fn num_return_registers_required<'a>(
    word_bit_size: u8,
    call_conv: CallConv,
    shared_flags: &shared_settings::Flags,
    isa_flags: &isa_settings::Flags,
    return_params: impl IntoIterator<Item = &'a AbiParam>,
) -> (usize, usize) {
    // Pretend we have "infinite" registers to give out, since we aren't
    // actually assigning `AbiParam`s to registers yet, just seeing how many
    // registers we would need in order to fit all the `AbiParam`s in registers.
    let gprs = &[RU::rax; 128];
    // Use `core` rather than `std` so this stays consistent with the crate's
    // other `core::`/`alloc::` imports and builds under `no_std`.
    let fpr_limit = core::usize::MAX;
    let mut assigner = Args::new(
        word_bit_size,
        gprs,
        fpr_limit,
        call_conv,
        shared_flags,
        isa_flags,
    );
    let mut gprs_required = 0;
    let mut fprs_required = 0;
    for param in return_params {
        match param.location {
            ArgumentLoc::Unassigned => {
                // Let this fall through so that we assign it a location and
                // account for how many registers it ends up requiring below...
            }
            ArgumentLoc::Reg(_) => {
                // This is already assigned to a register. Count it.
                if param.value_type.is_float() {
                    fprs_required += 1;
                } else {
                    gprs_required += 1;
                }
                continue;
            }
            _ => {
                // It is already assigned, but not to a register. Skip it.
                continue;
            }
        }
        // We're going to mutate the type as it gets converted, so make our own
        // copy that isn't visible to the outside world.
        let mut param = param.clone();
        let mut split_factor = 1;
        loop {
            match assigner.assign(&param) {
                ArgAction::Convert(ValueConversion::IntSplit) => {
                    split_factor *= 2;
                    param.value_type = param.value_type.half_width().unwrap();
                }
                ArgAction::Convert(ValueConversion::VectorSplit) => {
                    split_factor *= 2;
                    param.value_type = param.value_type.half_vector().unwrap();
                }
                ArgAction::Assign(ArgumentLoc::Reg(_))
                | ArgAction::Convert(ValueConversion::IntBits)
                | ArgAction::Convert(ValueConversion::Sext(_))
                | ArgAction::Convert(ValueConversion::Uext(_)) => {
                    // Ok! We can fit this (potentially split) value into a
                    // register! Add the number of params we split the parameter
                    // into to our current counts.
                    if param.value_type.is_float() {
                        fprs_required += split_factor;
                    } else {
                        gprs_required += split_factor;
                    }
                    // But we also have to call `assign` once for each split value, to
                    // update `assigner`'s internal state.
                    for _ in 1..split_factor {
                        match assigner.assign(&param) {
                            ArgAction::Assign(_)
                            | ArgAction::Convert(ValueConversion::IntBits)
                            | ArgAction::Convert(ValueConversion::Sext(_))
                            | ArgAction::Convert(ValueConversion::Uext(_)) => {
                                continue;
                            }
                            otherwise => panic!(
                                "unexpected action after first split succeeded: {:?}",
                                otherwise
                            ),
                        }
                    }
                    // Continue to the next param.
                    break;
                }
                ArgAction::Assign(loc) => panic!(
                    "unexpected location assignment, should have had enough registers: {:?}",
                    loc
                ),
            }
        }
    }
    (gprs_required, fprs_required)
}
/// Legalize `sig`.
pub fn legalize_signature(
sig: &mut ir::Signature,
sig: &mut Cow<ir::Signature>,
triple: &Triple,
_current: bool,
shared_flags: &shared_settings::Flags,
@@ -207,9 +316,7 @@ pub fn legalize_signature(
}
}
legalize_args(&mut sig.params, &mut args);
let (regs, fpr_limit) = if sig.call_conv.extends_windows_fastcall() {
let (ret_regs, ret_fpr_limit) = if sig.call_conv.extends_windows_fastcall() {
// windows-x64 calling convention only uses XMM0 or RAX for return values
(&RET_GPRS_WIN_FASTCALL_X64[..], 1)
} else {
@@ -218,13 +325,77 @@ pub fn legalize_signature(
let mut rets = Args::new(
bits,
regs,
fpr_limit,
ret_regs,
ret_fpr_limit,
sig.call_conv,
shared_flags,
isa_flags,
);
legalize_args(&mut sig.returns, &mut rets);
if sig.is_multi_return() && {
// Even if it is multi-return, see if the return values will fit into
// our available return registers.
let (gprs_required, fprs_required) = num_return_registers_required(
bits,
sig.call_conv,
shared_flags,
isa_flags,
&sig.returns,
);
gprs_required > ret_regs.len() || fprs_required > ret_fpr_limit
} {
debug_assert!(!sig.uses_struct_return_param());
// We're using the first register for the return pointer parameter.
let mut ret_ptr_param = AbiParam {
value_type: args.pointer_type,
purpose: ArgumentPurpose::StructReturn,
extension: ArgumentExtension::None,
location: ArgumentLoc::Unassigned,
};
match args.assign(&ret_ptr_param) {
ArgAction::Assign(ArgumentLoc::Reg(reg)) => {
ret_ptr_param.location = ArgumentLoc::Reg(reg);
sig.to_mut().params.push(ret_ptr_param);
}
_ => unreachable!("return pointer should always get a register assignment"),
}
// We're using the first return register for the return pointer (like
// sys v does).
let mut ret_ptr_return = AbiParam {
value_type: args.pointer_type,
purpose: ArgumentPurpose::StructReturn,
extension: ArgumentExtension::None,
location: ArgumentLoc::Unassigned,
};
match rets.assign(&ret_ptr_return) {
ArgAction::Assign(ArgumentLoc::Reg(reg)) => {
ret_ptr_return.location = ArgumentLoc::Reg(reg);
sig.to_mut().returns.push(ret_ptr_return);
}
_ => unreachable!("return pointer should always get a register assignment"),
}
sig.to_mut().returns.retain(|ret| {
// Either this is the return pointer, in which case we want to keep
// it, or else assume that it is assigned for a reason and doesn't
// conflict with our return pointering legalization.
debug_assert_eq!(
ret.location.is_assigned(),
ret.purpose != ArgumentPurpose::Normal
);
ret.location.is_assigned()
});
}
if let Some(new_params) = legalize_args(&sig.params, &mut args) {
sig.to_mut().params = new_params;
}
if let Some(new_returns) = legalize_args(&sig.returns, &mut rets) {
sig.to_mut().returns = new_returns;
}
}
/// Get register class for a type appearing in a legalized signature.

View File

@@ -18,6 +18,7 @@ use crate::isa::{EncInfo, RegClass, RegInfo, TargetIsa};
use crate::regalloc;
use crate::result::CodegenResult;
use crate::timing;
use alloc::borrow::Cow;
use alloc::boxed::Box;
use alloc::vec::Vec;
use core::fmt;
@@ -107,7 +108,7 @@ impl TargetIsa for Isa {
)
}
fn legalize_signature(&self, sig: &mut ir::Signature, current: bool) {
fn legalize_signature(&self, sig: &mut Cow<ir::Signature>, current: bool) {
abi::legalize_signature(
sig,
&self.triple,

View File

@@ -23,11 +23,14 @@ use crate::flowgraph::ControlFlowGraph;
use crate::ir::instructions::CallInfo;
use crate::ir::{
AbiParam, ArgumentLoc, ArgumentPurpose, DataFlowGraph, Ebb, Function, Inst, InstBuilder,
SigRef, Signature, Type, Value, ValueLoc,
MemFlags, SigRef, Signature, StackSlotData, StackSlotKind, Type, Value, ValueLoc,
};
use crate::isa::TargetIsa;
use crate::legalizer::split::{isplit, vsplit};
use alloc::borrow::Cow;
use alloc::vec::Vec;
use core::mem;
use cranelift_entity::EntityList;
use log::debug;
/// Legalize all the function signatures in `func`.
@@ -36,9 +39,16 @@ use log::debug;
/// change the entry block arguments, calls, or return instructions, so this can leave the function
/// in a state with type discrepancies.
pub fn legalize_signatures(func: &mut Function, isa: &dyn TargetIsa) {
legalize_signature(&mut func.signature, true, isa);
for sig_data in func.dfg.signatures.values_mut() {
legalize_signature(sig_data, false, isa);
if let Some(new) = legalize_signature(&func.signature, true, isa) {
let old = mem::replace(&mut func.signature, new);
func.old_signature = Some(old);
}
for (sig_ref, sig_data) in func.dfg.signatures.iter_mut() {
if let Some(new) = legalize_signature(sig_data, false, isa) {
let old = mem::replace(sig_data, new);
func.dfg.old_signatures[sig_ref] = Some(old);
}
}
if let Some(entry) = func.layout.entry_block() {
@@ -50,14 +60,25 @@ pub fn legalize_signatures(func: &mut Function, isa: &dyn TargetIsa) {
/// Legalize the libcall signature, which we may generate on the fly after
/// `legalize_signatures` has been called.
pub fn legalize_libcall_signature(signature: &mut Signature, isa: &dyn TargetIsa) {
legalize_signature(signature, false, isa);
if let Some(s) = legalize_signature(signature, false, isa) {
*signature = s;
}
}
/// Legalize the given signature.
///
/// `current` is true if this is the signature for the current function.
fn legalize_signature(signature: &mut Signature, current: bool, isa: &dyn TargetIsa) {
isa.legalize_signature(signature, current);
fn legalize_signature(
signature: &Signature,
current: bool,
isa: &dyn TargetIsa,
) -> Option<Signature> {
let mut cow = Cow::Borrowed(signature);
isa.legalize_signature(&mut cow, current);
match cow {
Cow::Borrowed(_) => None,
Cow::Owned(s) => Some(s),
}
}
/// Legalize the entry block parameters after `func`'s signature has been legalized.
@@ -245,6 +266,166 @@ where
call
}
/// Debug-only sanity checks for the struct-return ("sret") call legalization:
/// verify that `new_sig` differs from `old_sig` only in the ways the sret
/// transformation is allowed to change a signature.
fn assert_is_valid_sret_legalization(
    old_ret_list: &EntityList<Value>,
    old_sig: &Signature,
    new_sig: &Signature,
    pos: &FuncCursor,
) {
    // The detached result list must still line up 1:1 with the
    // pre-legalization signature's returns.
    debug_assert_eq!(
        old_sig.returns.len(),
        old_ret_list.len(&pos.func.dfg.value_lists)
    );
    // Assert that the only difference in special parameters is that there
    // is an appended struct return pointer parameter.
    let old_special_params: Vec<_> = old_sig
        .params
        .iter()
        .filter(|r| r.purpose != ArgumentPurpose::Normal)
        .collect();
    let new_special_params: Vec<_> = new_sig
        .params
        .iter()
        .filter(|r| r.purpose != ArgumentPurpose::Normal)
        .collect();
    debug_assert_eq!(old_special_params.len() + 1, new_special_params.len());
    debug_assert!(old_special_params
        .iter()
        .zip(&new_special_params)
        .all(|(old, new)| old.purpose == new.purpose));
    debug_assert_eq!(
        new_special_params.last().unwrap().purpose,
        ArgumentPurpose::StructReturn
    );
    // If the special returns have changed at all, then the only change
    // should be that the struct return pointer is returned back out of the
    // function, so that callers don't have to load its stack address again.
    let old_special_returns: Vec<_> = old_sig
        .returns
        .iter()
        .filter(|r| r.purpose != ArgumentPurpose::Normal)
        .collect();
    let new_special_returns: Vec<_> = new_sig
        .returns
        .iter()
        .filter(|r| r.purpose != ArgumentPurpose::Normal)
        .collect();
    debug_assert!(old_special_returns
        .iter()
        .zip(&new_special_returns)
        .all(|(old, new)| old.purpose == new.purpose));
    debug_assert!(
        old_special_returns.len() == new_special_returns.len()
            || (old_special_returns.len() + 1 == new_special_returns.len()
                && new_special_returns.last().unwrap().purpose == ArgumentPurpose::StructReturn)
    );
}
/// Legalize a call to a function whose signature was rewritten to return its
/// values through a struct-return ("sret") pointer.
///
/// Allocates an sret stack slot in the caller's frame, appends its address as
/// the call's struct-return argument, and replaces the call's original normal
/// results with loads from that slot after the call.
fn legalize_sret_call(isa: &dyn TargetIsa, pos: &mut FuncCursor, sig_ref: SigRef, call: Inst) {
    let old_ret_list = pos.func.dfg.detach_results(call);
    let old_sig = pos.func.dfg.old_signatures[sig_ref]
        .take()
        .expect("must have an old signature when using an `sret` parameter");
    // We make a bunch of assumptions about the shape of the old, multi-return
    // signature and the new, sret-using signature in this legalization
    // function. Assert that these assumptions hold true in debug mode.
    if cfg!(debug_assertions) {
        assert_is_valid_sret_legalization(
            &old_ret_list,
            &old_sig,
            &pos.func.dfg.signatures[sig_ref],
            &pos,
        );
    }
    // Go through and remove all normal return values from the `call`
    // instruction's returns list. These will be stored into the stack slot that
    // the sret points to. At the same time, calculate the size of the sret
    // stack slot.
    let mut sret_slot_size = 0;
    for (i, ret) in old_sig.returns.iter().enumerate() {
        let v = old_ret_list.get(i, &pos.func.dfg.value_lists).unwrap();
        let ty = pos.func.dfg.value_type(v);
        if ret.purpose == ArgumentPurpose::Normal {
            debug_assert_eq!(ret.location, ArgumentLoc::Unassigned);
            let ty = legalized_type_for_sret(ty);
            let size = ty.bytes();
            sret_slot_size = round_up_to_multiple_of_type_align(sret_slot_size, ty) + size;
        } else {
            // Special-purpose returns stay as real results on the call;
            // re-attach each under a fresh value and alias the old one to it.
            let new_v = pos.func.dfg.append_result(call, ty);
            pos.func.dfg.change_to_alias(v, new_v);
        }
    }
    let stack_slot = pos.func.stack_slots.push(StackSlotData {
        kind: StackSlotKind::StructReturnSlot,
        size: sret_slot_size,
        // No pre-assigned offset: the frame layout algorithm places the slot.
        offset: None,
    });
    // Append the sret pointer to the `call` instruction's arguments.
    let ptr_type = Type::triple_pointer_type(isa.triple());
    let sret_arg = pos.ins().stack_addr(ptr_type, stack_slot, 0);
    pos.func.dfg.append_inst_arg(call, sret_arg);
    // The sret pointer might be returned by the signature as well. If so, we
    // need to add it to the `call` instruction's results list.
    //
    // Additionally, when the sret is explicitly returned in this calling
    // convention, then use it when loading the sret returns back into ssa
    // values to avoid keeping the original `sret_arg` live and potentially
    // having to do spills and fills.
    let sret =
        if pos.func.dfg.signatures[sig_ref].uses_special_return(ArgumentPurpose::StructReturn) {
            pos.func.dfg.append_result(call, ptr_type)
        } else {
            sret_arg
        };
    // Finally, load each of the call's return values out of the sret stack
    // slot.
    pos.goto_after_inst(call);
    let mut offset = 0;
    for i in 0..old_ret_list.len(&pos.func.dfg.value_lists) {
        if old_sig.returns[i].purpose != ArgumentPurpose::Normal {
            continue;
        }
        let old_v = old_ret_list.get(i, &pos.func.dfg.value_lists).unwrap();
        let ty = pos.func.dfg.value_type(old_v);
        let mut legalized_ty = legalized_type_for_sret(ty);
        offset = round_up_to_multiple_of_type_align(offset, legalized_ty);
        let new_legalized_v =
            pos.ins()
                .load(legalized_ty, MemFlags::trusted(), sret, offset as i32);
        // "Illegalize" the loaded value from the legalized type back to its
        // original `ty`. This is basically the opposite of
        // `legalize_type_for_sret_store`.
        let mut new_v = new_legalized_v;
        if ty.is_bool() {
            legalized_ty = legalized_ty.as_bool_pedantic();
            new_v = pos.ins().raw_bitcast(legalized_ty, new_v);
            if ty.bits() < legalized_ty.bits() {
                legalized_ty = ty;
                new_v = pos.ins().breduce(legalized_ty, new_v);
            }
        }
        pos.func.dfg.change_to_alias(old_v, new_v);
        offset += legalized_ty.bytes();
    }
    // Put back the old signature that was `take`n above (taken so we could
    // read it while mutating `pos.func`).
    pos.func.dfg.old_signatures[sig_ref] = Some(old_sig);
}
/// Compute original value of type `ty` from the legalized ABI arguments.
///
/// The conversion is recursive, controlled by the `get_arg` closure which is called to retrieve an
@@ -452,6 +633,13 @@ fn legalize_inst_arguments<ArgType>(
.constraints()
.num_fixed_value_arguments();
let have_args = vlist.len(&pos.func.dfg.value_lists) - num_fixed_values;
if abi_args < have_args {
// This happens with multiple return values after we've legalized the
// signature but haven't legalized the return instruction yet. This
// legalization is handled in `handle_return_abi`.
pos.func.dfg[inst].put_value_list(vlist);
return;
}
// Grow the value list to the right size and shift all the existing arguments to the right.
// This lets us write the new argument values into the list without overwriting the old
@@ -508,6 +696,32 @@ fn legalize_inst_arguments<ArgType>(
pos.func.dfg[inst].put_value_list(vlist);
}
/// Ensure that the `ty` being returned is a type that can be loaded and stored
/// (potentially after another narrowing legalization) from memory, since it
/// will go into the `sret` space.
fn legalized_type_for_sret(ty: Type) -> Type {
    if ty.is_bool() {
        // Booleans can be narrower than a byte; widen them to at least an
        // 8-bit integer so the value is addressable in memory. Use `core`
        // (not `std`) to stay consistent with this crate's `core::`/`alloc::`
        // imports and keep `no_std` builds working.
        let bits = core::cmp::max(8, ty.bits());
        Type::int(bits).unwrap()
    } else {
        ty
    }
}
/// Insert any legalization code required to ensure that `val` can be stored
/// into the `sret` memory. Returns the (potentially new, potentially
/// unmodified) legalized value and its type.
fn legalize_type_for_sret_store(pos: &mut FuncCursor, val: Value, ty: Type) -> (Value, Type) {
    if ty.is_bool() {
        // Mirror `legalized_type_for_sret`: widen booleans to at least an
        // 8-bit integer before storing them to memory. `core::cmp::max` keeps
        // this consistent with the crate's other `core::` usage (`no_std`).
        let bits = core::cmp::max(8, ty.bits());
        let ty = Type::int(bits).unwrap();
        let val = pos.ins().bint(ty, val);
        (val, ty)
    } else {
        (val, ty)
    }
}
/// Insert ABI conversion code before and after the call instruction at `pos`.
///
/// Instructions inserted before the call will compute the appropriate ABI values for the
@@ -518,7 +732,12 @@ fn legalize_inst_arguments<ArgType>(
/// original return values. The call's result values will be adapted to match the new signature.
///
/// Returns `true` if any instructions were inserted.
pub fn handle_call_abi(mut inst: Inst, func: &mut Function, cfg: &ControlFlowGraph) -> bool {
pub fn handle_call_abi(
isa: &dyn TargetIsa,
mut inst: Inst,
func: &mut Function,
cfg: &ControlFlowGraph,
) -> bool {
let pos = &mut FuncCursor::new(func).at_inst(inst);
pos.use_srcloc(inst);
@@ -528,16 +747,27 @@ pub fn handle_call_abi(mut inst: Inst, func: &mut Function, cfg: &ControlFlowGra
Err(s) => s,
};
// OK, we need to fix the call arguments to match the ABI signature.
let abi_args = pos.func.dfg.signatures[sig_ref].params.len();
legalize_inst_arguments(pos, cfg, abi_args, |func, abi_arg| {
func.dfg.signatures[sig_ref].params[abi_arg]
});
let sig = &pos.func.dfg.signatures[sig_ref];
let old_sig = &pos.func.dfg.old_signatures[sig_ref];
if !pos.func.dfg.signatures[sig_ref].returns.is_empty() {
inst = legalize_inst_results(pos, |func, abi_res| {
func.dfg.signatures[sig_ref].returns[abi_res]
if sig.uses_struct_return_param()
&& old_sig
.as_ref()
.map_or(false, |s| !s.uses_struct_return_param())
{
legalize_sret_call(isa, pos, sig_ref, inst);
} else {
// OK, we need to fix the call arguments to match the ABI signature.
let abi_args = pos.func.dfg.signatures[sig_ref].params.len();
legalize_inst_arguments(pos, cfg, abi_args, |func, abi_arg| {
func.dfg.signatures[sig_ref].params[abi_arg]
});
if !pos.func.dfg.signatures[sig_ref].returns.is_empty() {
inst = legalize_inst_results(pos, |func, abi_res| {
func.dfg.signatures[sig_ref].returns[abi_res]
});
}
}
debug_assert!(
@@ -586,8 +816,6 @@ pub fn handle_return_abi(inst: Inst, func: &mut Function, cfg: &ControlFlowGraph
legalize_inst_arguments(pos, cfg, abi_args, |func, abi_arg| {
func.signature.returns[abi_arg]
});
debug_assert_eq!(pos.func.dfg.inst_variable_args(inst).len(), abi_args);
// Append special return arguments for any `sret`, `link`, and `vmctx` return values added to
// the legalized signature. These values should simply be propagated from the entry block
// arguments.
@@ -598,6 +826,8 @@ pub fn handle_return_abi(inst: Inst, func: &mut Function, cfg: &ControlFlowGraph
pos.func.dfg.display_inst(inst, None)
);
let mut vlist = pos.func.dfg[inst].take_value_list().unwrap();
let mut sret = None;
for arg in &pos.func.signature.returns[abi_args..] {
match arg.purpose {
ArgumentPurpose::Link
@@ -624,10 +854,45 @@ pub fn handle_return_abi(inst: Inst, func: &mut Function, cfg: &ControlFlowGraph
.ebb_params(pos.func.layout.entry_block().unwrap())[idx];
debug_assert_eq!(pos.func.dfg.value_type(val), arg.value_type);
vlist.push(val, &mut pos.func.dfg.value_lists);
if let ArgumentPurpose::StructReturn = arg.purpose {
sret = Some(val);
}
}
// Store all the regular returns into the retptr space and remove them
// from the `return` instruction's value list.
if let Some(sret) = sret {
let mut offset = 0;
let num_regular_rets = vlist.len(&pos.func.dfg.value_lists) - special_args;
for i in 0..num_regular_rets {
debug_assert_eq!(
pos.func.old_signature.as_ref().unwrap().returns[i].purpose,
ArgumentPurpose::Normal,
);
// The next return value to process is always at `0`, since the
// list is emptied as we iterate.
let v = vlist.get(0, &pos.func.dfg.value_lists).unwrap();
let ty = pos.func.dfg.value_type(v);
let (v, ty) = legalize_type_for_sret_store(pos, v, ty);
let size = ty.bytes();
offset = round_up_to_multiple_of_type_align(offset, ty);
pos.ins().store(MemFlags::trusted(), v, sret, offset as i32);
vlist.remove(0, &mut pos.func.dfg.value_lists);
offset += size;
}
}
pos.func.dfg[inst].put_value_list(vlist);
}
debug_assert_eq!(
pos.func.dfg.inst_variable_args(inst).len(),
abi_args + special_args
);
debug_assert!(
check_return_signature(&pos.func.dfg, inst, &pos.func.signature),
"Signature still wrong: {} / signature {}",
@@ -639,6 +904,56 @@ pub fn handle_return_abi(inst: Inst, func: &mut Function, cfg: &ControlFlowGraph
true
}
/// Round `bytes` up to the alignment required for a value of type `ty`.
///
/// We don't have a dedicated alignment for types, so assume they are
/// size-aligned; `ty.bytes()` is expected to be a power of two (this is
/// debug-asserted inside `round_up_to_multiple_of_pow2`).
fn round_up_to_multiple_of_type_align(bytes: u32, ty: Type) -> u32 {
    round_up_to_multiple_of_pow2(bytes, ty.bytes())
}
/// Round `n` up to the smallest multiple of `to` that is greater than or
/// equal to `n`.
///
/// `to` must be a power of two and greater than zero.
///
/// This is useful for rounding an offset or pointer up to some type's required
/// alignment.
fn round_up_to_multiple_of_pow2(n: u32, to: u32) -> u32 {
    debug_assert!(to > 0);
    debug_assert!(to.is_power_of_two());

    // The straightforward way to round up is
    //
    //     (n + to - 1) / to * to
    //
    // Adding `to - 1` before the (flooring) integer division bumps the
    // quotient up by one exactly when `n` has a nonzero remainder modulo
    // `to`; when `n` is already a multiple of `to`, the division discards the
    // extra and we get `n` back unchanged. In other words, we only round up
    // when `n` is not already aligned to `to`.
    //
    // Since `to` is a power of two, dividing by it is a right shift by
    // `log2(to)` and multiplying by it is a left shift by `log2(to)`:
    //
    //     (n + to - 1) >> log2(to) << log2(to)
    //
    // Shifting right by `b` bits and then left by the same `b` bits simply
    // clears the bottom `b` bits, and clearing the bottom `b` bits of a
    // number is a bit-wise and with the complement of `2^b - 1`. Substituting
    // and simplifying:
    //
    //     (n + to - 1) >> log2(to) << log2(to)
    //   = (n + to - 1) & !(2^log2(to) - 1)
    //   = (n + to - 1) & !(to - 1)
    //
    // which is the final form used below.
    let mask = to - 1;
    (n + mask) & !mask
}
/// Assign stack slots to incoming function parameters on the stack.
///
/// Values that are passed into the function on the stack must be assigned to an `IncomingArg`
@@ -714,3 +1029,34 @@ fn spill_call_arguments(pos: &mut FuncCursor) -> bool {
// We changed stuff.
true
}
#[cfg(test)]
mod tests {
    use super::round_up_to_multiple_of_pow2;

    #[test]
    fn round_up_to_multiple_of_pow2_works() {
        // Each case is (input, power-of-two alignment, expected result).
        let cases = [
            (0, 1, 0),
            (1, 1, 1),
            (2, 1, 2),
            (0, 2, 0),
            (1, 2, 2),
            (2, 2, 2),
            (3, 2, 4),
            (0, 4, 0),
            (1, 4, 4),
            (2, 4, 4),
            (3, 4, 4),
            (4, 4, 4),
            (5, 4, 8),
        ];
        for &(n, to, want) in &cases {
            let got = round_up_to_multiple_of_pow2(n, to);
            assert_eq!(
                got, want,
                "round_up_to_multiple_of_pow2(n = {}, to = {}) = {} (expected {})",
                n, to, got, want
            );
        }
    }
}

View File

@@ -55,7 +55,7 @@ fn legalize_inst(
// Check for ABI boundaries that need to be converted to the legalized signature.
if opcode.is_call() {
if boundary::handle_call_abi(inst, pos.func, cfg) {
if boundary::handle_call_abi(isa, inst, pos.func, cfg) {
return LegalizeInstResult::Legalized;
}
} else if opcode.is_return() {

View File

@@ -25,7 +25,7 @@ pub fn layout_stack(frame: &mut StackSlots, alignment: StackSize) -> CodegenResu
// stack layout from high to low addresses will be:
//
// 1. incoming arguments.
// 2. spills + explicits.
// 2. spills + explicits + struct returns.
// 3. outgoing arguments.
//
// The incoming arguments can have both positive and negative offsets. A negative offset
@@ -56,7 +56,8 @@ pub fn layout_stack(frame: &mut StackSlots, alignment: StackSize) -> CodegenResu
.ok_or(CodegenError::ImplLimitExceeded)?;
outgoing_max = max(outgoing_max, offset);
}
StackSlotKind::SpillSlot
StackSlotKind::StructReturnSlot
| StackSlotKind::SpillSlot
| StackSlotKind::ExplicitSlot
| StackSlotKind::EmergencySlot => {
// Determine the smallest alignment of any explicit or spill slot.
@@ -65,9 +66,9 @@ pub fn layout_stack(frame: &mut StackSlots, alignment: StackSize) -> CodegenResu
}
}
// Lay out spill slots and explicit slots below the incoming arguments.
// The offset is negative, growing downwards.
// Start with the smallest alignments for better packing.
// Lay out spill slots, struct return slots, and explicit slots below the
// incoming arguments. The offset is negative, growing downwards. Start with
// the smallest alignments for better packing.
let mut offset = incoming_min;
debug_assert!(min_align.is_power_of_two());
while min_align <= alignment {
@@ -75,6 +76,7 @@ pub fn layout_stack(frame: &mut StackSlots, alignment: StackSize) -> CodegenResu
// Pick out explicit and spill slots with exact alignment `min_align`.
match slot.kind {
StackSlotKind::SpillSlot
| StackSlotKind::StructReturnSlot
| StackSlotKind::ExplicitSlot
| StackSlotKind::EmergencySlot => {
if slot.alignment(alignment) != min_align {

View File

@@ -0,0 +1,68 @@
test compile
target x86_64 haswell
;; `b1` return values need to be legalized into bytes so that they can be stored
;; in memory.
function %return_4_b1s(b1, b1, b1, b1) -> b1, b1, b1, b1 {
;; check: function %return_4_b1s(b1 [%rsi], b1 [%rdx], b1 [%rcx], b1 [%r8], i64 sret [%rdi], i64 fp [%rbp]) -> i64 sret [%rax], i64 fp [%rbp] fast {
ebb0(v0: b1, v1: b1, v2: b1, v3: b1):
; check: ebb0(v0: b1 [%rsi], v1: b1 [%rdx], v2: b1 [%rcx], v3: b1 [%r8], v4: i64 [%rdi], v13: i64 [%rbp]):
return v0, v1, v2, v3
; check: v5 = bint.i8 v0
; nextln: v9 = uextend.i32 v5
; nextln: istore8 notrap aligned v9, v4
; nextln: v6 = bint.i8 v1
; nextln: v10 = uextend.i32 v6
; nextln: istore8 notrap aligned v10, v4+1
; nextln: v7 = bint.i8 v2
; nextln: v11 = uextend.i32 v7
; nextln: istore8 notrap aligned v11, v4+2
; nextln: v8 = bint.i8 v3
; nextln: v12 = uextend.i32 v8
; nextln: istore8 notrap aligned v12, v4+3
}
function %call_4_b1s() {
; check: function %call_4_b1s(i64 fp [%rbp], i64 csr [%rbx]) -> i64 fp [%rbp], i64 csr [%rbx] fast {
; nextln: ss0 = sret_slot 4, offset -28
fn0 = colocated %return_4_b1s(b1, b1, b1, b1) -> b1, b1, b1, b1
; check: sig0 = (b1 [%rsi], b1 [%rdx], b1 [%rcx], b1 [%r8], i64 sret [%rdi]) -> i64 sret [%rax] fast
ebb0:
; check: ebb0(v26: i64 [%rbp], v27: i64 [%rbx]):
v0 = bconst.b1 true
v1 = bconst.b1 false
v2 = bconst.b1 true
v3 = bconst.b1 false
; check: v8 = stack_addr.i64 ss0
v4, v5, v6, v7 = call fn0(v0, v1, v2, v3)
; check: v9 = call fn0(v0, v1, v2, v3, v8)
; nextln: v22 = uload8.i32 notrap aligned v9
; nextln: v10 = ireduce.i8 v22
; nextln: v11 = raw_bitcast.b8 v10
; nextln: v12 = breduce.b1 v11
; nextln: v4 -> v12
; nextln: v23 = uload8.i32 notrap aligned v9+1
; nextln: v13 = ireduce.i8 v23
; nextln: v14 = raw_bitcast.b8 v13
; nextln: v15 = breduce.b1 v14
; nextln: v5 -> v15
; nextln: v24 = uload8.i32 notrap aligned v9+2
; nextln: v16 = ireduce.i8 v24
; nextln: v17 = raw_bitcast.b8 v16
; nextln: v18 = breduce.b1 v17
; nextln: v6 -> v18
; nextln: v25 = uload8.i32 notrap aligned v9+3
; nextln: v19 = ireduce.i8 v25
; nextln: v20 = raw_bitcast.b8 v19
; nextln: v21 = breduce.b1 v20
; nextln: v7 -> v21
return
}

View File

@@ -0,0 +1,26 @@
test legalizer
target x86_64 haswell
;; Indirect calls with many returns.
function %call_indirect_many_rets(i64) {
; check: ss0 = sret_slot 32
sig0 = () -> i64, i64, i64, i64
; check: sig0 = (i64 sret [%rdi]) -> i64 sret [%rax] fast
ebb0(v0: i64):
v1, v2, v3, v4 = call_indirect sig0, v0()
; check: v5 = stack_addr.i64 ss0
; nextln: v6 = call_indirect sig0, v0(v5)
; nextln: v7 = load.i64 notrap aligned v6
; nextln: v1 -> v7
; nextln: v8 = load.i64 notrap aligned v6+8
; nextln: v2 -> v8
; nextln: v9 = load.i64 notrap aligned v6+16
; nextln: v3 -> v9
; nextln: v10 = load.i64 notrap aligned v6+24
; nextln: v4 -> v10
return
}

View File

@@ -0,0 +1,44 @@
test compile
target x86_64 haswell
;; Returning many f32s
function %return_2_f32s() -> f32, f32 {
ebb0:
v0 = f32const 0x0.0
v1 = f32const 0x1.0
return v0, v1
}
function %return_3_f32s() -> f32, f32, f32 {
ebb0:
v0 = f32const 0x0.0
v1 = f32const 0x1.0
v2 = f32const 0x2.0
return v0, v1, v2
}
function %return_4_f32s() -> f32, f32, f32, f32 {
ebb0:
v0 = f32const 0x0.0
v1 = f32const 0x1.0
v2 = f32const 0x2.0
v3 = f32const 0x3.0
return v0, v1, v2, v3
}
;; Calling functions that return many f32s
function %call() -> f32 {
fn0 = %a() -> f32, f32
fn1 = %b(f32, f32) -> f32, f32, f32
fn2 = %c(f32, f32, f32) -> f32, f32, f32, f32
ebb0:
v0, v1 = call fn0()
v2, v3, v4 = call fn1(v0, v1)
v5, v6, v7, v8 = call fn2(v2, v3, v4)
v9 = fadd v5, v6
v10 = fadd v7, v8
v11 = fadd v9, v10
return v11
}

View File

@@ -0,0 +1,44 @@
test compile
target x86_64 haswell
;; Returning many f64s
function %return_2_f64s() -> f64, f64 {
ebb0:
v0 = f64const 0x0.0
v1 = f64const 0x1.0
return v0, v1
}
function %return_3_f64s() -> f64, f64, f64 {
ebb0:
v0 = f64const 0x0.0
v1 = f64const 0x1.0
v2 = f64const 0x2.0
return v0, v1, v2
}
function %return_4_f64s() -> f64, f64, f64, f64 {
ebb0:
v0 = f64const 0x0.0
v1 = f64const 0x1.0
v2 = f64const 0x2.0
v3 = f64const 0x3.0
return v0, v1, v2, v3
}
;; Calling functions that return many f64s
function %call() -> f64 {
fn0 = %a() -> f64, f64
fn1 = %b(f64, f64) -> f64, f64, f64
fn2 = %c(f64, f64, f64) -> f64, f64, f64, f64
ebb0:
v0, v1 = call fn0()
v2, v3, v4 = call fn1(v0, v1)
v5, v6, v7, v8 = call fn2(v2, v3, v4)
v9 = fadd v5, v6
v10 = fadd v7, v8
v11 = fadd v9, v10
return v11
}

View File

@@ -0,0 +1,44 @@
test compile
target x86_64 haswell
;; Returning many i32s
function %return_2_i32s() -> i32, i32 {
ebb0:
v0 = iconst.i32 0
v1 = iconst.i32 1
return v0, v1
}
function %return_3_i32s() -> i32, i32, i32 {
ebb0:
v0 = iconst.i32 0
v1 = iconst.i32 1
v2 = iconst.i32 2
return v0, v1, v2
}
function %return_4_i32s() -> i32, i32, i32, i32 {
ebb0:
v0 = iconst.i32 0
v1 = iconst.i32 1
v2 = iconst.i32 2
v3 = iconst.i32 3
return v0, v1, v2, v3
}
;; Calling functions that return many i32s
function %call() -> i32 {
fn0 = %a() -> i32, i32
fn1 = %b(i32, i32) -> i32, i32, i32
fn2 = %c(i32, i32, i32) -> i32, i32, i32, i32
ebb0:
v0, v1 = call fn0()
v2, v3, v4 = call fn1(v0, v1)
v5, v6, v7, v8 = call fn2(v2, v3, v4)
v9 = iadd v5, v6
v10 = iadd v7, v8
v11 = iadd v9, v10
return v11
}

View File

@@ -0,0 +1,44 @@
test compile
target x86_64 haswell
;; Returning many i64s
function %return_2_i64s() -> i64, i64 {
ebb0:
v0 = iconst.i64 0
v1 = iconst.i64 1
return v0, v1
}
function %return_3_i64s() -> i64, i64, i64 {
ebb0:
v0 = iconst.i64 0
v1 = iconst.i64 1
v2 = iconst.i64 2
return v0, v1, v2
}
function %return_4_i64s() -> i64, i64, i64, i64 {
ebb0:
v0 = iconst.i64 0
v1 = iconst.i64 1
v2 = iconst.i64 2
v3 = iconst.i64 3
return v0, v1, v2, v3
}
;; Calling functions that return many i64s
function %call() -> i64 {
fn0 = %a() -> i64, i64
fn1 = %b(i64, i64) -> i64, i64, i64
fn2 = %c(i64, i64, i64) -> i64, i64, i64, i64
ebb0:
v0, v1 = call fn0()
v2, v3, v4 = call fn1(v0, v1)
v5, v6, v7, v8 = call fn2(v2, v3, v4)
v9 = iadd v5, v6
v10 = iadd v7, v8
v11 = iadd v9, v10
return v11
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,61 @@
test legalizer
target x86_64 haswell
;; Test that we don't reuse `sret` stack slots for multiple calls. We could do
;; this one day, but it would require some care to ensure that subsequent calls
;; don't overwrite the results of previous calls.
function %foo() -> i32, f32 {
; check: ss0 = sret_slot 20
; nextln: ss1 = sret_slot 20
fn0 = %f() -> i32, i32, i32, i32, i32
fn1 = %g() -> f32, f32, f32, f32, f32
; check: sig0 = (i64 sret [%rdi]) -> i64 sret [%rax] fast
; nextln: sig1 = (i64 sret [%rdi]) -> i64 sret [%rax] fast
; nextln: fn0 = %f sig0
; nextln: fn1 = %g sig1
ebb0:
v0, v1, v2, v3, v4 = call fn0()
; check: v18 = stack_addr.i64 ss0
; nextln: v25 = func_addr.i64 fn0
; nextln: v19 = call_indirect sig0, v25(v18)
; nextln: v20 = load.i32 notrap aligned v19
; nextln: v0 -> v20
; nextln: v21 = load.i32 notrap aligned v19+4
; nextln: v1 -> v21
; nextln: v22 = load.i32 notrap aligned v19+8
; nextln: v2 -> v22
; nextln: v23 = load.i32 notrap aligned v19+12
; nextln: v3 -> v23
; nextln: v24 = load.i32 notrap aligned v19+16
; nextln: v4 -> v24
v5, v6, v7, v8, v9 = call fn1()
; check: v26 = stack_addr.i64 ss1
; nextln: v33 = func_addr.i64 fn1
; nextln: v27 = call_indirect sig1, v33(v26)
; nextln: v28 = load.f32 notrap aligned v27
; nextln: v5 -> v28
; nextln: v29 = load.f32 notrap aligned v27+4
; nextln: v6 -> v29
; nextln: v30 = load.f32 notrap aligned v27+8
; nextln: v7 -> v30
; nextln: v31 = load.f32 notrap aligned v27+12
; nextln: v8 -> v31
; nextln: v32 = load.f32 notrap aligned v27+16
; nextln: v9 -> v32
v10 = iadd v0, v1
v11 = iadd v2, v3
v12 = iadd v10, v11
v13 = iadd v12, v4
v14 = fadd v5, v6
v15 = fadd v7, v8
v16 = fadd v14, v15
v17 = fadd v16, v9
return v13, v17
}

View File

@@ -0,0 +1,51 @@
test legalizer
target x86_64 haswell
;; Need to insert padding after the `i8`s so that the `i32` and `i64` are
;; aligned.
function %returner() -> i8, i32, i8, i64 {
; check: function %returner(i64 sret [%rdi]) -> i64 sret [%rax] fast {
ebb0:
; check: ebb0(v4: i64):
v0 = iconst.i8 0
v1 = iconst.i32 1
v2 = iconst.i8 2
v3 = iconst.i64 3
return v0, v1, v2, v3
; check: v6 = uextend.i32 v0
; nextln: istore8 notrap aligned v6, v4
; nextln: store notrap aligned v1, v4+4
; nextln: v7 = uextend.i32 v2
; nextln: istore8 notrap aligned v7, v4+8
; nextln: store notrap aligned v3, v4+16
; nextln: return v4
}
function %caller() {
; check: ss0 = sret_slot 24
fn0 = %returner() -> i8, i32, i8, i64
; check: sig0 = (i64 sret [%rdi]) -> i64 sret [%rax] fast
; nextln: fn0 = %returner sig0
ebb0:
v0, v1, v2, v3 = call fn0()
; check: v4 = stack_addr.i64 ss0
; nextln: v10 = func_addr.i64 fn0
; nextln: v5 = call_indirect sig0, v10(v4)
; nextln: v11 = uload8.i32 notrap aligned v5
; nextln: v6 = ireduce.i8 v11
; nextln: v0 -> v6
; nextln: v7 = load.i32 notrap aligned v5+4
; nextln: v1 -> v7
; nextln: v12 = uload8.i32 notrap aligned v5+8
; nextln: v8 = ireduce.i8 v12
; nextln: v2 -> v8
; nextln: v9 = load.i64 notrap aligned v5+16
; nextln: v3 -> v9
return
}

View File

@@ -0,0 +1,18 @@
test compile
target x86_64 haswell
function %returner(i32, i64, f32, f64) -> i32, i64, f32, f64 {
ebb0(v0: i32, v1: i64, v2: f32, v3: f64):
return v0, v1, v2, v3
}
function %caller() {
fn0 = %returner(i32, i64, f32, f64) -> i32, i64, f32, f64
ebb0:
v0 = iconst.i32 0
v1 = iconst.i64 1
v2 = f32const 0x2.0
v3 = f64const 0x3.0
v4, v5, v6, v7 = call fn0(v0, v1, v2, v3)
return
}

View File

@@ -0,0 +1,34 @@
test compile
target x86_64 haswell
function %return_20_i32s() -> i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 {
ebb0:
v0 = iconst.i32 0
v1 = iconst.i32 1
v2 = iconst.i32 2
v3 = iconst.i32 3
v4 = iconst.i32 4
v5 = iconst.i32 5
v6 = iconst.i32 6
v7 = iconst.i32 7
v8 = iconst.i32 8
v9 = iconst.i32 9
v10 = iconst.i32 10
v11 = iconst.i32 11
v12 = iconst.i32 12
v13 = iconst.i32 13
v14 = iconst.i32 14
v15 = iconst.i32 15
v16 = iconst.i32 16
v17 = iconst.i32 17
v18 = iconst.i32 18
v19 = iconst.i32 19
return v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19
}
function %call_20_i32s() {
fn0 = %return_20_i32s() -> i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32
ebb0:
v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19 = call fn0()
return
}