Implement the relaxed SIMD proposal (#5892)

* Initial support for the Relaxed SIMD proposal

This commit adds initial scaffolding and support for the Relaxed SIMD
proposal for WebAssembly. Codegen support is implemented in the x64 and
AArch64 backends at this time.

The purpose of this commit is to get all the boilerplate out of the way
in terms of plumbing through a new feature, adding tests, etc. The tests
are copied from the upstream repository for now, since the
WebAssembly/testsuite repository hasn't been updated yet.

A summary of the changes made in this commit:

* Lowerings for all relaxed simd opcodes have been added, currently all
  exhibiting deterministic behavior. This means that few of the lowerings
  are optimal on the x86 backend, while on the AArch64 backend, for
  example, all lowerings should be optimal.

* Support is added to codegen to, eventually, conditionally generate
  different code based on input codegen flags. This is intended to
  enable codegen of more efficient instructions on x86 by default, for
  example, while still allowing embedders to force
  architecture-independent semantics and behavior. One good example of
  this is the `f32x4.relaxed_madd` instruction, which when deterministic
  forces the fused `fma` instruction; otherwise, if the backend doesn't
  have support for `fma`, separate multiply and add operations are
  performed instead (see the first sketch after this list).

* Lowerings of `iadd_pairwise` for `i16x8` and `i32x4` were added to the
  x86 backend as they're now exercised by the deterministic lowerings of
  relaxed simd instructions.

* Sample codegen tests were added for x86 and AArch64 for some relaxed
  simd instructions.

* Wasmtime embedder support for the relaxed-simd proposal and for
  forcing determinism has been added to `Config` and the CLI (see the
  second sketch after this list).

* Support has been added to the `*.wast` runtime execution for the
  `(either ...)` matcher used in the relaxed-simd proposal.

* Tests for relaxed-simd are run with both a default `Engine` and a
  "force deterministic" `Engine` to test both configurations.

* All tests from the upstream repository were copied into Wasmtime.
  These tests should be deleted when WebAssembly/testsuite is updated.
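
To illustrate the fused-vs-unfused point from the `f32x4.relaxed_madd`
bullet above, here is a minimal host-Rust sketch (editorial, not part of
this commit's diff). Rust's `mul_add` is a correctly-rounded fused
multiply-add, analogous to CLIF's `fma`:

    fn main() {
        // The exact product `a * b` is `1 + 2^-26`, which is not
        // representable as an `f32` and rounds to `1.0`.
        let (a, b, c) = (0.1f32, 10.0f32, -1.0f32);

        // Unfused: round the product to `f32`, then add.
        let unfused = a * b + c; // (1.0) + (-1.0) == 0.0

        // Fused: a single rounding at the end, which is what the
        // deterministic lowering of `f32x4.relaxed_madd` guarantees.
        let fused = a.mul_add(b, c); // (1 + 2^-26) - 1 == 2^-26

        assert_ne!(unfused, fused);
        println!("unfused = {unfused:e}, fused = {fused:e}");
    }

And a sketch of how an embedder might drive the new knobs; the two
`Config` method names below match the additions in this commit's diff,
while the surrounding code is illustrative:

    use anyhow::Result;
    use wasmtime::{Config, Engine};

    fn build_engines() -> Result<(Engine, Engine)> {
        // Enable the relaxed-simd proposal; by default each architecture
        // gets its fastest lowering, so results may differ across hosts.
        let mut config = Config::new();
        config.wasm_relaxed_simd(true);
        let native = Engine::new(&config)?;

        // Force the deterministic, host-independent semantics instead,
        // trading some performance for reproducibility.
        config.relaxed_simd_deterministic(true);
        let deterministic = Engine::new(&config)?;

        Ok((native, deterministic))
    }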

* x64: Add x86-specific lowerings for relaxed simd

This commit builds on the prior commit and adds an array of `x86_*`
instructions to Cranelift which have semantics that match their
corresponding x86 equivalents. Translation for relaxed simd is then
additionally updated to conditionally generate different CLIF for
relaxed simd instructions depending on whether the target is x86 or not.
This means that for AArch64 no changes are made, but for x86 most relaxed
instructions now lower to an x86-specific equivalent with slightly
different semantics than the "deterministic" lowering.

* Add libcall support for fma to Wasmtime

This will be required to implement the `f32x4.relaxed_madd` instruction
(and others) when an x86 host doesn't specify the `has_fma` feature.
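
The host-side shims are one-liners since Rust's `mul_add` is a fused
(single-rounding) multiply-add; this mirrors the `libcalls::relocs`
additions later in this diff:

    // Relocations against `LibCall::FmaF32`/`FmaF64` resolve to these
    // functions when the target has no native FMA instruction.
    pub extern "C" fn fmaf32(a: f32, b: f32, c: f32) -> f32 {
        a.mul_add(b, c)
    }
    pub extern "C" fn fmaf64(a: f64, b: f64, c: f64) -> f64 {
        a.mul_add(b, c)
    }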

* Ignore relaxed-simd tests on s390x and riscv64

* Enable relaxed-simd tests on s390x

* Update cranelift/codegen/meta/src/shared/instructions.rs

Co-authored-by: Andrew Brown <andrew.brown@intel.com>

* Add a FIXME from review

* Add notes about deterministic semantics

* Don't default `has_native_fma` to `true`

* Review comments and rebase fixes

---------

Co-authored-by: Andrew Brown <andrew.brown@intel.com>
Alex Crichton
2023-03-07 09:52:41 -06:00
committed by GitHub
parent e2dcb19099
commit 8bb183f16e
34 changed files with 1727 additions and 37 deletions

View File

@@ -30,6 +30,12 @@ fn main() -> anyhow::Result<()> {
test_directory_module(out, "tests/misc_testsuite/threads", strategy)?;
test_directory_module(out, "tests/misc_testsuite/memory64", strategy)?;
test_directory_module(out, "tests/misc_testsuite/component-model", strategy)?;
// NB: these are copied from upstream and updated to wasmtime's
// current version of `wast`. This local copy should go away when
// all of Wasmtime's tooling is updated and the upstream
// `testsuite` module is additionally updated.
test_directory_module(out, "tests/misc_testsuite/relaxed-simd", strategy)?;
Ok(())
})?;
@@ -64,6 +70,7 @@ fn main() -> anyhow::Result<()> {
drop(Command::new("rustfmt").arg(&output).status());
Ok(())
}
fn test_directory_module(
out: &mut String,
path: impl AsRef<Path>,
@@ -182,7 +189,9 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
// Currently the simd wasm proposal is not implemented in the riscv64
// backend so skip all tests which could use simd.
"riscv64" => {
testsuite == "simd" || testname.contains("simd") || testname.contains("memory_multi")
testsuite.contains("simd")
|| testname.contains("simd")
|| testname.contains("memory_multi")
}
_ => false,

View File

@@ -386,6 +386,27 @@ fn define_simd_lane_access(
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"x86_pshufb",
r#"
A vector swizzle lookalike which has the semantics of `pshufb` on x64.
This instruction will permute the 8-bit lanes of `x` with the indices
specified in `y`. Each lane in the mask, `y`, uses the bottom four
bits for selecting the lane from `x` unless the most significant bit
is set, in which case the lane is zeroed. The output vector will have
the following contents when the element of `y` is in these ranges:
* `[0, 127]` -> `x[y[i] % 16]`
* `[128, 255]` -> 0
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
let x = &Operand::new("x", TxN).with_doc("The vector to modify");
let y = &Operand::new("y", &TxN.lane_of()).with_doc("New lane value");
let Idx = &Operand::new("Idx", &imm.uimm8).with_doc("Lane index");
@@ -1436,7 +1457,7 @@ pub(crate) fn define(
Conditional select of bits.
For each bit in `c`, this instruction selects the corresponding bit from `x` if the bit
in `c` is 1 and the corresponding bit from `y` if the bit in `c` is 0. See also:
in `x` is 1 and the corresponding bit from `y` if the bit in `c` is 0. See also:
`select`, `vselect`.
"#,
&formats.ternary,
@@ -1445,6 +1466,24 @@ pub(crate) fn define(
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"x86_blendv",
r#"
A bitselect-lookalike instruction except with the semantics of
`blendv`-related instructions on x86.
This instruction will use the top bit of each lane in `c`, the condition
mask. If the bit is 1 then the corresponding lane from `x` is chosen.
Otherwise the corresponding lane from `y` is chosen.
"#,
&formats.ternary,
)
.operands_in(vec![c, x, y])
.operands_out(vec![a]),
);
let c = &Operand::new("c", &TxN.as_bool()).with_doc("Controlling vector");
let x = &Operand::new("x", TxN).with_doc("Value to use where `c` is true");
let y = &Operand::new("y", TxN).with_doc("Value to use where `c` is false");
@@ -1698,6 +1737,22 @@ pub(crate) fn define(
.operands_out(vec![qa]),
);
ig.push(
Inst::new(
"x86_pmulhrsw",
r#"
A similar instruction to `sqmul_round_sat` except with the semantics
of x86's `pmulhrsw` instruction.
This is the same as `sqmul_round_sat` except when both input lanes are
`i16::MIN`.
"#,
&formats.binary,
)
.operands_in(vec![qx, qy])
.operands_out(vec![qa]),
);
{
// Integer division and remainder are scalar-only; most
// hardware does not directly support vector integer division.
@@ -3135,6 +3190,36 @@ pub(crate) fn define(
.operands_out(vec![a]),
);
let I8x16 = &TypeVar::new(
"I8x16",
"A SIMD vector type consisting of 16 lanes of 8-bit integers",
TypeSetBuilder::new()
.ints(8..8)
.simd_lanes(16..16)
.includes_scalars(false)
.build(),
);
let x = &Operand::new("x", I8x16);
let y = &Operand::new("y", I8x16);
let a = &Operand::new("a", I16x8);
ig.push(
Inst::new(
"x86_pmaddubsw",
r#"
An instruction with equivalent semantics to `pmaddubsw` on x86.
This instruction will take signed bytes from the first argument and
multiply them against unsigned bytes in the second argument. Adjacent
pairs are then added, with saturating, to a 16-bit value and are packed
into the result.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
let IntTo = &TypeVar::new(
"IntTo",
"A larger integer type with the same number of lanes",
@@ -3378,6 +3463,20 @@ pub(crate) fn define(
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"x86_cvtt2dq",
r#"
A float-to-integer conversion instruction for vectors-of-floats which
has the same semantics as `cvttp{s,d}2dq` on x86. This specifically
returns `INT_MIN` for NaN or out-of-bounds lanes.
"#,
&formats.unary,
)
.operands_in(vec![x])
.operands_out(vec![a]),
);
let Int = &TypeVar::new(
"Int",
"A scalar or vector integer type",

View File

@@ -214,6 +214,10 @@ impl TargetIsa for AArch64Backend {
cs.set_skipdata(true)?;
Ok(cs)
}
fn has_native_fma(&self) -> bool {
true
}
}
impl fmt::Display for AArch64Backend {

View File

@@ -315,6 +315,13 @@ pub trait TargetIsa: fmt::Display + Send + Sync {
fn to_capstone(&self) -> Result<capstone::Capstone, capstone::Error> {
Err(capstone::Error::UnsupportedArch)
}
/// Returns whether this ISA has a native fused-multiply-and-add instruction
/// for floats.
///
/// Currently this only returns false on x86 when some native features are
/// not detected.
fn has_native_fma(&self) -> bool;
}
/// Methods implemented for free for target ISA!

View File

@@ -186,6 +186,10 @@ impl TargetIsa for Riscv64Backend {
cs.set_skipdata(true)?;
Ok(cs)
}
fn has_native_fma(&self) -> bool {
true
}
}
impl fmt::Display for Riscv64Backend {

View File

@@ -186,6 +186,10 @@ impl TargetIsa for S390xBackend {
Ok(cs)
}
fn has_native_fma(&self) -> bool {
true
}
}
impl fmt::Display for S390xBackend {

View File

@@ -1212,6 +1212,20 @@
(decl pure vconst_all_ones_or_all_zeros () Constant)
(extern extractor vconst_all_ones_or_all_zeros vconst_all_ones_or_all_zeros)
;;;; Rules for `x86_blendv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $I8X16
(x86_blendv condition if_true if_false)))
(x64_pblendvb if_false if_true condition))
(rule (lower (has_type $I32X4
(x86_blendv condition if_true if_false)))
(x64_blendvps if_false if_true condition))
(rule (lower (has_type $I64X2
(x86_blendv condition if_true if_false)))
(x64_blendvpd if_false if_true condition))
;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty @ (multi_lane _bits _lanes)
@@ -2145,6 +2159,11 @@
(rule (lower (debugtrap))
(side_effect (x64_hlt)))
;; Rules for `x86_pmaddubsw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $I16X8 (x86_pmaddubsw x y)))
(x64_pmaddubsw y x))
;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $F32 (fadd x y)))
@@ -3169,6 +3188,11 @@
;; values greater than max signed int.
(x64_paddd tmp1 dst)))
;; Rules for `x86_cvtt2dq` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $I32X4 (x86_cvtt2dq val @ (value_type $F32X4))))
(x64_cvttps2dq val))
;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $I16X8 (iadd_pairwise x y)))
@@ -3304,6 +3328,12 @@
(dst Xmm (x64_minpd a tmp1)))
(x64_cvttpd2dq dst)))
;; This rule is a special case for handling the translation of the wasm op
;; `i32x4.relaxed_trunc_f64x2_s_zero`.
(rule (lower (has_type $I32X4 (snarrow (has_type $I64X2 (x86_cvtt2dq val))
(vconst (u128_from_constant 0)))))
(x64_cvttpd2dq val))
;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $I8X16 (unarrow a @ (value_type $I16X8) b)))
@@ -3559,6 +3589,11 @@
(let ((mask Xmm (x64_paddusb mask (swizzle_zero_mask))))
(x64_pshufb src mask)))
;; Rules for `x86_pshufb` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (x86_pshufb src mask))
(x64_pshufb src mask))
;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Remove the extractlane instruction, leaving the float where it is. The upper
@@ -3736,7 +3771,12 @@
(cmp Xmm (x64_pcmpeqw dst mask)))
(x64_pxor dst cmp)))
;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rules for `x86_pmulhrsw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (x86_pmulhrsw qx @ (value_type $I16X8) qy))
(x64_pmulhrsw qx qy))
;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO: currently we only lower a special case of `uunarrow` needed to support
;; the translation of wasm's i32x4.trunc_sat_f64x2_u_zero operation.

View File

@@ -184,6 +184,10 @@ impl TargetIsa for X64Backend {
.syntax(arch::x86::ArchSyntax::Att)
.build()
}
fn has_native_fma(&self) -> bool {
self.x64_flags.use_fma()
}
}
impl fmt::Display for X64Backend {

View File

@@ -0,0 +1,87 @@
;;! target = "aarch64"
;;! compile = true
(module
(func (param v128) (result v128)
local.get 0
i32x4.relaxed_trunc_f32x4_s
)
(func (param v128) (result v128)
local.get 0
i32x4.relaxed_trunc_f32x4_u
)
(func (param v128) (result v128)
local.get 0
i32x4.relaxed_trunc_f64x2_s_zero
)
(func (param v128) (result v128)
local.get 0
i32x4.relaxed_trunc_f64x2_u_zero
)
(func (param v128 v128) (result v128)
local.get 0
local.get 1
i16x8.relaxed_dot_i8x16_i7x16_s
)
(func (param v128 v128 v128) (result v128)
local.get 0
local.get 1
local.get 2
i32x4.relaxed_dot_i8x16_i7x16_add_s
)
)
;; function u0:0:
;; block0:
;; fcvtzs v0.4s, v0.4s
;; b label1
;; block1:
;; ret
;;
;; function u0:1:
;; block0:
;; fcvtzu v0.4s, v0.4s
;; b label1
;; block1:
;; ret
;;
;; function u0:2:
;; block0:
;; fcvtzs v4.2d, v0.2d
;; sqxtn v0.2s, v4.2d
;; b label1
;; block1:
;; ret
;;
;; function u0:3:
;; block0:
;; fcvtzu v4.2d, v0.2d
;; uqxtn v0.2s, v4.2d
;; b label1
;; block1:
;; ret
;;
;; function u0:4:
;; block0:
;; smull v6.8h, v0.8b, v1.8b
;; smull2 v7.8h, v0.16b, v1.16b
;; addp v0.8h, v6.8h, v7.8h
;; b label1
;; block1:
;; ret
;;
;; function u0:5:
;; block0:
;; smull v17.8h, v0.8b, v1.8b
;; smull2 v18.8h, v0.16b, v1.16b
;; addp v17.8h, v17.8h, v18.8h
;; saddlp v17.4s, v17.8h
;; add v0.4s, v17.4s, v2.4s
;; b label1
;; block1:
;; ret

View File

@@ -0,0 +1,161 @@
;;! target = "x86_64"
;;! compile = true
;;! relaxed_simd_deterministic = true
;;! settings = ["has_avx=true"]
(module
(func (param v128) (result v128)
local.get 0
i32x4.relaxed_trunc_f32x4_s
)
(func (param v128) (result v128)
local.get 0
i32x4.relaxed_trunc_f32x4_u
)
(func (param v128) (result v128)
local.get 0
i32x4.relaxed_trunc_f64x2_s_zero
)
(func (param v128) (result v128)
local.get 0
i32x4.relaxed_trunc_f64x2_u_zero
)
(func (param v128 v128) (result v128)
local.get 0
local.get 1
i16x8.relaxed_dot_i8x16_i7x16_s
)
(func (param v128 v128 v128) (result v128)
local.get 0
local.get 1
local.get 2
i32x4.relaxed_dot_i8x16_i7x16_add_s
)
)
;; function u0:0:
;; pushq %rbp
;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
;; movq %rsp, %rbp
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
;; block0:
;; vcmpps $0 %xmm0, %xmm0, %xmm3
;; vandps %xmm0, %xmm3, %xmm5
;; vpxor %xmm3, %xmm5, %xmm7
;; vcvttps2dq %xmm5, %xmm9
;; vpand %xmm9, %xmm7, %xmm11
;; vpsrad %xmm11, $31, %xmm13
;; vpxor %xmm13, %xmm9, %xmm0
;; jmp label1
;; block1:
;; movq %rbp, %rsp
;; popq %rbp
;; ret
;;
;; function u0:1:
;; pushq %rbp
;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
;; movq %rsp, %rbp
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
;; block0:
;; xorps %xmm3, %xmm3, %xmm3
;; vmaxps %xmm0, %xmm3, %xmm5
;; vpcmpeqd %xmm3, %xmm3, %xmm7
;; vpsrld %xmm7, $1, %xmm9
;; vcvtdq2ps %xmm9, %xmm11
;; vcvttps2dq %xmm5, %xmm13
;; vsubps %xmm5, %xmm11, %xmm15
;; vcmpps $2 %xmm11, %xmm15, %xmm1
;; vcvttps2dq %xmm15, %xmm3
;; vpxor %xmm3, %xmm1, %xmm5
;; pxor %xmm7, %xmm7, %xmm7
;; vpmaxsd %xmm5, %xmm7, %xmm9
;; vpaddd %xmm9, %xmm13, %xmm0
;; jmp label1
;; block1:
;; movq %rbp, %rsp
;; popq %rbp
;; ret
;;
;; function u0:2:
;; pushq %rbp
;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
;; movq %rsp, %rbp
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
;; block0:
;; vcmppd $0 %xmm0, %xmm0, %xmm3
;; vandps %xmm3, const(0), %xmm5
;; vminpd %xmm0, %xmm5, %xmm7
;; vcvttpd2dq %xmm7, %xmm0
;; jmp label1
;; block1:
;; movq %rbp, %rsp
;; popq %rbp
;; ret
;;
;; function u0:3:
;; pushq %rbp
;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
;; movq %rsp, %rbp
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
;; block0:
;; xorpd %xmm3, %xmm3, %xmm3
;; vmaxpd %xmm0, %xmm3, %xmm5
;; vminpd %xmm5, const(0), %xmm7
;; vroundpd $3, %xmm7, %xmm9
;; vaddpd %xmm9, const(1), %xmm11
;; vshufps $136 %xmm11, %xmm3, %xmm0
;; jmp label1
;; block1:
;; movq %rbp, %rsp
;; popq %rbp
;; ret
;;
;; function u0:4:
;; pushq %rbp
;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
;; movq %rsp, %rbp
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
;; block0:
;; vpmovsxbw %xmm0, %xmm10
;; vpmovsxbw %xmm1, %xmm12
;; vpmullw %xmm10, %xmm12, %xmm14
;; vpalignr $8 %xmm0, %xmm0, %xmm8
;; vpmovsxbw %xmm8, %xmm10
;; vpalignr $8 %xmm1, %xmm1, %xmm12
;; vpmovsxbw %xmm12, %xmm15
;; vpmullw %xmm10, %xmm15, %xmm0
;; vphaddw %xmm14, %xmm0, %xmm0
;; jmp label1
;; block1:
;; movq %rbp, %rsp
;; popq %rbp
;; ret
;;
;; function u0:5:
;; pushq %rbp
;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
;; movq %rsp, %rbp
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
;; block0:
;; vpmovsxbw %xmm0, %xmm13
;; vpmovsxbw %xmm1, %xmm15
;; vpmullw %xmm13, %xmm15, %xmm3
;; vpalignr $8 %xmm0, %xmm0, %xmm11
;; vpmovsxbw %xmm11, %xmm13
;; vpalignr $8 %xmm1, %xmm1, %xmm15
;; vpmovsxbw %xmm15, %xmm1
;; vpmullw %xmm13, %xmm1, %xmm4
;; vphaddw %xmm3, %xmm4, %xmm15
;; vpmaddwd %xmm15, const(0), %xmm15
;; vpaddd %xmm15, %xmm2, %xmm0
;; jmp label1
;; block1:
;; movq %rbp, %rsp
;; popq %rbp
;; ret

View File

@@ -0,0 +1,140 @@
;;! target = "x86_64"
;;! compile = true
(module
(func (param v128) (result v128)
local.get 0
i32x4.relaxed_trunc_f32x4_s
)
(func (param v128) (result v128)
local.get 0
i32x4.relaxed_trunc_f32x4_u
)
(func (param v128) (result v128)
local.get 0
i32x4.relaxed_trunc_f64x2_s_zero
)
(func (param v128) (result v128)
local.get 0
i32x4.relaxed_trunc_f64x2_u_zero
)
(func (param v128 v128) (result v128)
local.get 0
local.get 1
i16x8.relaxed_dot_i8x16_i7x16_s
)
(func (param v128 v128 v128) (result v128)
local.get 0
local.get 1
local.get 2
i32x4.relaxed_dot_i8x16_i7x16_add_s
)
)
;; function u0:0:
;; pushq %rbp
;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
;; movq %rsp, %rbp
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
;; block0:
;; cvttps2dq %xmm0, %xmm0
;; jmp label1
;; block1:
;; movq %rbp, %rsp
;; popq %rbp
;; ret
;;
;; function u0:1:
;; pushq %rbp
;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
;; movq %rsp, %rbp
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
;; block0:
;; xorps %xmm6, %xmm6, %xmm6
;; movdqa %xmm0, %xmm10
;; maxps %xmm10, %xmm6, %xmm10
;; pcmpeqd %xmm6, %xmm6, %xmm6
;; psrld %xmm6, $1, %xmm6
;; cvtdq2ps %xmm6, %xmm14
;; cvttps2dq %xmm10, %xmm13
;; subps %xmm10, %xmm14, %xmm10
;; cmpps $2, %xmm14, %xmm10, %xmm14
;; cvttps2dq %xmm10, %xmm0
;; pxor %xmm0, %xmm14, %xmm0
;; pxor %xmm7, %xmm7, %xmm7
;; pmaxsd %xmm0, %xmm7, %xmm0
;; paddd %xmm0, %xmm13, %xmm0
;; jmp label1
;; block1:
;; movq %rbp, %rsp
;; popq %rbp
;; ret
;;
;; function u0:2:
;; pushq %rbp
;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
;; movq %rsp, %rbp
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
;; block0:
;; cvttpd2dq %xmm0, %xmm0
;; jmp label1
;; block1:
;; movq %rbp, %rsp
;; popq %rbp
;; ret
;;
;; function u0:3:
;; pushq %rbp
;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
;; movq %rsp, %rbp
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
;; block0:
;; xorpd %xmm3, %xmm3, %xmm3
;; movdqa %xmm0, %xmm6
;; maxpd %xmm6, %xmm3, %xmm6
;; minpd %xmm6, const(0), %xmm6
;; roundpd $3, %xmm6, %xmm0
;; addpd %xmm0, const(1), %xmm0
;; shufps $136, %xmm0, %xmm3, %xmm0
;; jmp label1
;; block1:
;; movq %rbp, %rsp
;; popq %rbp
;; ret
;;
;; function u0:4:
;; pushq %rbp
;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
;; movq %rsp, %rbp
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
;; block0:
;; movdqa %xmm1, %xmm4
;; pmaddubsw %xmm4, %xmm0, %xmm4
;; movdqa %xmm4, %xmm0
;; jmp label1
;; block1:
;; movq %rbp, %rsp
;; popq %rbp
;; ret
;;
;; function u0:5:
;; pushq %rbp
;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
;; movq %rsp, %rbp
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
;; block0:
;; movdqa %xmm0, %xmm8
;; movdqa %xmm1, %xmm0
;; pmaddubsw %xmm0, %xmm8, %xmm0
;; pmaddwd %xmm0, const(0), %xmm0
;; paddd %xmm0, %xmm2, %xmm0
;; jmp label1
;; block1:
;; movq %rbp, %rsp
;; popq %rbp
;; ret

View File

@@ -29,6 +29,9 @@ pub struct TestConfig {
#[serde(default)]
pub heaps: Vec<TestHeap>,
#[serde(default)]
pub relaxed_simd_deterministic: bool,
}
impl TestConfig {

View File

@@ -82,6 +82,7 @@ impl<'data> ModuleEnvironment<'data> for ModuleEnv {
wasmparser::WasmFeatures {
memory64: true,
multi_memory: true,
relaxed_simd: true,
..self.inner.wasm_features()
}
}
@@ -613,4 +614,12 @@ impl<'a> FuncEnvironment for FuncEnv<'a> {
{
self.inner.heaps()
}
fn relaxed_simd_deterministic(&self) -> bool {
self.config.relaxed_simd_deterministic
}
fn is_x86(&self) -> bool {
self.config.target.contains("x86_64")
}
}

View File

@@ -1358,6 +1358,11 @@ where
Opcode::GetFramePointer => unimplemented!("GetFramePointer"),
Opcode::GetStackPointer => unimplemented!("GetStackPointer"),
Opcode::GetReturnAddress => unimplemented!("GetReturnAddress"),
Opcode::X86Pshufb => unimplemented!("X86Pshufb"),
Opcode::X86Blendv => unimplemented!("X86Blendv"),
Opcode::X86Pmulhrsw => unimplemented!("X86Pmulhrsw"),
Opcode::X86Pmaddubsw => unimplemented!("X86Pmaddubsw"),
Opcode::X86Cvtt2dq => unimplemented!("X86Cvtt2dq"),
})
}

View File

@@ -1778,13 +1778,10 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
state.push1(builder.ins().sshr(bitcast_a, b))
}
Operator::V128Bitselect => {
let (a, b, c) = state.pop3();
let bitcast_a = optionally_bitcast_vector(a, I8X16, builder);
let bitcast_b = optionally_bitcast_vector(b, I8X16, builder);
let bitcast_c = optionally_bitcast_vector(c, I8X16, builder);
let (a, b, c) = pop3_with_bitcast(state, I8X16, builder);
// The CLIF operand ordering is slightly different and the types of all three
// operands must match (hence the bitcast).
state.push1(builder.ins().bitselect(bitcast_c, bitcast_a, bitcast_b))
state.push1(builder.ins().bitselect(c, a, b))
}
Operator::V128AnyTrue => {
let a = pop1_with_bitcast(state, type_of(op), builder);
@@ -1938,11 +1935,23 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
state.push1(builder.ins().snarrow(converted_a, zero));
}
Operator::I32x4TruncSatF32x4U => {
// FIXME(#5913): the relaxed instructions here are translated the same
// as the saturating instructions, even when the code generator
// configuration allow for different semantics across hosts. On x86,
// however, it's theoretically possible to have a slightly more optimal
// lowering which accounts for NaN differently, although the lowering is
// still not trivial (e.g. one instruction). At this time the
// more-optimal-but-still-large lowering for x86 is not implemented so
// the relaxed instructions are listed here instead of down below with
// the other relaxed instructions. An x86-specific implementation (or
// perhaps for other backends too) should be added and the codegen for
// the relaxed instruction should conditionally be different.
Operator::I32x4RelaxedTruncF32x4U | Operator::I32x4TruncSatF32x4U => {
let a = pop1_with_bitcast(state, F32X4, builder);
state.push1(builder.ins().fcvt_to_uint_sat(I32X4, a))
}
Operator::I32x4TruncSatF64x2UZero => {
Operator::I32x4RelaxedTruncF64x2UZero | Operator::I32x4TruncSatF64x2UZero => {
let a = pop1_with_bitcast(state, F64X2, builder);
let converted_a = builder.ins().fcvt_to_uint_sat(I64X2, a);
let handle = builder.func.dfg.constants.insert(vec![0u8; 16].into());
@@ -1950,6 +1959,7 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
state.push1(builder.ins().uunarrow(converted_a, zero));
}
Operator::I8x16NarrowI16x8S => {
let (a, b) = pop2_with_bitcast(state, I16X8, builder);
state.push1(builder.ins().snarrow(a, b))
@@ -2156,27 +2166,175 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
op
));
}
Operator::I8x16RelaxedSwizzle
| Operator::I32x4RelaxedTruncF32x4S
| Operator::I32x4RelaxedTruncF32x4U
| Operator::I32x4RelaxedTruncF64x2SZero
| Operator::I32x4RelaxedTruncF64x2UZero
| Operator::F32x4RelaxedMadd
| Operator::F32x4RelaxedNmadd
| Operator::F64x2RelaxedMadd
| Operator::F64x2RelaxedNmadd
| Operator::I8x16RelaxedLaneselect
Operator::F32x4RelaxedMax | Operator::F64x2RelaxedMax => {
let (a, b) = pop2_with_bitcast(state, type_of(op), builder);
state.push1(
if environ.relaxed_simd_deterministic() || !environ.is_x86() {
// Deterministic semantics match the `fmax` instruction, or
// the `fAAxBB.max` wasm instruction.
builder.ins().fmax(a, b)
} else {
builder.ins().fmax_pseudo(a, b)
},
)
}
Operator::F32x4RelaxedMin | Operator::F64x2RelaxedMin => {
let (a, b) = pop2_with_bitcast(state, type_of(op), builder);
state.push1(
if environ.relaxed_simd_deterministic() || !environ.is_x86() {
// Deterministic semantics match the `fmin` instruction, or
// the `fAAxBB.min` wasm instruction.
builder.ins().fmin(a, b)
} else {
builder.ins().fmin_pseudo(a, b)
},
);
}
Operator::I8x16RelaxedSwizzle => {
let (a, b) = pop2_with_bitcast(state, I8X16, builder);
state.push1(
if environ.relaxed_simd_deterministic() || !environ.is_x86() {
// Deterministic semantics match the `i8x16.swizzle`
// instruction which is the CLIF `swizzle`.
builder.ins().swizzle(a, b)
} else {
builder.ins().x86_pshufb(a, b)
},
);
}
Operator::F32x4RelaxedMadd | Operator::F64x2RelaxedMadd => {
let (a, b, c) = pop3_with_bitcast(state, type_of(op), builder);
state.push1(
if environ.relaxed_simd_deterministic() || environ.has_native_fma() {
// Deterministic semantics are "fused multiply and add"
// which the CLIF `fma` guarantees.
builder.ins().fma(a, b, c)
} else {
let mul = builder.ins().fmul(a, b);
builder.ins().fadd(mul, c)
},
);
}
Operator::F32x4RelaxedNmadd | Operator::F64x2RelaxedNmadd => {
let (a, b, c) = pop3_with_bitcast(state, type_of(op), builder);
let a = builder.ins().fneg(a);
state.push1(
if environ.relaxed_simd_deterministic() || environ.has_native_fma() {
// Deterministic semantics are "fused multiply and add"
// which the CLIF `fma` guarantees.
builder.ins().fma(a, b, c)
} else {
let mul = builder.ins().fmul(a, b);
builder.ins().fadd(mul, c)
},
);
}
Operator::I8x16RelaxedLaneselect
| Operator::I16x8RelaxedLaneselect
| Operator::I32x4RelaxedLaneselect
| Operator::I64x2RelaxedLaneselect
| Operator::F32x4RelaxedMin
| Operator::F32x4RelaxedMax
| Operator::F64x2RelaxedMin
| Operator::F64x2RelaxedMax
| Operator::I16x8RelaxedQ15mulrS
| Operator::I16x8RelaxedDotI8x16I7x16S
| Operator::I32x4RelaxedDotI8x16I7x16AddS => {
return Err(wasm_unsupported!("proposed relaxed-simd operator {:?}", op));
| Operator::I64x2RelaxedLaneselect => {
let ty = type_of(op);
let (a, b, c) = pop3_with_bitcast(state, ty, builder);
// Note that the variable swaps here are intentional due to
// the difference of the order of the wasm op and the clif
// op.
//
// Additionally note that even on x86 the I16X8 type uses the
// `bitselect` instruction since x86 has no corresponding
// `blendv`-style instruction for 16-bit operands.
state.push1(
if environ.relaxed_simd_deterministic() || !environ.is_x86() || ty == I16X8 {
// Deterministic semantics are a `bitselect` along the lines
// of the wasm `v128.bitselect` instruction.
builder.ins().bitselect(c, a, b)
} else {
builder.ins().x86_blendv(c, a, b)
},
);
}
Operator::I32x4RelaxedTruncF32x4S => {
let a = pop1_with_bitcast(state, F32X4, builder);
state.push1(
if environ.relaxed_simd_deterministic() || !environ.is_x86() {
// Deterministic semantics are to match the
// `i32x4.trunc_sat_f32x4_s` instruction.
builder.ins().fcvt_to_sint_sat(I32X4, a)
} else {
builder.ins().x86_cvtt2dq(I32X4, a)
},
)
}
Operator::I32x4RelaxedTruncF64x2SZero => {
let a = pop1_with_bitcast(state, F64X2, builder);
let converted_a = if environ.relaxed_simd_deterministic() || !environ.is_x86() {
// Deterministic semantics are to match the
// `i32x4.trunc_sat_f64x2_s_zero` instruction.
builder.ins().fcvt_to_sint_sat(I64X2, a)
} else {
builder.ins().x86_cvtt2dq(I64X2, a)
};
let handle = builder.func.dfg.constants.insert(vec![0u8; 16].into());
let zero = builder.ins().vconst(I64X2, handle);
state.push1(builder.ins().snarrow(converted_a, zero));
}
Operator::I16x8RelaxedQ15mulrS => {
let (a, b) = pop2_with_bitcast(state, I16X8, builder);
state.push1(
if environ.relaxed_simd_deterministic() || !environ.is_x86() {
// Deterministic semantics are to match the
// `i16x8.q15mulr_sat_s` instruction.
builder.ins().sqmul_round_sat(a, b)
} else {
builder.ins().x86_pmulhrsw(a, b)
},
);
}
Operator::I16x8RelaxedDotI8x16I7x16S => {
let (a, b) = pop2_with_bitcast(state, I8X16, builder);
state.push1(
if environ.relaxed_simd_deterministic() || !environ.is_x86() {
// Deterministic semantics are to treat both operands as
// signed integers and perform the dot product.
let alo = builder.ins().swiden_low(a);
let blo = builder.ins().swiden_low(b);
let lo = builder.ins().imul(alo, blo);
let ahi = builder.ins().swiden_high(a);
let bhi = builder.ins().swiden_high(b);
let hi = builder.ins().imul(ahi, bhi);
builder.ins().iadd_pairwise(lo, hi)
} else {
builder.ins().x86_pmaddubsw(a, b)
},
);
}
Operator::I32x4RelaxedDotI8x16I7x16AddS => {
let c = pop1_with_bitcast(state, I32X4, builder);
let (a, b) = pop2_with_bitcast(state, I8X16, builder);
let dot = if environ.relaxed_simd_deterministic() || !environ.is_x86() {
// Deterministic semantics are to treat both operands as
// signed integers and perform the dot product.
let alo = builder.ins().swiden_low(a);
let blo = builder.ins().swiden_low(b);
let lo = builder.ins().imul(alo, blo);
let ahi = builder.ins().swiden_high(a);
let bhi = builder.ins().swiden_high(b);
let hi = builder.ins().imul(ahi, bhi);
builder.ins().iadd_pairwise(lo, hi)
} else {
builder.ins().x86_pmaddubsw(a, b)
};
let dotlo = builder.ins().swiden_low(dot);
let dothi = builder.ins().swiden_high(dot);
let dot32 = builder.ins().iadd_pairwise(dotlo, dothi);
state.push1(builder.ins().iadd(dot32, c));
}
Operator::CallRef { .. }
@@ -2945,7 +3103,8 @@ fn type_of(operator: &Operator) -> Type {
| Operator::I8x16MaxU
| Operator::I8x16AvgrU
| Operator::I8x16Bitmask
| Operator::I8x16Popcnt => I8X16,
| Operator::I8x16Popcnt
| Operator::I8x16RelaxedLaneselect => I8X16,
Operator::I16x8Splat
| Operator::V128Load16Splat { .. }
@@ -2982,7 +3141,8 @@ fn type_of(operator: &Operator) -> Type {
| Operator::I16x8MaxU
| Operator::I16x8AvgrU
| Operator::I16x8Mul
| Operator::I16x8Bitmask => I16X8,
| Operator::I16x8Bitmask
| Operator::I16x8RelaxedLaneselect => I16X8,
Operator::I32x4Splat
| Operator::V128Load32Splat { .. }
@@ -3016,6 +3176,7 @@ fn type_of(operator: &Operator) -> Type {
| Operator::I32x4Bitmask
| Operator::I32x4TruncSatF32x4S
| Operator::I32x4TruncSatF32x4U
| Operator::I32x4RelaxedLaneselect
| Operator::V128Load32Zero { .. } => I32X4,
Operator::I64x2Splat
@@ -3040,6 +3201,7 @@ fn type_of(operator: &Operator) -> Type {
| Operator::I64x2Sub
| Operator::I64x2Mul
| Operator::I64x2Bitmask
| Operator::I64x2RelaxedLaneselect
| Operator::V128Load64Zero { .. } => I64X2,
Operator::F32x4Splat
@@ -3067,7 +3229,11 @@ fn type_of(operator: &Operator) -> Type {
| Operator::F32x4Ceil
| Operator::F32x4Floor
| Operator::F32x4Trunc
| Operator::F32x4Nearest => F32X4,
| Operator::F32x4Nearest
| Operator::F32x4RelaxedMax
| Operator::F32x4RelaxedMin
| Operator::F32x4RelaxedMadd
| Operator::F32x4RelaxedNmadd => F32X4,
Operator::F64x2Splat
| Operator::F64x2ExtractLane { .. }
@@ -3092,7 +3258,11 @@ fn type_of(operator: &Operator) -> Type {
| Operator::F64x2Ceil
| Operator::F64x2Floor
| Operator::F64x2Trunc
| Operator::F64x2Nearest => F64X2,
| Operator::F64x2Nearest
| Operator::F64x2RelaxedMax
| Operator::F64x2RelaxedMin
| Operator::F64x2RelaxedMadd
| Operator::F64x2RelaxedNmadd => F64X2,
_ => unimplemented!(
"Currently only SIMD instructions are mapped to their return type; the \
@@ -3219,6 +3389,18 @@ fn pop2_with_bitcast(
(bitcast_a, bitcast_b)
}
fn pop3_with_bitcast(
state: &mut FuncTranslationState,
needed_type: Type,
builder: &mut FunctionBuilder,
) -> (Value, Value, Value) {
let (a, b, c) = state.pop3();
let bitcast_a = optionally_bitcast_vector(a, needed_type, builder);
let bitcast_b = optionally_bitcast_vector(b, needed_type, builder);
let bitcast_c = optionally_bitcast_vector(c, needed_type, builder);
(bitcast_a, bitcast_b, bitcast_c)
}
fn bitcast_arguments<'a>(
builder: &FunctionBuilder,
arguments: &'a mut [Value],

View File

@@ -525,6 +525,27 @@ pub trait FuncEnvironment: TargetEnvironment {
/// Returns the target ISA's condition to check for unsigned addition
/// overflowing.
fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC;
/// Whether or not to force relaxed simd instructions to have deterministic
/// lowerings meaning they will produce the same results across all hosts,
/// regardless of the cost to performance.
fn relaxed_simd_deterministic(&self) -> bool {
true
}
/// Whether or not the target being translated for has a native fma
/// instruction. If it does not then when relaxed simd isn't deterministic
/// the translation of the `f32x4.relaxed_fma` instruction, for example,
/// will do a multiplication and then an add instead of the fused version.
fn has_native_fma(&self) -> bool {
false
}
/// Returns whether this is an x86 target, which may alter lowerings of
/// relaxed simd instructions.
fn is_x86(&self) -> bool {
false
}
}
/// An object satisfying the `ModuleEnvironment` trait can be passed as argument to the

View File

@@ -35,6 +35,10 @@ pub const SUPPORTED_WASM_FEATURES: &[(&str, &str)] = &[
("multi-value", "enables support for multi-value functions"),
("reference-types", "enables support for reference types"),
("simd", "enables support for proposed SIMD instructions"),
(
"relaxed-simd",
"enables support for the relaxed simd proposal",
),
("threads", "enables support for WebAssembly threads"),
("memory64", "enables support for 64-bit memories"),
#[cfg(feature = "component-model")]
@@ -235,6 +239,17 @@ pub struct CommonOptions {
/// stack overflow is reported.
#[clap(long)]
pub max_wasm_stack: Option<usize>,
/// Whether or not to force deterministic and host-independent behavior of
/// the relaxed-simd instructions.
///
/// By default these instructions may have architecture-specific behavior as
/// allowed by the specification, but this can be used to force the behavior
/// of these instructions to match the deterministic behavior classified in
/// the specification. Note that enabling this option may come at a
/// performance cost.
#[clap(long)]
pub relaxed_simd_deterministic: bool,
}
impl CommonOptions {
@@ -329,12 +344,15 @@ impl CommonOptions {
config.max_wasm_stack(max);
}
config.relaxed_simd_deterministic(self.relaxed_simd_deterministic);
Ok(config)
}
pub fn enable_wasm_features(&self, config: &mut Config) {
let WasmFeatures {
simd,
relaxed_simd,
bulk_memory,
reference_types,
multi_value,
@@ -348,6 +366,9 @@ impl CommonOptions {
if let Some(enable) = simd {
config.wasm_simd(enable);
}
if let Some(enable) = relaxed_simd {
config.wasm_relaxed_simd(enable);
}
if let Some(enable) = bulk_memory {
config.wasm_bulk_memory(enable);
}
@@ -400,6 +421,7 @@ pub struct WasmFeatures {
pub multi_value: Option<bool>,
pub bulk_memory: Option<bool>,
pub simd: Option<bool>,
pub relaxed_simd: Option<bool>,
pub threads: Option<bool>,
pub multi_memory: Option<bool>,
pub memory64: Option<bool>,
@@ -450,6 +472,7 @@ fn parse_wasm_features(features: &str) -> Result<WasmFeatures> {
multi_value: all.or(values["multi-value"]),
bulk_memory: all.or(values["bulk-memory"]),
simd: all.or(values["simd"]),
relaxed_simd: all.or(values["relaxed-simd"]),
threads: all.or(values["threads"]),
multi_memory: all.or(values["multi-memory"]),
memory64: all.or(values["memory64"]),
@@ -560,6 +583,7 @@ mod test {
multi_value,
bulk_memory,
simd,
relaxed_simd,
threads,
multi_memory,
memory64,
@@ -572,6 +596,7 @@ mod test {
assert_eq!(threads, Some(true));
assert_eq!(multi_memory, Some(true));
assert_eq!(memory64, Some(true));
assert_eq!(relaxed_simd, Some(true));
Ok(())
}
@@ -585,6 +610,7 @@ mod test {
multi_value,
bulk_memory,
simd,
relaxed_simd,
threads,
multi_memory,
memory64,
@@ -597,6 +623,7 @@ mod test {
assert_eq!(threads, Some(false));
assert_eq!(multi_memory, Some(false));
assert_eq!(memory64, Some(false));
assert_eq!(relaxed_simd, Some(false));
Ok(())
}
@@ -613,6 +640,7 @@ mod test {
multi_value,
bulk_memory,
simd,
relaxed_simd,
threads,
multi_memory,
memory64,
@@ -625,6 +653,7 @@ mod test {
assert_eq!(threads, None);
assert_eq!(multi_memory, Some(true));
assert_eq!(memory64, Some(true));
assert_eq!(relaxed_simd, None);
Ok(())
}
@@ -662,6 +691,7 @@ mod test {
feature_test!(test_multi_value_feature, multi_value, "multi-value");
feature_test!(test_bulk_memory_feature, bulk_memory, "bulk-memory");
feature_test!(test_simd_feature, simd, "simd");
feature_test!(test_relaxed_simd_feature, relaxed_simd, "relaxed-simd");
feature_test!(test_threads_feature, threads, "threads");
feature_test!(test_multi_memory_feature, multi_memory, "multi-memory");
feature_test!(test_memory64_feature, memory64, "memory64");

View File

@@ -2153,4 +2153,16 @@ impl<'module_environment> cranelift_wasm::FuncEnvironment for FuncEnvironment<'m
fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC {
self.isa.unsigned_add_overflow_condition()
}
fn relaxed_simd_deterministic(&self) -> bool {
self.tunables.relaxed_simd_deterministic
}
fn has_native_fma(&self) -> bool {
self.isa.has_native_fma()
}
fn is_x86(&self) -> bool {
self.isa.triple().architecture == target_lexicon::Architecture::X86_64
}
}

View File

@@ -545,6 +545,8 @@ fn libcall_name(call: LibCall) -> &'static str {
LibCall::CeilF64 => LC::CeilF64,
LibCall::TruncF32 => LC::TruncF32,
LibCall::TruncF64 => LC::TruncF64,
LibCall::FmaF32 => LC::FmaF32,
LibCall::FmaF64 => LC::FmaF64,
_ => panic!("unknown libcall to give a name to: {call:?}"),
};
other.symbol()

View File

@@ -166,4 +166,6 @@ libcalls! {
CeilF64 = "libcall_ceilf64"
TruncF32 = "libcall_truncf32"
TruncF64 = "libcall_truncf64"
FmaF32 = "libcall_fmaf32"
FmaF64 = "libcall_fmaf64"
}

View File

@@ -45,6 +45,10 @@ pub struct Tunables {
/// Flag for the component module whether adapter modules have debug
/// assertions baked into them.
pub debug_adapter_modules: bool,
/// Whether or not lowerings for relaxed simd instructions are forced to
/// be deterministic.
pub relaxed_simd_deterministic: bool,
}
impl Default for Tunables {
@@ -91,6 +95,7 @@ impl Default for Tunables {
guard_before_linear_memory: true,
generate_address_map: true,
debug_adapter_modules: false,
relaxed_simd_deterministic: false,
}
}
}

View File

@@ -296,6 +296,8 @@ impl CodeMemory {
obj::LibCall::CeilF64 => libcalls::relocs::ceilf64 as usize,
obj::LibCall::TruncF32 => libcalls::relocs::truncf32 as usize,
obj::LibCall::TruncF64 => libcalls::relocs::truncf64 as usize,
obj::LibCall::FmaF32 => libcalls::relocs::fmaf32 as usize,
obj::LibCall::FmaF64 => libcalls::relocs::fmaf64 as usize,
};
*self.mmap.as_mut_ptr().add(offset).cast::<usize>() = libcall;
}

View File

@@ -584,4 +584,12 @@ pub mod relocs {
(x.abs() + TOINT_64 - TOINT_64).copysign(x)
}
}
pub extern "C" fn fmaf32(a: f32, b: f32, c: f32) -> f32 {
a.mul_add(b, c)
}
pub extern "C" fn fmaf64(a: f64, b: f64, c: f64) -> f64 {
a.mul_add(b, c)
}
}

View File

@@ -682,6 +682,56 @@ impl Config {
self
}
/// Configures whether the WebAssembly Relaxed SIMD proposal will be
/// enabled for compilation.
///
/// The [WebAssembly Relaxed SIMD proposal][proposal] is not, at the time of
/// this writing, at stage 4. The relaxed SIMD proposal adds new
/// instructions to WebAssembly which, for some specific inputs, are allowed
/// to produce different results on different hosts. More-or-less this
/// proposal enables exposing platform-specific semantics of SIMD
/// instructions in a controlled fashion to a WebAssembly program. From an
/// embedder's perspective this means that WebAssembly programs may execute
/// differently depending on whether the host is x86_64 or AArch64, for
/// example.
///
/// By default Wasmtime lowers relaxed SIMD instructions to the fastest
/// lowering for the platform it's running on. This means that, by default,
/// some relaxed SIMD instructions may have different results for the same
/// inputs across x86_64 and AArch64. This behavior can be disabled through
/// the [`Config::relaxed_simd_deterministic`] option which will force
/// deterministic behavior across all platforms, as classified by the
/// specification, at the cost of performance.
///
/// This is `false` by default.
///
/// [proposal]: https://github.com/webassembly/relaxed-simd
pub fn wasm_relaxed_simd(&mut self, enable: bool) -> &mut Self {
self.features.relaxed_simd = enable;
self
}
/// This option can be used to control the behavior of the [relaxed SIMD
/// proposal's][proposal] instructions.
///
/// The relaxed SIMD proposal introduces instructions that are allowed to
/// have different behavior on different architectures, primarily to afford
/// an efficient implementation on all architectures. This means, however,
/// that the same module may execute differently on one host than another,
/// which typically is not otherwise the case. This option is provided to
/// force Wasmtime to generate deterministic code for all relaxed simd
/// instructions, at the cost of performance, for all architectures. When
/// this option is enabled then the deterministic behavior of all
/// instructions in the relaxed SIMD proposal is selected.
///
/// This is `false` by default.
///
/// [proposal]: https://github.com/webassembly/relaxed-simd
pub fn relaxed_simd_deterministic(&mut self, enable: bool) -> &mut Self {
self.tunables.relaxed_simd_deterministic = enable;
self
}
/// Configures whether the [WebAssembly bulk memory operations
/// proposal][proposal] will be enabled for compilation.
///
@@ -1560,6 +1610,10 @@ impl Config {
}
}
if self.features.relaxed_simd && !self.features.simd {
bail!("cannot disable the simd proposal but enable the relaxed simd proposal");
}
// Apply compiler settings and flags
for (k, v) in self.compiler_config.settings.iter() {
compiler.set(k, v)?;
@@ -1608,6 +1662,7 @@ impl fmt::Debug for Config {
.field("wasm_reference_types", &self.features.reference_types)
.field("wasm_bulk_memory", &self.features.bulk_memory)
.field("wasm_simd", &self.features.simd)
.field("wasm_relaxed_simd", &self.features.relaxed_simd)
.field("wasm_multi_value", &self.features.multi_value)
.field(
"static_memory_maximum_size",

View File

@@ -309,6 +309,7 @@ impl Metadata {
epoch_interruption,
static_memory_bound_is_maximum,
guard_before_linear_memory,
relaxed_simd_deterministic,
// This doesn't affect compilation, it's just a runtime setting.
dynamic_memory_growth_reserve: _,
@@ -364,6 +365,11 @@ impl Metadata {
other.guard_before_linear_memory,
"guard before linear memory",
)?;
Self::check_bool(
relaxed_simd_deterministic,
other.relaxed_simd_deterministic,
"relaxed simd deterministic semantics",
)?;
Ok(())
}

View File

@@ -39,6 +39,14 @@ fn extract_lane_as_i64(bytes: u128, lane: usize) -> i64 {
pub fn match_val(actual: &Val, expected: &WastRetCore) -> Result<()> {
match (actual, expected) {
(_, WastRetCore::Either(expected)) => {
for expected in expected {
if match_val(actual, expected).is_ok() {
return Ok(());
}
}
match_val(actual, &expected[0])
}
(Val::I32(a), WastRetCore::I32(b)) => match_int(a, b),
(Val::I64(a), WastRetCore::I64(b)) => match_int(a, b),
// Note that these float comparisons are comparing bits, not float

View File

@@ -30,6 +30,7 @@ fn run_wast(wast: &str, strategy: Strategy, pooling: bool) -> anyhow::Result<()>
let multi_memory = feature_found(wast, "multi-memory");
let threads = feature_found(wast, "threads");
let reference_types = !(threads && feature_found(wast, "proposals"));
let relaxed_simd = feature_found(wast, "relaxed-simd");
let use_shared_memory = feature_found_src(&wast_bytes, "shared_memory")
|| feature_found_src(&wast_bytes, "shared)");
@@ -43,6 +44,7 @@ fn run_wast(wast: &str, strategy: Strategy, pooling: bool) -> anyhow::Result<()>
.wasm_threads(threads)
.wasm_memory64(memory64)
.wasm_reference_types(reference_types)
.wasm_relaxed_simd(relaxed_simd)
.cranelift_debug_verifier(true);
cfg.wasm_component_model(feature_found(wast, "component-model"));
@@ -108,11 +110,26 @@ fn run_wast(wast: &str, strategy: Strategy, pooling: bool) -> anyhow::Result<()>
None
};
let store = Store::new(&Engine::new(&cfg)?, ());
let mut wast_context = WastContext::new(store);
let mut engines = vec![(Engine::new(&cfg)?, "default")];
wast_context.register_spectest(use_shared_memory)?;
wast_context.run_buffer(wast.to_str().unwrap(), &wast_bytes)?;
// For tests that use relaxed-simd test both the default engine and the
// guaranteed-deterministic engine to ensure that both the 'native'
// semantics of the instructions plus the canonical semantics work.
if relaxed_simd {
engines.push((
Engine::new(cfg.relaxed_simd_deterministic(true))?,
"deterministic",
));
}
for (engine, desc) in engines {
let store = Store::new(&engine, ());
let mut wast_context = WastContext::new(store);
wast_context.register_spectest(use_shared_memory)?;
wast_context
.run_buffer(wast.to_str().unwrap(), &wast_bytes)
.with_context(|| format!("failed to run spec test with {desc} engine"))?;
}
Ok(())
}

View File

@@ -0,0 +1,26 @@
;; Tests for i16x8.relaxed_q15mulr_s.
(module
(func (export "i16x8.relaxed_q15mulr_s") (param v128 v128) (result v128) (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1)))
(func (export "i16x8.relaxed_q15mulr_s_cmp") (param v128 v128) (result v128)
(i16x8.eq
(i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1))
(i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1))))
)
;; INT16_MIN = -32768
(assert_return (invoke "i16x8.relaxed_q15mulr_s"
(v128.const i16x8 -32768 -32767 32767 0 0 0 0 0)
(v128.const i16x8 -32768 -32768 32767 0 0 0 0 0))
;; overflows, return either INT16_MIN or INT16_MAX
(either (v128.const i16x8 -32768 32767 32766 0 0 0 0 0)
(v128.const i16x8 32767 32767 32766 0 0 0 0 0)))
;; Check that multiple calls to the relaxed instruction with same inputs returns same results.
(assert_return (invoke "i16x8.relaxed_q15mulr_s_cmp"
(v128.const i16x8 -32768 -32767 32767 0 0 0 0 0)
(v128.const i16x8 -32768 -32768 32767 0 0 0 0 0))
;; overflows, return either INT16_MIN or INT16_MAX
(v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1))

View File

@@ -0,0 +1,123 @@
;; Tests for i32x4.relaxed_trunc_f32x4_s, i32x4.relaxed_trunc_f32x4_u, i32x4.relaxed_trunc_f64x2_s_zero, and i32x4.relaxed_trunc_f64x2_u_zero.
(module
(func (export "i32x4.relaxed_trunc_f32x4_s") (param v128) (result v128) (i32x4.relaxed_trunc_f32x4_s (local.get 0)))
(func (export "i32x4.relaxed_trunc_f32x4_u") (param v128) (result v128) (i32x4.relaxed_trunc_f32x4_u (local.get 0)))
(func (export "i32x4.relaxed_trunc_f64x2_s_zero") (param v128) (result v128) (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0)))
(func (export "i32x4.relaxed_trunc_f64x2_u_zero") (param v128) (result v128) (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0)))
(func (export "i32x4.relaxed_trunc_f32x4_s_cmp") (param v128) (result v128)
(i32x4.eq
(i32x4.relaxed_trunc_f32x4_s (local.get 0))
(i32x4.relaxed_trunc_f32x4_s (local.get 0))))
(func (export "i32x4.relaxed_trunc_f32x4_u_cmp") (param v128) (result v128)
(i32x4.eq
(i32x4.relaxed_trunc_f32x4_u (local.get 0))
(i32x4.relaxed_trunc_f32x4_u (local.get 0))))
(func (export "i32x4.relaxed_trunc_f64x2_s_zero_cmp") (param v128) (result v128)
(i32x4.eq
(i32x4.relaxed_trunc_f64x2_s_zero (local.get 0))
(i32x4.relaxed_trunc_f64x2_s_zero (local.get 0))))
(func (export "i32x4.relaxed_trunc_f64x2_u_zero_cmp") (param v128) (result v128)
(i32x4.eq
(i32x4.relaxed_trunc_f64x2_u_zero (local.get 0))
(i32x4.relaxed_trunc_f64x2_u_zero (local.get 0))))
)
;; Test some edge cases around min/max to ensure that the instruction either
;; saturates correctly or returns INT_MIN.
;;
;; Note, though, that INT_MAX itself is not tested. The value for INT_MAX is
;; 2147483647 but that is not representable in a `f32` since it requires 31 bits
;; when a f32 has only 24 bits available. This means that the closest integers
;; to INT_MAX which can be represented are 2147483520 and 2147483648, meaning
;; that the INT_MAX test case cannot be tested.
(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s"
;; INT32_MIN <INT32_MIN >INT32_MAX
(v128.const f32x4 -2147483648.0 -2147483904.0 2.0 2147483904.0))
;; out of range -> saturate or INT32_MIN
(either (v128.const i32x4 -2147483648 -2147483648 2 2147483647)
(v128.const i32x4 -2147483648 -2147483648 2 -2147483648)))
(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s"
(v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444))
;; nans -> 0 or INT32_MIN
(either (v128.const i32x4 0 0 0 0)
(v128.const i32x4 0x80000000 0x80000000 0x80000000 0x80000000)))
(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u"
;; UINT32_MIN UINT32_MIN-1 <UINT32_MAX UINT32_MAX+1
(v128.const f32x4 0 -1.0 4294967040.0 4294967296.0))
;; out of range -> saturate or UINT32_MAX
(either (v128.const i32x4 0 0 4294967040 0xffffffff)
(v128.const i32x4 0 0xffffffff 4294967040 0xffffffff)))
(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u"
(v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444))
;; nans -> 0 or UINT32_MAX
(either (v128.const i32x4 0 0 0 0)
(v128.const i32x4 0xffffffff 0xffffffff 0xffffffff 0xffffffff)))
(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero"
(v128.const f64x2 -2147483904.0 2147483904.0))
;; out of range -> saturate or INT32_MIN
(either (v128.const i32x4 -2147483648 2147483647 0 0)
(v128.const i32x4 -2147483648 -2147483648 0 0)))
(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero"
(v128.const f64x2 nan -nan))
(either (v128.const i32x4 0 0 0 0)
(v128.const i32x4 0x80000000 0x80000000 0 0)))
(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero"
(v128.const f64x2 -1.0 4294967296.0))
;; out of range -> saturate or UINT32_MAX
(either (v128.const i32x4 0 0xffffffff 0 0)
(v128.const i32x4 0xffffffff 0xffffffff 0 0)))
(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero"
(v128.const f64x2 nan -nan))
(either (v128.const i32x4 0 0 0 0)
(v128.const i32x4 0 0 0xffffffff 0xffffffff)))
;; Check that multiple calls to the relaxed instruction with same inputs returns same results.
(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s_cmp"
;; INT32_MIN <INT32_MIN INT32_MAX >INT32_MAX
(v128.const f32x4 -2147483648.0 -2147483904.0 2147483647.0 2147483904.0))
;; out of range -> saturate or INT32_MIN
(v128.const i32x4 -1 -1 -1 -1))
(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s_cmp"
(v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444))
;; nans -> 0 or INT32_MIN
(v128.const i32x4 -1 -1 -1 -1))
(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u_cmp"
;; UINT32_MIN UINT32_MIN-1 <UINT32_MAX UINT32_MAX+1
(v128.const f32x4 0 -1.0 4294967040.0 4294967296.0))
;; out of range -> saturate or UINT32_MAX
(v128.const i32x4 -1 -1 -1 -1))
(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u_cmp"
(v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444))
;; nans -> 0 or UINT32_MAX
(v128.const i32x4 -1 -1 -1 -1))
(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero_cmp"
(v128.const f64x2 -2147483904.0 2147483904.0))
;; out of range -> saturate or INT32_MIN
(v128.const i32x4 -1 -1 -1 -1))
(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero_cmp"
(v128.const f64x2 nan -nan))
(v128.const i32x4 -1 -1 -1 -1))
(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero_cmp"
(v128.const f64x2 -1.0 4294967296.0))
;; out of range -> saturate or UINT32_MAX
(v128.const i32x4 -1 -1 -1 -1))
(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero_cmp"
(v128.const f64x2 nan -nan))
(v128.const i32x4 -1 -1 -1 -1))

View File

@@ -0,0 +1,44 @@
;; Tests for relaxed i8x16 swizzle.
(module
(func (export "i8x16.relaxed_swizzle") (param v128 v128) (result v128) (i8x16.relaxed_swizzle (local.get 0) (local.get 1)))
(func (export "i8x16.relaxed_swizzle_cmp") (param v128 v128) (result v128)
(i8x16.eq
(i8x16.relaxed_swizzle (local.get 0) (local.get 1))
(i8x16.relaxed_swizzle (local.get 0) (local.get 1))))
)
(assert_return (invoke "i8x16.relaxed_swizzle"
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))
(either (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)))
;; out of range, returns 0 or modulo 15 if < 128
(assert_return (invoke "i8x16.relaxed_swizzle"
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
(v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31))
(either (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0)
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)))
;; out of range, returns 0 if >= 128
(assert_return (invoke "i8x16.relaxed_swizzle"
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
(v128.const i8x16 128 129 130 131 132 133 134 135 248 249 250 251 252 253 254 255))
(either (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0)
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)))
;; Check that multiple calls to the relaxed instruction with same inputs returns same results.
;; out of range, returns 0 or modulo 15 if < 128
(assert_return (invoke "i8x16.relaxed_swizzle_cmp"
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
(v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31))
(v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1))
;; out of range, returns 0 if >= 128
(assert_return (invoke "i8x16.relaxed_swizzle_cmp"
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
(v128.const i8x16 128 129 130 131 132 133 134 135 248 249 250 251 252 253 254 255))
(v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1))

View File

@@ -0,0 +1,106 @@
;; Tests for relaxed dot products.
(module
(func (export "i16x8.relaxed_dot_i8x16_i7x16_s") (param v128 v128) (result v128) (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1)))
(func (export "i32x4.relaxed_dot_i8x16_i7x16_add_s") (param v128 v128 v128) (result v128) (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2)))
(func (export "i16x8.relaxed_dot_i8x16_i7x16_s_cmp") (param v128 v128) (result v128)
(i16x8.eq
(i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1))
(i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1))))
(func (export "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp") (param v128 v128 v128) (result v128)
(i16x8.eq
(i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2))
(i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2))))
)
;; Simple values to ensure things are functional.
(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s"
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))
(v128.const i16x8 1 13 41 85 145 221 313 421))
;; Test max and min i8 values;
(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s"
(v128.const i8x16 -128 -128 127 127 0 0 0 0 0 0 0 0 0 0 0 0)
(v128.const i8x16 127 127 127 127 0 0 0 0 0 0 0 0 0 0 0 0))
(v128.const i16x8 -32512 32258 0 0 0 0 0 0))
;; signed * unsigned : -128 * 129 * 2 = -33,024 saturated to -32,768
;; signed * signed : -128 * -127 * 2 = 32,512
;; unsigned * unsigned : 128 * 129 * 2 = 33,024
(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s"
(v128.const i8x16 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0 0 0)
(v128.const i8x16 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0 0 0))
(either
(v128.const i16x8 -32768 0 0 0 0 0 0 0)
(v128.const i16x8 32512 0 0 0 0 0 0 0)
(v128.const i16x8 33024 0 0 0 0 0 0 0)))
;; Simple values to ensure things are functional.
(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s"
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
(v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
(v128.const i32x4 0 1 2 3))
;; intermediate result is [14, 126, 366, 734]
(v128.const i32x4 14 127 368 737))
;; Test max and min i8 values;
(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s"
(v128.const i8x16 -128 -128 -128 -128 127 127 127 127 0 0 0 0 0 0 0 0)
(v128.const i8x16 127 127 127 127 127 127 127 127 0 0 0 0 0 0 0 0)
(v128.const i32x4 1 2 3 4))
;; intermediate result is [-65024, 64516, 0, 0]
(v128.const i32x4 -65023 64518 3 4))
;; signed * unsigned : -128 * 129 * 4 = -66,048 (+ 1) (VPDPBUSD, AVX2-VNNI or AVX512-VNNI)
;; signed * unsigned with intermediate saturation :
;;   (-128 * 129) + (-128 * 129) = -33,024 saturated to -32,768 (PMADDUBSW)
;;   -32,768 + -32,768 = -65,536 (+ 1)
;; signed * signed : -128 * -127 * 4 = 65,024 (+ 1)
;; unsigned * unsigned : 128 * 129 * 4 = 66,048 (+ 1)
;; (a deterministic sketch of the saturating and non-saturating steps follows
;; the cmp tests at the end of this file)
(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s"
(v128.const i8x16 -128 -128 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0)
(v128.const i8x16 -127 -127 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0)
(v128.const i32x4 1 2 3 4))
(either
(v128.const i32x4 -66047 2 3 4)
(v128.const i32x4 -65535 2 3 4)
(v128.const i32x4 65025 2 3 4)
(v128.const i32x4 66049 2 3 4)))
;; Check that multiple calls to the relaxed instruction with the same inputs return the same results.
;; Test max and min i8 values.
(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s_cmp"
(v128.const i8x16 -128 -128 127 127 0 0 0 0 0 0 0 0 0 0 0 0)
(v128.const i8x16 127 127 127 127 0 0 0 0 0 0 0 0 0 0 0 0))
(v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1))
;; Test max and min i8 values.
(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp"
(v128.const i8x16 -128 -128 -128 -128 127 127 127 127 0 0 0 0 0 0 0 0)
(v128.const i8x16 127 127 127 127 127 127 127 127 0 0 0 0 0 0 0 0)
(v128.const i32x4 1 2 3 4))
;; intermediate result is [-65024, 64516, 0, 0]
(v128.const i32x4 -1 -1 -1 -1))
;; signed * unsigned : -128 * 129 * 2 = -33,024 saturated to -32,768
;; signed * signed : -128 * -127 * 2 = 32,512
;; unsigned * unsigned : 128 * 129 * 2 = 33,024
(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s_cmp"
(v128.const i8x16 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0 0 0)
(v128.const i8x16 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0 0 0))
(v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1))
;; signed * unsigned : -128 * 129 * 4 = -66,048 (+ 1) (VPDPBUSD, AVX2-VNNI or AVX512-VNNI)
;; signed * unsigned with intermediate saturation :
;;   (-128 * 129) + (-128 * 129) = -33,024 saturated to -32,768 (PMADDUBSW)
;;   -32,768 + -32,768 = -65,536 (+ 1)
;; signed * signed : -128 * -127 * 4 = 65,024 (+ 1)
;; unsigned * unsigned : 128 * 129 * 4 = 66,048 (+ 1)
(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp"
(v128.const i8x16 -128 -128 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0)
(v128.const i8x16 -127 -127 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0)
(v128.const i32x4 1 2 3 4))
(v128.const i32x4 -1 -1 -1 -1))
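;; A minimal sketch (not part of the upstream test suite) of the saturating and
;; non-saturating steps referenced in the PMADDUBSW comments above, built from
;; deterministic instructions only.
(module
  (func (export "i16x8.sat_add") (param v128 v128) (result v128)
    (i16x8.add_sat_s (local.get 0) (local.get 1)))
  (func (export "i32x4.accumulate") (param v128 v128) (result v128)
    (i32x4.add (i32x4.extadd_pairwise_i16x8_s (local.get 0)) (local.get 1))))
;; -128 * 129 = -16,512 per product; the intermediate i16 add saturates
;; -33,024 to -32,768.
(assert_return (invoke "i16x8.sat_add"
    (v128.const i16x8 -16512 0 0 0 0 0 0 0)
    (v128.const i16x8 -16512 0 0 0 0 0 0 0))
  (v128.const i16x8 -32768 0 0 0 0 0 0 0))
;; The i32 accumulation does not saturate: -32,768 + -32,768 = -65,536, and
;; adding the addend lane (1) gives -65,535, the second (either ...) option above.
(assert_return (invoke "i32x4.accumulate"
    (v128.const i16x8 -32768 -32768 0 0 0 0 0 0)
    (v128.const i32x4 1 2 3 4))
  (v128.const i32x4 -65535 2 3 4))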

View File

@@ -0,0 +1,92 @@
;; Tests for i8x16.relaxed_laneselect, i16x8.relaxed_laneselect, i32x4.relaxed_laneselect, and i64x2.relaxed_laneselect.
(module
(func (export "i8x16.relaxed_laneselect") (param v128 v128 v128) (result v128) (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))
(func (export "i16x8.relaxed_laneselect") (param v128 v128 v128) (result v128) (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))
(func (export "i32x4.relaxed_laneselect") (param v128 v128 v128) (result v128) (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))
(func (export "i64x2.relaxed_laneselect") (param v128 v128 v128) (result v128) (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))
(func (export "i8x16.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128)
(i8x16.eq
(i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))
(i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))))
(func (export "i16x8.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128)
(i16x8.eq
(i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))
(i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))))
(func (export "i32x4.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128)
(i32x4.eq
(i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))
(i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))))
(func (export "i64x2.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128)
(i64x2.eq
(i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))
(i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))))
)
(assert_return (invoke "i8x16.relaxed_laneselect"
(v128.const i8x16 0 1 0x12 0x12 4 5 6 7 8 9 10 11 12 13 14 15)
(v128.const i8x16 16 17 0x34 0x34 20 21 22 23 24 25 26 27 28 29 30 31)
(v128.const i8x16 0xff 0 0xf0 0x0f 0 0 0 0 0 0 0 0 0 0 0 0))
(either (v128.const i8x16 0 17 0x14 0x32 20 21 22 23 24 25 26 27 28 29 30 31)
(v128.const i8x16 0 17 0x12 0x34 20 21 22 23 24 25 26 27 28 29 30 31)))
(assert_return (invoke "i16x8.relaxed_laneselect"
(v128.const i16x8 0 1 0x1234 0x1234 4 5 6 7)
(v128.const i16x8 8 9 0x5678 0x5678 12 13 14 15)
(v128.const i16x8 0xffff 0 0xff00 0x00ff 0 0 0 0))
(either (v128.const i16x8 0 9 0x1278 0x5634 12 13 14 15)
(v128.const i16x8 0 9 0x1234 0x5678 12 13 14 15)))
(assert_return (invoke "i32x4.relaxed_laneselect"
(v128.const i32x4 0 1 0x12341234 0x12341234)
(v128.const i32x4 4 5 0x56785678 0x56785678)
(v128.const i32x4 0xffffffff 0 0xffff0000 0x0000ffff))
(either (v128.const i32x4 0 5 0x12345678 0x56781234)
(v128.const i32x4 0 5 0x12341234 0x56785678)))
(assert_return (invoke "i64x2.relaxed_laneselect"
(v128.const i64x2 0 1)
(v128.const i64x2 2 3)
(v128.const i64x2 0xffffffffffffffff 0))
(either (v128.const i64x2 0 3)
(v128.const i64x2 0 3)))
(assert_return (invoke "i64x2.relaxed_laneselect"
(v128.const i64x2 0x1234123412341234 0x1234123412341234)
(v128.const i64x2 0x5678567856785678 0x5678567856785678)
(v128.const i64x2 0xffffffff00000000 0x00000000ffffffff))
(either (v128.const i64x2 0x1234123456785678 0x5678567812341234)
(v128.const i64x2 0x1234123412341234 0x5678567856785678)))
;; Check that multiple calls to the relaxed instruction with the same inputs return the same results.
(assert_return (invoke "i8x16.relaxed_laneselect_cmp"
(v128.const i8x16 0 1 0x12 0x12 4 5 6 7 8 9 10 11 12 13 14 15)
(v128.const i8x16 16 17 0x34 0x34 20 21 22 23 24 25 26 27 28 29 30 31)
(v128.const i8x16 0xff 0 0xf0 0x0f 0 0 0 0 0 0 0 0 0 0 0 0))
(v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1))
(assert_return (invoke "i16x8.relaxed_laneselect_cmp"
(v128.const i16x8 0 1 0x1234 0x1234 4 5 6 7)
(v128.const i16x8 8 9 0x5678 0x5678 12 13 14 15)
(v128.const i16x8 0xffff 0 0xff00 0x00ff 0 0 0 0))
(v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1))
(assert_return (invoke "i32x4.relaxed_laneselect_cmp"
(v128.const i32x4 0 1 0x12341234 0x12341234)
(v128.const i32x4 4 5 0x56785678 0x56785678)
(v128.const i32x4 0xffffffff 0 0xffff0000 0x0000ffff))
(v128.const i32x4 -1 -1 -1 -1))
(assert_return (invoke "i64x2.relaxed_laneselect_cmp"
(v128.const i64x2 0 1)
(v128.const i64x2 2 3)
(v128.const i64x2 0xffffffffffffffff 0))
(v128.const i64x2 -1 -1))
(assert_return (invoke "i64x2.relaxed_laneselect_cmp"
(v128.const i64x2 0x1234123412341234 0x1234123412341234)
(v128.const i64x2 0x5678567856785678 0x5678567856785678)
(v128.const i64x2 0xffffffff00000000 0x00000000ffffffff))
(v128.const i64x2 -1 -1))
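;; A minimal sketch (not part of the upstream test suite) of the deterministic
;; behavior via `v128.bitselect`: with the mixed masks 0xf0/0x0f it produces the
;; bit-blended first (either ...) option above, whereas an x86 PBLENDVB-style
;; lowering picks whole lanes from each mask lane's top bit (the second option).
(module
  (func (export "i8x16.bitselect") (param v128 v128 v128) (result v128)
    (v128.bitselect (local.get 0) (local.get 1) (local.get 2))))
(assert_return (invoke "i8x16.bitselect"
    (v128.const i8x16 0 1 0x12 0x12 4 5 6 7 8 9 10 11 12 13 14 15)
    (v128.const i8x16 16 17 0x34 0x34 20 21 22 23 24 25 26 27 28 29 30 31)
    (v128.const i8x16 0xff 0 0xf0 0x0f 0 0 0 0 0 0 0 0 0 0 0 0))
  (v128.const i8x16 0 17 0x14 0x32 20 21 22 23 24 25 26 27 28 29 30 31))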

View File

@@ -0,0 +1,190 @@
;; Tests for f32x4.relaxed_madd, f32x4.relaxed_nmadd, f64x2.relaxed_madd, and f64x2.relaxed_nmadd.
(module
(func (export "f32x4.relaxed_madd") (param v128 v128 v128) (result v128) (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2)))
(func (export "f32x4.relaxed_nmadd") (param v128 v128 v128) (result v128) (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)))
(func (export "f64x2.relaxed_nmadd") (param v128 v128 v128) (result v128) (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)))
(func (export "f64x2.relaxed_madd") (param v128 v128 v128) (result v128) (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2)))
(func (export "f32x4.relaxed_madd_cmp") (param v128 v128 v128) (result v128)
(f32x4.eq
(f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2))
(f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2))))
(func (export "f32x4.relaxed_nmadd_cmp") (param v128 v128 v128) (result v128)
(f32x4.eq
(f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))
(f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))))
(func (export "f64x2.relaxed_nmadd_cmp") (param v128 v128 v128) (result v128)
(f64x2.eq
(f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))
(f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))))
(func (export "f64x2.relaxed_madd_cmp") (param v128 v128 v128) (result v128)
(f64x2.eq
(f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2))
(f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2))))
)
;; FLT_MAX == 0x1.fffffep+127
;; FLT_MAX * 2 - FLT_MAX ==
;; FLT_MAX (if fma)
;; inf (if no fma)
;; from https://www.vinc17.net/software/fma-tests.c
(assert_return (invoke "f32x4.relaxed_madd"
(v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 )
(v128.const f32x4 2.0 2.0 2.0 2.0)
(v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127))
(either (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127)
(v128.const f32x4 inf inf inf inf)))
;; Special values for float:
;; x = 0x1.000004p+0 (1 + 2^-22)
;; y = 0x1.0002p+0 (1 + 2^-15)
;; z = -(1.0 + 0x0.0002p+0 + 0x0.000004p+0)
;; = -0x1.000204p+0
;; x.y = 1.0 + 0x0.0002p+0 + 0x0.000004p+0 + 0x1p-37 (round bit)
;; x.y+z = 0 (2 roundings)
;; fma(x, y, z) = 0x1p-37 (i.e. 2^-37)
;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
(assert_return (invoke "f32x4.relaxed_madd"
(v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
(v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0)
(v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
(either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37)
(v128.const f32x4 0 0 0 0)))
;; fnma tests with negated x; the same answers are expected.
(assert_return (invoke "f32x4.relaxed_nmadd"
(v128.const f32x4 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0)
(v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0)
(v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
(either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37)
(v128.const f32x4 0 0 0 0)))
;; fnma tests with negated y; the same answers are expected.
(assert_return (invoke "f32x4.relaxed_nmadd"
(v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
(v128.const f32x4 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0)
(v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
(either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37)
(v128.const f32x4 0 0 0 0)))
;; DBL_MAX == 0x1.fffffffffffffp+1023
;; DBL_MAX * 2 - DBL_MAX ==
;; DBL_MAX (if fma)
;; inf (if no fma)
;; from https://www.vinc17.net/software/fma-tests.c
(assert_return (invoke "f64x2.relaxed_madd"
(v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)
(v128.const f64x2 2.0 2.0)
(v128.const f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023))
(either (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)
(v128.const f64x2 inf inf)))
;; Special values for double:
;; x = 0x1.00000004p+0 (1 + 2^-30)
;; y = 0x1.000002p+0 (1 + 2^-23)
;; z = -(1.0 + 0x0.000002p+0 + 0x0.00000004p+0)
;; = -0x1.00000204p+0
;; x.y = 1.0 + 0x0.000002p+0 + 0x0.00000004p+0 + 0x1p-53 (round bit)
;; x.y+z = 0 (2 roundings)
;; fma(x, y, z) = 0x1p-53
;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
(assert_return (invoke "f64x2.relaxed_madd"
(v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
(v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
(v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
(either (v128.const f64x2 0x1p-53 0x1p-53)
(v128.const f64x2 0 0)))
;; fnma tests with negated x; the same answers are expected.
(assert_return (invoke "f64x2.relaxed_nmadd"
(v128.const f64x2 -0x1.00000004p+0 -0x1.00000004p+0)
(v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
(v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
(either (v128.const f64x2 0x1p-53 0x1p-53)
(v128.const f64x2 0 0)))
;; fnma tests with negated y; the same answers are expected.
(assert_return (invoke "f64x2.relaxed_nmadd"
(v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
(v128.const f64x2 -0x1.000002p+0 -0x1.000002p+0)
(v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
(either (v128.const f64x2 0x1p-53 0x1p-53)
(v128.const f64x2 0 0)))
;; Check that multiple calls to the relaxed instruction with the same inputs return the same results.
;; FLT_MAX == 0x1.fffffep+127
;; FLT_MAX * 2 - FLT_MAX ==
;; FLT_MAX (if fma)
;; inf (if no fma)
;; from https://www.vinc17.net/software/fma-tests.c
(assert_return (invoke "f32x4.relaxed_madd_cmp"
(v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 )
(v128.const f32x4 2.0 2.0 2.0 2.0)
(v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127))
(v128.const i32x4 -1 -1 -1 -1))
;; Special values for float:
;; x = 0x1.000004p+0 (1 + 2^-22)
;; y = 0x1.0002p+0 (1 + 2^-15)
;; z = -(1.0 + 0x0.0002p+0 + 0x0.000004p+0)
;; = -0x1.000204p+0
;; x.y = 1.0 + 0x0.0002p+0 + 0x0.000004p+0 + 0x1p-37 (round bit)
;; x.y+z = 0 (2 roundings)
;; fma(x, y, z) = 0x1p-37 (i.e. 2^-37)
;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
(assert_return (invoke "f32x4.relaxed_madd_cmp"
(v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
(v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0)
(v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
(v128.const i32x4 -1 -1 -1 -1))
;; fnma tests with negated x; the same answers are expected.
(assert_return (invoke "f32x4.relaxed_nmadd_cmp"
(v128.const f32x4 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0)
(v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0)
(v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
(v128.const i32x4 -1 -1 -1 -1))
;; fnma tests with negated y; the same answers are expected.
(assert_return (invoke "f32x4.relaxed_nmadd_cmp"
(v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
(v128.const f32x4 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0)
(v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
(v128.const i32x4 -1 -1 -1 -1))
;; DBL_MAX == 0x1.fffffffffffffp+1023
;; DBL_MAX * 2 - DBL_MAX ==
;; DBL_MAX (if fma)
;; inf (if no fma)
;; from https://www.vinc17.net/software/fma-tests.c
(assert_return (invoke "f64x2.relaxed_madd_cmp"
(v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)
(v128.const f64x2 2.0 2.0)
(v128.const f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023))
(v128.const i64x2 -1 -1))
;; Special values for double:
;; x = 0x1.00000004p+0 (1 + 2^-30)
;; y = 0x1.000002p+0 (1 + 2^-23)
;; z = -(1.0 + 0x0.000002p+0 + 0x0.00000004p+0)
;; = -0x1.00000204p+0
;; x.y = 1.0 + 0x0.000002p+0 + 0x0.00000004p+0 + 0x1p-53 (round bit)
;; x.y+z = 0 (2 roundings)
;; fma(x, y, z) = 0x1p-53
;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
(assert_return (invoke "f64x2.relaxed_madd_cmp"
(v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
(v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
(v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
(v128.const i64x2 -1 -1))
;; fnma tests with negated x; the same answers are expected.
(assert_return (invoke "f64x2.relaxed_nmadd_cmp"
(v128.const f64x2 -0x1.00000004p+0 -0x1.00000004p+0)
(v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
(v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
(v128.const i64x2 -1 -1))
;; fnma tests with negated y; the same answers are expected.
(assert_return (invoke "f64x2.relaxed_nmadd_cmp"
(v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
(v128.const f64x2 -0x1.000002p+0 -0x1.000002p+0)
(v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
(v128.const i64x2 -1 -1))
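;; A minimal sketch (not part of the upstream test suite) of the unfused
;; lowering, spelled as deterministic `mul` + `add`: the FLT_MAX case overflows
;; to inf before the subtraction, and the f64 special-values case rounds twice
;; and collapses to 0, i.e. the second (either ...) options above.
(module
  (func (export "f32x4.unfused_madd") (param v128 v128 v128) (result v128)
    (f32x4.add (f32x4.mul (local.get 0) (local.get 1)) (local.get 2)))
  (func (export "f64x2.unfused_madd") (param v128 v128 v128) (result v128)
    (f64x2.add (f64x2.mul (local.get 0) (local.get 1)) (local.get 2))))
(assert_return (invoke "f32x4.unfused_madd"
    (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127)
    (v128.const f32x4 2.0 2.0 2.0 2.0)
    (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127))
  (v128.const f32x4 inf inf inf inf))
(assert_return (invoke "f64x2.unfused_madd"
    (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
    (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
    (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
  (v128.const f64x2 0 0))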

View File

@@ -0,0 +1,183 @@
;; Tests for f32x4.relaxed_min, f32x4.relaxed_max, f64x2.relaxed_min, and f64x2.relaxed_max.
(module
(func (export "f32x4.relaxed_min") (param v128 v128) (result v128) (f32x4.relaxed_min (local.get 0) (local.get 1)))
(func (export "f32x4.relaxed_max") (param v128 v128) (result v128) (f32x4.relaxed_max (local.get 0) (local.get 1)))
(func (export "f64x2.relaxed_min") (param v128 v128) (result v128) (f64x2.relaxed_min (local.get 0) (local.get 1)))
(func (export "f64x2.relaxed_max") (param v128 v128) (result v128) (f64x2.relaxed_max (local.get 0) (local.get 1)))
(func (export "f32x4.relaxed_min_cmp") (param v128 v128) (result v128)
(i32x4.eq
(f32x4.relaxed_min (local.get 0) (local.get 1))
(f32x4.relaxed_min (local.get 0) (local.get 1))))
(func (export "f32x4.relaxed_max_cmp") (param v128 v128) (result v128)
(i32x4.eq
(f32x4.relaxed_max (local.get 0) (local.get 1))
(f32x4.relaxed_max (local.get 0) (local.get 1))))
(func (export "f64x2.relaxed_min_cmp") (param v128 v128) (result v128)
(i64x2.eq
(f64x2.relaxed_min (local.get 0) (local.get 1))
(f64x2.relaxed_min (local.get 0) (local.get 1))))
(func (export "f64x2.relaxed_max_cmp") (param v128 v128) (result v128)
(i64x2.eq
(f64x2.relaxed_max (local.get 0) (local.get 1))
(f64x2.relaxed_max (local.get 0) (local.get 1))))
)
(assert_return (invoke "f32x4.relaxed_min"
(v128.const f32x4 -nan nan 0 0)
(v128.const f32x4 0 0 -nan nan))
(either (v128.const f32x4 nan:canonical nan:canonical nan:canonical nan:canonical)
(v128.const f32x4 nan:canonical nan:canonical 0 0)
(v128.const f32x4 0 0 nan:canonical nan:canonical)
(v128.const f32x4 0 0 0 0)))
(assert_return (invoke "f32x4.relaxed_min"
(v128.const f32x4 +0.0 -0.0 +0.0 -0.0)
(v128.const f32x4 -0.0 +0.0 +0.0 -0.0))
(either (v128.const f32x4 -0.0 -0.0 +0.0 -0.0)
(v128.const f32x4 +0.0 -0.0 +0.0 -0.0)
(v128.const f32x4 -0.0 +0.0 +0.0 -0.0)
(v128.const f32x4 -0.0 -0.0 +0.0 -0.0)))
(assert_return (invoke "f32x4.relaxed_max"
(v128.const f32x4 -nan nan 0 0)
(v128.const f32x4 0 0 -nan nan))
(either (v128.const f32x4 nan:canonical nan:canonical nan:canonical nan:canonical)
(v128.const f32x4 nan:canonical nan:canonical 0 0)
(v128.const f32x4 0 0 nan:canonical nan:canonical)
(v128.const f32x4 0 0 0 0)))
(assert_return (invoke "f32x4.relaxed_max"
(v128.const f32x4 +0.0 -0.0 +0.0 -0.0)
(v128.const f32x4 -0.0 +0.0 +0.0 -0.0))
(either (v128.const f32x4 +0.0 +0.0 +0.0 -0.0)
(v128.const f32x4 +0.0 -0.0 +0.0 -0.0)
(v128.const f32x4 -0.0 +0.0 +0.0 -0.0)
(v128.const f32x4 -0.0 -0.0 +0.0 -0.0)))
(assert_return (invoke "f64x2.relaxed_min"
(v128.const f64x2 -nan nan)
(v128.const f64x2 0 0))
(either (v128.const f64x2 nan:canonical nan:canonical)
(v128.const f64x2 nan:canonical nan:canonical)
(v128.const f64x2 0 0)
(v128.const f64x2 0 0)))
(assert_return (invoke "f64x2.relaxed_min"
(v128.const f64x2 0 0)
(v128.const f64x2 -nan nan))
(either (v128.const f64x2 nan:canonical nan:canonical)
(v128.const f64x2 0 0)
(v128.const f64x2 nan:canonical nan:canonical)
(v128.const f64x2 0 0)))
(assert_return (invoke "f64x2.relaxed_min"
(v128.const f64x2 +0.0 -0.0)
(v128.const f64x2 -0.0 +0.0))
(either (v128.const f64x2 -0.0 -0.0)
(v128.const f64x2 +0.0 -0.0)
(v128.const f64x2 -0.0 +0.0)
(v128.const f64x2 -0.0 -0.0)))
(assert_return (invoke "f64x2.relaxed_min"
(v128.const f64x2 +0.0 -0.0)
(v128.const f64x2 +0.0 -0.0))
(either (v128.const f64x2 +0.0 -0.0)
(v128.const f64x2 +0.0 -0.0)
(v128.const f64x2 +0.0 -0.0)
(v128.const f64x2 +0.0 -0.0)))
(assert_return (invoke "f64x2.relaxed_max"
(v128.const f64x2 -nan nan)
(v128.const f64x2 0 0))
(either (v128.const f64x2 nan:canonical nan:canonical)
(v128.const f64x2 nan:canonical nan:canonical)
(v128.const f64x2 0 0)
(v128.const f64x2 0 0)))
(assert_return (invoke "f64x2.relaxed_max"
(v128.const f64x2 0 0)
(v128.const f64x2 -nan nan))
(either (v128.const f64x2 nan:canonical nan:canonical)
(v128.const f64x2 0 0)
(v128.const f64x2 nan:canonical nan:canonical)
(v128.const f64x2 0 0)))
(assert_return (invoke "f64x2.relaxed_max"
(v128.const f64x2 +0.0 -0.0)
(v128.const f64x2 -0.0 +0.0))
(either (v128.const f64x2 +0.0 +0.0)
(v128.const f64x2 +0.0 -0.0)
(v128.const f64x2 -0.0 +0.0)
(v128.const f64x2 -0.0 -0.0)))
(assert_return (invoke "f64x2.relaxed_max"
(v128.const f64x2 +0.0 -0.0)
(v128.const f64x2 +0.0 -0.0))
(either (v128.const f64x2 +0.0 -0.0)
(v128.const f64x2 +0.0 -0.0)
(v128.const f64x2 +0.0 -0.0)
(v128.const f64x2 +0.0 -0.0)))
;; Check that multiple calls to the relaxed instruction with the same inputs return the same results.
(assert_return (invoke "f32x4.relaxed_min_cmp"
(v128.const f32x4 -nan nan 0 0)
(v128.const f32x4 0 0 -nan nan))
(v128.const i32x4 -1 -1 -1 -1))
(assert_return (invoke "f32x4.relaxed_min_cmp"
(v128.const f32x4 +0.0 -0.0 +0.0 -0.0)
(v128.const f32x4 -0.0 +0.0 +0.0 -0.0))
(v128.const i32x4 -1 -1 -1 -1))
(assert_return (invoke "f32x4.relaxed_max_cmp"
(v128.const f32x4 -nan nan 0 0)
(v128.const f32x4 0 0 -nan nan))
(v128.const i32x4 -1 -1 -1 -1))
(assert_return (invoke "f32x4.relaxed_max_cmp"
(v128.const f32x4 +0.0 -0.0 +0.0 -0.0)
(v128.const f32x4 -0.0 +0.0 +0.0 -0.0))
(v128.const i32x4 -1 -1 -1 -1))
(assert_return (invoke "f64x2.relaxed_min_cmp"
(v128.const f64x2 -nan nan)
(v128.const f64x2 0 0))
(v128.const i64x2 -1 -1))
(assert_return (invoke "f64x2.relaxed_min_cmp"
(v128.const f64x2 0 0)
(v128.const f64x2 -nan nan))
(v128.const i64x2 -1 -1))
(assert_return (invoke "f64x2.relaxed_min_cmp"
(v128.const f64x2 +0.0 -0.0)
(v128.const f64x2 -0.0 +0.0))
(v128.const i64x2 -1 -1))
(assert_return (invoke "f64x2.relaxed_min_cmp"
(v128.const f64x2 +0.0 -0.0)
(v128.const f64x2 +0.0 -0.0))
(v128.const i64x2 -1 -1))
(assert_return (invoke "f64x2.relaxed_max_cmp"
(v128.const f64x2 -nan nan)
(v128.const f64x2 0 0))
(v128.const i64x2 -1 -1))
(assert_return (invoke "f64x2.relaxed_max_cmp"
(v128.const f64x2 0 0)
(v128.const f64x2 -nan nan))
(v128.const i64x2 -1 -1))
(assert_return (invoke "f64x2.relaxed_max_cmp"
(v128.const f64x2 +0.0 -0.0)
(v128.const f64x2 -0.0 +0.0))
(v128.const i64x2 -1 -1))
(assert_return (invoke "f64x2.relaxed_max_cmp"
(v128.const f64x2 +0.0 -0.0)
(v128.const f64x2 +0.0 -0.0))
(v128.const i64x2 -1 -1))
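;; A minimal sketch (not part of the upstream test suite) of the deterministic
;; `f32x4.min`/`f32x4.max`: NaNs always propagate as canonical NaNs and -0.0
;; orders below +0.0, so the result is always the first (either ...) option above.
(module
  (func (export "f32x4.min") (param v128 v128) (result v128)
    (f32x4.min (local.get 0) (local.get 1)))
  (func (export "f32x4.max") (param v128 v128) (result v128)
    (f32x4.max (local.get 0) (local.get 1))))
(assert_return (invoke "f32x4.min"
    (v128.const f32x4 -nan nan 0 0)
    (v128.const f32x4 0 0 -nan nan))
  (v128.const f32x4 nan:canonical nan:canonical nan:canonical nan:canonical))
(assert_return (invoke "f32x4.min"
    (v128.const f32x4 +0.0 -0.0 +0.0 -0.0)
    (v128.const f32x4 -0.0 +0.0 +0.0 -0.0))
  (v128.const f32x4 -0.0 -0.0 +0.0 -0.0))
(assert_return (invoke "f32x4.max"
    (v128.const f32x4 +0.0 -0.0 +0.0 -0.0)
    (v128.const f32x4 -0.0 +0.0 +0.0 -0.0))
  (v128.const f32x4 +0.0 +0.0 +0.0 -0.0))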