* Initial support for the Relaxed SIMD proposal This commit adds initial scaffolding and support for the Relaxed SIMD proposal for WebAssembly. Codegen is supported on the x64 and AArch64 backends at this time. The purpose of this commit is to get all the boilerplate out of the way in terms of plumbing through a new feature, adding tests, etc. The tests are copied from the upstream repository at this time while the WebAssembly/testsuite repository hasn't been updated. A summary of the changes made in this commit: * Lowerings for all relaxed simd opcodes have been added, currently all exhibiting deterministic behavior. This means that few lowerings are optimal on the x86 backend, but on the AArch64 backend, for example, all lowerings should be optimal. * Support is added to codegen to, eventually, conditionally generate different code based on input codegen flags. This is intended to enable codegen to use more efficient instructions on x86 by default, for example, while still allowing embedders to force architecture-independent semantics and behavior. One good example of this is the `f32x4.relaxed_madd` instruction which when deterministic forces the `fma` instruction, but otherwise if the backend doesn't have support for `fma` then intermediate operations are performed instead. * Lowerings of `iadd_pairwise` for `i16x8` and `i32x4` were added to the x86 backend as they're now exercised by the deterministic lowerings of relaxed simd instructions. * Sample codegen tests were added for x86 and aarch64 for some relaxed simd instructions. * Wasmtime embedder support for the relaxed-simd proposal and forcing determinism have been added to `Config` and the CLI. * Support has been added to the `*.wast` runtime execution for the `(either ...)` matcher used in the relaxed-simd proposal. * Tests for relaxed-simd are run both with a default `Engine` as well as a "force deterministic" `Engine` to test both configurations. 
* All tests from the upstream repository were copied into Wasmtime. These tests should be deleted when WebAssembly/testsuite is updated. * x64: Add x86-specific lowerings for relaxed simd This commit builds on the prior commit and adds an array of `x86_*` instructions to Cranelift which have semantics that match their corresponding x86 equivalents. Translation for relaxed simd is then additionally updated to conditionally generate different CLIF for relaxed simd instructions depending on whether the target is x86 or not. This means that for AArch64 no changes are made but for x86 most relaxed instructions now lower to some x86-equivalent with slightly different semantics than the "deterministic" lowering. * Add libcall support for fma to Wasmtime This will be required to implement the `f32x4.relaxed_madd` instruction (and others) when an x86 host doesn't specify the `has_fma` feature. * Ignore relaxed-simd tests on s390x and riscv64 * Enable relaxed-simd tests on s390x * Update cranelift/codegen/meta/src/shared/instructions.rs Co-authored-by: Andrew Brown <andrew.brown@intel.com> * Add a FIXME from review * Add notes about deterministic semantics * Don't default `has_native_fma` to `true` * Review comments and rebase fixes --------- Co-authored-by: Andrew Brown <andrew.brown@intel.com>
191 lines
11 KiB
Plaintext
191 lines
11 KiB
Plaintext
;; Tests for f32x4.relaxed_madd, f32x4.relaxed_nmadd, f64x2.relaxed_madd, and f64x2.relaxed_nmadd.
|
|
|
|
(module
  ;; Direct wrappers: each export applies one relaxed multiply-add
  ;; instruction to its three v128 parameters and returns the raw result.
  (func (export "f32x4.relaxed_madd") (param v128 v128 v128) (result v128)
    (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2)))
  (func (export "f32x4.relaxed_nmadd") (param v128 v128 v128) (result v128)
    (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)))
  (func (export "f64x2.relaxed_nmadd") (param v128 v128 v128) (result v128)
    (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)))
  (func (export "f64x2.relaxed_madd") (param v128 v128 v128) (result v128)
    (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2)))

  ;; Consistency wrappers: each export evaluates the same relaxed instruction
  ;; twice on identical operands and compares the two results lane-wise with
  ;; `eq`. A lane is all-ones when both evaluations produced equal values.
  (func (export "f32x4.relaxed_madd_cmp") (param v128 v128 v128) (result v128)
    (f32x4.eq
      (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2))
      (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2))))
  (func (export "f32x4.relaxed_nmadd_cmp") (param v128 v128 v128) (result v128)
    (f32x4.eq
      (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))
      (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))))
  (func (export "f64x2.relaxed_nmadd_cmp") (param v128 v128 v128) (result v128)
    (f64x2.eq
      (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))
      (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))))
  (func (export "f64x2.relaxed_madd_cmp") (param v128 v128 v128) (result v128)
    (f64x2.eq
      (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2))
      (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2))))
)
|
|
|
|
|
|
;; FLT_MAX == 0x1.fffffep+127
;; FLT_MAX * 2 - FLT_MAX ==
;;   FLT_MAX (if fma: the product is not rounded before the addition)
;;   inf     (if no fma: FLT_MAX * 2 overflows to inf before the subtraction)
;; from https://www.vinc17.net/software/fma-tests.c
(assert_return (invoke "f32x4.relaxed_madd"
                       (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127)
                       (v128.const f32x4 2.0 2.0 2.0 2.0)
                       (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127))
               (either (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127)
                       (v128.const f32x4 inf inf inf inf)))
|
|
|
|
;; Special values for float:
;; x            = 0x1.000004p+0 (1 + 2^-22)
;; y            = 0x1.0002p+0   (1 + 2^-15)
;; z            = -(1.0 + 0x0.0002p+0 + 0x0.000004p+0)
;;              = -0x1.000204p+0
;; x.y          = 1.0 + 0x0.0002p+0 + 0x0.000004p+0 + 0x1p-37 (round bit)
;; x.y+z        = 0 (2 roundings)
;; fma(x, y, z) = 0x1p-37 (2^-37)
;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
(assert_return (invoke "f32x4.relaxed_madd"
                       (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
                       (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0)
                       (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
               (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37)
                       (v128.const f32x4 0 0 0 0)))
|
|
;; fnma tests with negated x: relaxed_nmadd computes -(x.y) + z, so negating x
;; yields the same value as the madd case; the same answers are expected.
(assert_return (invoke "f32x4.relaxed_nmadd"
                       (v128.const f32x4 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0)
                       (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0)
                       (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
               (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37)
                       (v128.const f32x4 0 0 0 0)))
|
|
;; fnma tests with negated y: relaxed_nmadd computes -(x.y) + z, so negating y
;; yields the same value as the madd case; the same answers are expected.
(assert_return (invoke "f32x4.relaxed_nmadd"
                       (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
                       (v128.const f32x4 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0)
                       (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
               (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37)
                       (v128.const f32x4 0 0 0 0)))
|
|
|
|
;; DBL_MAX == 0x1.fffffffffffffp+1023
;; DBL_MAX * 2 - DBL_MAX ==
;;   DBL_MAX (if fma: the product is not rounded before the addition)
;;   inf     (if no fma: DBL_MAX * 2 overflows to inf before the subtraction)
;; from https://www.vinc17.net/software/fma-tests.c
(assert_return (invoke "f64x2.relaxed_madd"
                       (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)
                       (v128.const f64x2 2.0 2.0)
                       (v128.const f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023))
               (either (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)
                       (v128.const f64x2 inf inf)))
|
|
|
|
;; Special values for double:
;; x            = 0x1.00000004p+0 (1 + 2^-30)
;; y            = 0x1.000002p+0   (1 + 2^-23)
;; z            = -(1.0 + 0x0.000002p+0 + 0x0.00000004p+0)
;;              = -0x1.00000204p+0
;; x.y          = 1.0 + 0x0.000002p+0 + 0x0.00000004p+0 + 0x1p-53 (round bit)
;; x.y+z        = 0 (2 roundings)
;; fma(x, y, z) = 0x1p-53
;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
(assert_return (invoke "f64x2.relaxed_madd"
                       (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
                       (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
               (either (v128.const f64x2 0x1p-53 0x1p-53)
                       (v128.const f64x2 0 0)))
|
|
;; fnma tests with negated x: relaxed_nmadd computes -(x.y) + z, so negating x
;; yields the same value as the madd case; the same answers are expected.
(assert_return (invoke "f64x2.relaxed_nmadd"
                       (v128.const f64x2 -0x1.00000004p+0 -0x1.00000004p+0)
                       (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
               (either (v128.const f64x2 0x1p-53 0x1p-53)
                       (v128.const f64x2 0 0)))
|
|
;; fnma tests with negated y: relaxed_nmadd computes -(x.y) + z, so negating y
;; yields the same value as the madd case; the same answers are expected.
(assert_return (invoke "f64x2.relaxed_nmadd"
                       (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
                       (v128.const f64x2 -0x1.000002p+0 -0x1.000002p+0)
                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
               (either (v128.const f64x2 0x1p-53 0x1p-53)
                       (v128.const f64x2 0 0)))
|
|
|
|
;; Check that multiple calls to the same relaxed instruction with the same inputs return the same results.
|
|
|
|
;; FLT_MAX == 0x1.fffffep+127
;; FLT_MAX * 2 - FLT_MAX ==
;;   FLT_MAX (if fma: the product is not rounded before the addition)
;;   inf     (if no fma: FLT_MAX * 2 overflows to inf before the subtraction)
;; from https://www.vinc17.net/software/fma-tests.c
;; Either outcome is acceptable; both evaluations must agree, so every lane of
;; the comparison result is all-ones.
(assert_return (invoke "f32x4.relaxed_madd_cmp"
                       (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127)
                       (v128.const f32x4 2.0 2.0 2.0 2.0)
                       (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127))
               (v128.const i32x4 -1 -1 -1 -1))
|
|
|
|
;; Special values for float:
;; x            = 0x1.000004p+0 (1 + 2^-22)
;; y            = 0x1.0002p+0   (1 + 2^-15)
;; z            = -(1.0 + 0x0.0002p+0 + 0x0.000004p+0)
;;              = -0x1.000204p+0
;; x.y          = 1.0 + 0x0.0002p+0 + 0x0.000004p+0 + 0x1p-37 (round bit)
;; x.y+z        = 0 (2 roundings)
;; fma(x, y, z) = 0x1p-37 (2^-37)
;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
;; Either outcome is acceptable; both evaluations must agree, so every lane of
;; the comparison result is all-ones.
(assert_return (invoke "f32x4.relaxed_madd_cmp"
                       (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
                       (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0)
                       (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
               (v128.const i32x4 -1 -1 -1 -1))
|
|
;; fnma consistency test with negated x: two evaluations on identical inputs
;; must agree, so every lane of the comparison result is all-ones.
(assert_return (invoke "f32x4.relaxed_nmadd_cmp"
                       (v128.const f32x4 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0)
                       (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0)
                       (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
               (v128.const i32x4 -1 -1 -1 -1))
|
|
;; fnma consistency test with negated y: two evaluations on identical inputs
;; must agree, so every lane of the comparison result is all-ones.
(assert_return (invoke "f32x4.relaxed_nmadd_cmp"
                       (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
                       (v128.const f32x4 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0)
                       (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
               (v128.const i32x4 -1 -1 -1 -1))
|
|
|
|
;; DBL_MAX == 0x1.fffffffffffffp+1023
;; DBL_MAX * 2 - DBL_MAX ==
;;   DBL_MAX (if fma: the product is not rounded before the addition)
;;   inf     (if no fma: DBL_MAX * 2 overflows to inf before the subtraction)
;; from https://www.vinc17.net/software/fma-tests.c
;; Either outcome is acceptable; both evaluations must agree, so every lane of
;; the comparison result is all-ones.
(assert_return (invoke "f64x2.relaxed_madd_cmp"
                       (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)
                       (v128.const f64x2 2.0 2.0)
                       (v128.const f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023))
               (v128.const i64x2 -1 -1))
|
|
|
|
;; Special values for double:
;; x            = 0x1.00000004p+0 (1 + 2^-30)
;; y            = 0x1.000002p+0   (1 + 2^-23)
;; z            = -(1.0 + 0x0.000002p+0 + 0x0.00000004p+0)
;;              = -0x1.00000204p+0
;; x.y          = 1.0 + 0x0.000002p+0 + 0x0.00000004p+0 + 0x1p-53 (round bit)
;; x.y+z        = 0 (2 roundings)
;; fma(x, y, z) = 0x1p-53
;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
;; Either outcome is acceptable; both evaluations must agree, so every lane of
;; the comparison result is all-ones.
(assert_return (invoke "f64x2.relaxed_madd_cmp"
                       (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
                       (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
               (v128.const i64x2 -1 -1))
|
|
;; fnma consistency test with negated x: two evaluations on identical inputs
;; must agree, so every lane of the comparison result is all-ones.
(assert_return (invoke "f64x2.relaxed_nmadd_cmp"
                       (v128.const f64x2 -0x1.00000004p+0 -0x1.00000004p+0)
                       (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
               (v128.const i64x2 -1 -1))
|
|
;; fnma consistency test with negated y: two evaluations on identical inputs
;; must agree, so every lane of the comparison result is all-ones.
(assert_return (invoke "f64x2.relaxed_nmadd_cmp"
                       (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
                       (v128.const f64x2 -0x1.000002p+0 -0x1.000002p+0)
                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
               (v128.const i64x2 -1 -1))
|