Files
wasmtime/cranelift/filetests/filetests/runtests/simd-lane-access.clif
Ulrich Weigand 638dc4e0b3 s390x: Implement full SIMD support (#4427)
This adds full support for all Cranelift SIMD instructions
to the s390x target.  Everything is matched fully via ISLE.

In addition to adding support for many new instructions,
and the lower.isle code to match all SIMD IR patterns,
this patch also adds ABI support for vector types.
In particular, we now need to handle the fact that
vector registers 8 .. 15 are partially callee-saved,
i.e. the high parts of those registers (which correspond
to the old floating-poing registers) are callee-saved,
but the low parts are not.  This is the exact same situation
that we already have on AArch64, and so this patch uses the
same solution (the is_included_in_clobbers callback).

The bulk of the changes are platform-specific, but there are
a few exceptions:

- Added ISLE extractors for the Immediate and Constant types,
  to enable matching the vconst and swizzle instructions.

- Added a missing accessor for call_conv to ABISig.

- Fixed endian conversion for vector types in data_value.rs
  to enable their use in runtests on the big-endian platforms.

- Enabled (nearly) all SIMD runtests on s390x.  [ Two test cases
  remain disabled due to vector shift count semantics, see below. ]

- Enabled all Wasmtime SIMD tests on s390x.

There are three minor issues, called out via FIXMEs below,
which should be addressed in the future, but should not be
blockers to getting this patch merged.  I've opened the
following issues to track them:

- Vector shift count semantics
  https://github.com/bytecodealliance/wasmtime/issues/4424

- is_included_in_clobbers vs. link register
  https://github.com/bytecodealliance/wasmtime/issues/4425

- gen_constant callback
  https://github.com/bytecodealliance/wasmtime/issues/4426

All tests, including all newly enabled SIMD tests, pass
on both z14 and z15 architectures.
2022-07-18 14:00:48 -07:00

212 lines
5.5 KiB
Plaintext

test run
target aarch64
target s390x
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
;; shuffle
function %shuffle_different_ssa_values() -> i8x16 {
block0:
v0 = vconst.i8x16 0x00
v1 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 31] ; use the first lane of v0 throughout except use the last lane of v1
return v2
}
; run: %shuffle_different_ssa_values() == [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
function %shuffle_same_ssa_value() -> i8x16 {
block0:
v0 = vconst.i8x16 0x01000000_00000000_00000000_00000000 ; note where lane 15 is when written with hexadecimal syntax
v1 = shuffle v0, v0, 0x0f0f0f0f_0f0f0f0f_0f0f0f0f_0f0f0f0f ; use the last lane of v0 to fill all lanes
return v1
}
; run: %shuffle_same_ssa_value() == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
function %shuffle_i32x4_in_same_place() -> i32x4 {
block0:
v1 = vconst.i32x4 [0 1 2 3]
v2 = raw_bitcast.i8x16 v1 ; we have to cast because shuffle is type-limited to Tx16
; keep each lane in place from the first vector
v3 = shuffle v2, v2, [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v4 = raw_bitcast.i32x4 v3
return v4
}
; run: %shuffle_in_same_place() == [0 1 2 3]
function %shuffle_b32x4_to_all_true() -> i32x4 {
block0:
v1 = vconst.b32x4 [true false true false]
v2 = raw_bitcast.b8x16 v1 ; we have to cast because shuffle is type-limited to Tx16
; pair up the true values to make the entire vector true
v3 = shuffle v2, v2, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11]
v4 = raw_bitcast.i32x4 v3 ; TODO store.b32x4 is unavailable; see https://github.com/bytecodealliance/wasmtime/issues/2237
return v4
}
; run: %shuffle_b32x4_to_all_true() == [0xffffffff 0xffffffff 0xffffffff 0xffffffff]
;; swizzle
function %swizzle(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = swizzle.i8x16 v0, v1
return v2
}
; reverse the lanes, with over-large index 42 using lane 0
; run: %swizzle([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 42]) == [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
; 250 should overflow but saturate so that the MSB is set (PSHUFB uses this to shuffle from lane 0)
; run: %swizzle([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], [16 250 0 0 0 0 0 0 0 0 0 0 0 0 0 0]) == [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
;; insertlane
function %insertlane_i8x16_first(i8x16, i8) -> i8x16 {
block0(v1: i8x16, v2: i8):
v3 = insertlane v1, v2, 0
return v3
}
; run: %insertlane_i8x16_first([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], 0xff) == [0xff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
function %insertlane_f32x4_second(f32x4, f32) -> f32x4 {
block0(v1: f32x4, v2: f32):
v3 = insertlane v1, v2, 1
return v3
}
; run: %insertlane_f32x4_second([0.0 0.0 0.0 0.0], 0x42.42) == [0.0 0x42.42 0.0 0.0]
function %insertlane_f64x2_first(f64x2, f64) -> f64x2 {
block0(v1: f64x2, v2: f64):
v3 = insertlane v1, v2, 0
return v3
}
; run: %insertlane_f64x2_first([0.0 0.0], 0x42.42) == [0x42.42 0.0]
function %insertlane_f64x2_second(f64x2, f64) -> f64x2 {
block0(v1: f64x2, v2: f64):
v3 = insertlane v1, v2, 1
return v3
}
; run: %insertlane_f64x2_second([0.0 0.0], 0x42.42) == [0.0 0x42.42]
;; extractlane
function %extractlane_b8x16() -> i8 {
block0:
v1 = vconst.b8x16 [false false false false false false false false false false true false false
false false false]
v2 = extractlane v1, 10
v3 = raw_bitcast.i8 v2
return v3
}
; run: %extractlane_b8x16_last() == 0xff
function %extractlane_i16x8_second(i16x8) -> i16 {
block0(v0: i16x8):
v1 = extractlane v0, 1
return v1
}
; run: %extractlane_i16x8_second(0x00080007000600050004000300020001) == 2
function %extractlane_f32x4_last(f32x4) -> f32 {
block0(v0: f32x4):
v1 = extractlane v0, 3
return v1
}
; run: %extractlane_f32x4_last([0x00.00 0x00.00 0x00.00 0x42.42]) == 0x42.42
function %extractlane_i32_with_vector_reuse() -> b1 {
block0:
v0 = iconst.i32 42
v1 = iconst.i32 99
v2 = vconst.i32x4 [42 42 42 42]
v3 = insertlane v2, v1, 2
v4 = extractlane v3, 3
v5 = icmp eq v4, v0
v6 = extractlane v3, 2
v7 = icmp eq v6, v1
v8 = band v5, v7
return v8
}
; run
function %extractlane_f32_with_vector_reuse() -> b1 {
block0:
v0 = f32const 0x42.42
v1 = f32const 0x99.99
v2 = vconst.f32x4 [0x42.42 0x42.42 0x42.42 0x42.42]
v3 = insertlane v2, v1, 2
v4 = extractlane v3, 3
v5 = fcmp eq v4, v0
v6 = extractlane v3, 2
v7 = fcmp eq v6, v1
v8 = band v5, v7
return v8
}
; run
;; splat
function %splat_i64x2() -> b1 {
block0:
v0 = iconst.i64 -1
v1 = splat.i64x2 v0
v2 = vconst.i64x2 [-1 -1]
v3 = icmp eq v1, v2
v8 = vall_true v3
return v8
}
; run
function %splat_i8(i8) -> i8x16 {
block0(v0: i8):
v1 = splat.i8x16 v0
return v1
}
; run: %splat_i8(0xff) == [0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff]
function %splat_i32(i32) -> i32x4 {
block0(v0: i32):
v1 = splat.i32x4 v0
return v1
}
; run: %splat_i32(42) == [42 42 42 42]
function %splat_f64(f64) -> f64x2 {
block0(v0: f64):
v1 = splat.f64x2 v0
return v1
}
; run: %splat_f64(-0x1.1) == [-0x1.1 -0x1.1]
; narrow
function %snarrow(i32x4, i32x4) -> i16x8 {
block0(v0: i32x4, v1: i32x4):
v2 = snarrow v0, v1
return v2
}
; run: %snarrow([0 1 -1 0x0001ffff], [4 5 -6 0xffffffff]) == [0 1 -1 0x7fff 4 5 -6 0xffff]
function %unarrow(i32x4, i32x4) -> i16x8 {
block0(v0: i32x4, v1: i32x4):
v2 = unarrow v0, v1
return v2
}
; run: %unarrow([0 1 -1 0x0001ffff], [4 5 -6 0xffffffff]) == [0 1 0 0xffff 4 5 0 0]