This adds full support for all Cranelift SIMD instructions to the s390x target. Everything is matched fully via ISLE. In addition to adding support for many new instructions, and the lower.isle code to match all SIMD IR patterns, this patch also adds ABI support for vector types.

In particular, we now need to handle the fact that vector registers 8 .. 15 are partially callee-saved, i.e. the high parts of those registers (which correspond to the old floating-point registers) are callee-saved, but the low parts are not. This is the exact same situation that we already have on AArch64, so this patch uses the same solution (the is_included_in_clobbers callback).

The bulk of the changes are platform-specific, but there are a few exceptions:

- Added ISLE extractors for the Immediate and Constant types, to enable matching the vconst and swizzle instructions.
- Added a missing accessor for call_conv to ABISig.
- Fixed endian conversion for vector types in data_value.rs to enable their use in runtests on big-endian platforms.
- Enabled (nearly) all SIMD runtests on s390x. [ Two test cases remain disabled due to vector shift count semantics, see below. ]
- Enabled all Wasmtime SIMD tests on s390x.

There are three minor issues, called out via FIXMEs below, which should be addressed in the future but should not block merging this patch. I've opened the following issues to track them:

- Vector shift count semantics: https://github.com/bytecodealliance/wasmtime/issues/4424
- is_included_in_clobbers vs. link register: https://github.com/bytecodealliance/wasmtime/issues/4425
- gen_constant callback: https://github.com/bytecodealliance/wasmtime/issues/4426

All tests, including all newly enabled SIMD tests, pass on both z14 and z15 architectures.
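To make the partially-callee-saved situation concrete, here is a minimal, self-contained Rust sketch of the rule the register allocator has to respect. This is not Cranelift's actual API; the function and type names below are hypothetical, and the real mechanism is the is_included_in_clobbers callback mentioned above.

```rust
// Hypothetical sketch of the s390x vector-register clobber rule.
// In Cranelift the actual hook is the `is_included_in_clobbers` callback
// on the backend ABI implementation; names here are illustrative only.

/// Class of value held in an s390x vector register, for illustration.
#[derive(Clone, Copy, Debug)]
enum ValueClass {
    /// A 64-bit float living in the high half (the old f8..f15 registers).
    Float,
    /// A full 128-bit vector value.
    Vector,
}

/// Does a call clobber the given value in vector register `num`?
/// Per the s390x ELF ABI, only the high 64 bits of v8..v15 are callee-saved,
/// so a 128-bit vector in those registers does not survive a call even
/// though a scalar float in the same register does.
fn call_clobbers(num: u8, class: ValueClass) -> bool {
    match (num, class) {
        (8..=15, ValueClass::Float) => false, // high half is callee-saved
        (8..=15, ValueClass::Vector) => true, // low half is call-clobbered
        _ => true, // v0..v7 and v16..v31 are fully call-clobbered
    }
}

fn main() {
    assert!(!call_clobbers(8, ValueClass::Float)); // f8 survives calls
    assert!(call_clobbers(8, ValueClass::Vector)); // v8 as a vector does not
    assert!(call_clobbers(16, ValueClass::Vector)); // v16 is always clobbered
}
```

Treating v8..v15 as clobbered for full vector values is conservative but correct, mirroring the AArch64 situation described above.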
test run
target aarch64
; target s390x FIXME: s390x implements modulo semantics for shift counts
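; Note: with modulo semantics the count is reduced mod the lane width, so a
; count of 17 on i16 lanes acts like a count of 1; the shift-by-17 and
; shift-by-33 run tests below would therefore give different results on s390x.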
set enable_simd
target x86_64 skylake

function %bitselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16, v2: i8x16):
    v3 = bitselect v0, v1, v2
    return v3
}
; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]

function %vselect_i32x4(i32x4, i32x4) -> i32x4 {
block0(v1: i32x4, v2: i32x4):
    ; `make_trampoline` still does not know how to convert boolean vector types,
    ; so we load the value directly here.
    v0 = vconst.b32x4 [true true false false]
    v3 = vselect v0, v1, v2
    return v3
}
; Remember that vselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
; run: %vselect_i32x4([1 2 -1 -1], [-1 -1 3 4]) == [1 2 3 4]

; shift left

function %ishl_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
    v2 = ishl v0, v1
    return v2
}
; run: %ishl_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 4) == [0x00 0x10 0x20 0x30 0x40 0x50 0x60 0x70 0x80 0x90 0xa0 0xb0 0xc0 0xd0 0xe0 0xf0]

function %ishl_i16x8(i16x8, i32) -> i16x8 {
block0(v0: i16x8, v1: i32):
    v2 = ishl v0, v1
    return v2
}
; run: %ishl_i16x8([1 2 4 8 16 32 64 128], 17) == [0 0 0 0 0 0 0 0]

function %ishl_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
    v2 = ishl v0, v1
    return v2
}
; run: %ishl_i32x4([1 2 4 8], 1) == [2 4 8 16]

function %ishl_imm_i64x2(i64x2) -> i64x2 {
block0(v0: i64x2):
    v2 = ishl_imm v0, 1
    return v2
}
; run: %ishl_imm_i64x2([1 0]) == [2 0]

; shift right (logical)

function %ushr_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
    v2 = ushr v0, v1
    return v2
}
; run: %ushr_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 1) == [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]

function %ushr_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
    v2 = ushr v0, v1
    return v2
}
; run: %ushr_i32x4([1 2 4 8], 33) == [0 0 0 0]

function %ushr_i64x2(i64x2, i32) -> i64x2 {
block0(v0: i64x2, v1: i32):
    v2 = ushr v0, v1
    return v2
}
; run: %ushr_i64x2([1 2], 1) == [0 1]

; shift right (arithmetic)

function %sshr_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
    v2 = sshr v0, v1
    return v2
}
; run: %sshr_i8x16([0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1], 1) == [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8]

function %sshr_i16x8(i16x8, i32) -> i16x8 {
block0(v0: i16x8, v1: i32):
    v2 = sshr v0, v1
    return v2
}
; note: because of the shifted-in sign-bit, lane 0 remains -1 == 0xffff, whereas lane 4 has been shifted to -8 == 0xfff8
; run: %sshr_i16x8([-1 2 4 8 -16 32 64 128], 1) == [-1 1 2 4 -8 16 32 64]

function %sshr_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
    v2 = sshr v0, v1
    return v2
}
; note: shifting in the sign-bit repeatedly in lane 3 fills the result with 1s (-1 == 0xffff_ffff)
; run: %sshr_i32x4([1 2 4 -8], 33) == [0 0 0 0xffff_ffff]

function %sshr_i64x2(i64x2, i32) -> i64x2 {
block0(v0: i64x2, v1: i32):
    v2 = sshr v0, v1
    return v2
}
; run: %sshr_i64x2([1 -1], 0) == [1 -1]
; run: %sshr_i64x2([1 -1], 1) == [0 -1] ; note the -1 shift result
; run: %sshr_i64x2([2 -2], 1) == [1 -1]
; run: %sshr_i64x2([0x80000000_00000000 0x7FFFFFFF_FFFFFFFF], 63) == [0xFFFFFFFF_FFFFFFFF 0]

function %sshr_imm_i32x4(i32x4) -> i32x4 {
block0(v0: i32x4):
    v1 = sshr_imm v0, 1
    return v1
}
; run: %sshr_imm_i32x4([1 2 4 -8]) == [0 1 2 -4]

function %sshr_imm_i16x8(i16x8) -> i16x8 {
block0(v0: i16x8):
    v1 = sshr_imm v0, 1
    return v1
}
; run: %sshr_imm_i16x8([1 2 4 -8 0 0 0 0]) == [0 1 2 -4 0 0 0 0]