This implements the s390x back-end portion of the solution for https://github.com/bytecodealliance/wasmtime/issues/4566.

We now support both big- and little-endian vector lane order in code generation. The order used for a function is determined by the function's ABI: if it uses a Wasmtime ABI, it will use little-endian lane order, and big-endian lane order otherwise. (This ensures that all raw_bitcast instructions generated by both wasmtime and other cranelift frontends can always be implemented as a no-op.)

Lane order affects the implementation of a number of operations:
- Vector immediates
- Vector memory load / store (in big- and little-endian variants)
- Operations explicitly using lane numbers (insertlane, extractlane, shuffle, swizzle)
- Operations implicitly using lane numbers (iadd_pairwise, narrow/widen, promote/demote, fcvt_low, vhigh_bits)

In addition, when calling a function using a different lane order, we need to lane-swap all vector values passed or returned in registers.

A small number of changes to common code were also needed:
- Ensure we always select a Wasmtime calling convention on s390x in crates/cranelift (func_signature).
- Fix vector immediates for filetests/runtests. In PR #4427, I attempted to fix this by byte-swapping the V128 value, but with the new scheme, we'd instead need to perform a per-lane byte swap. Since we do not know the actual type in write_to_slice and read_from_slice, this isn't easily possible. Revert this part of PR #4427 again, and instead just mark the memory buffer as little-endian when emitting the trampoline; the back-end will then emit correct code to load the constant.
- Change a runtest in simd-bitselect-to-vselect.clif to no longer make little-endian lane order assumptions.
- Remove runtests in simd-swizzle.clif that make little-endian lane order assumptions by relying on implicit type conversion when using a non-i16x8 swizzle result type (this feature should probably be removed anyway).
Tested with both wasmtime and cg_clif.
214 lines
3.7 KiB
Plaintext
214 lines
3.7 KiB
Plaintext
test compile precise-output
|
|
target s390x
|
|
|
|
;; All-zero i64x2 constant. The expected output that follows this function
;; materializes it with a single `vgbm %v24, 0` (no literal-pool load).
function %vconst_i64x2_zero() -> i64x2 {
|
|
block0:
|
|
v1 = vconst.i64x2 [0 0]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; vgbm %v24, 0
|
|
; br %r14
|
|
|
|
;; Both lanes hold 32767. The expected output uses the immediate-replicate
;; form `vrepig %v24, 32767`; compare %vconst_i64x2_splat3, where 32768 no
;; longer gets an immediate form.
function %vconst_i64x2_splat1() -> i64x2 {
|
|
block0:
|
|
v1 = vconst.i64x2 [32767 32767]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; vrepig %v24, 32767
|
|
; br %r14
|
|
|
|
;; Both lanes hold -32768. The expected output still uses the
;; immediate-replicate form `vrepig %v24, -32768`; compare
;; %vconst_i64x2_splat4, where -32769 falls back to a literal load.
function %vconst_i64x2_splat2() -> i64x2 {
|
|
block0:
|
|
v1 = vconst.i64x2 [-32768 -32768]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; vrepig %v24, -32768
|
|
; br %r14
|
|
|
|
;; Both lanes hold 32768, one past the value tested in %vconst_i64x2_splat1.
;; The expected output embeds a 64-bit literal (0x0000000000008000) after a
;; `bras` and replicates it into both lanes with `vlrepg`.
function %vconst_i64x2_splat3() -> i64x2 {
|
|
block0:
|
|
v1 = vconst.i64x2 [32768 32768]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; bras %r1, 12 ; data.u64 0x0000000000008000 ; vlrepg %v24, 0(%r1)
|
|
; br %r14
|
|
|
|
;; Both lanes hold -32769, one below the value tested in
;; %vconst_i64x2_splat2. The expected output embeds a 64-bit literal
;; (0xffffffffffff7fff) after a `bras` and replicates it with `vlrepg`.
function %vconst_i64x2_splat4() -> i64x2 {
|
|
block0:
|
|
v1 = vconst.i64x2 [-32769 -32769]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; bras %r1, 12 ; data.u64 0xffffffffffff7fff ; vlrepg %v24, 0(%r1)
|
|
; br %r14
|
|
|
|
;; Lanes differ ([1 2]), so the expected output loads a full 128-bit literal
;; with `vl`. In the recorded data.u128 the value 1 (lane 0) occupies the
;; most-significant 64 bits and 2 (lane 1) the least-significant.
;; NOTE(review): presumably this reflects the big-endian lane order used for
;; non-Wasmtime ABIs -- confirm against the back-end.
function %vconst_i64x2_mixed() -> i64x2 {
|
|
block0:
|
|
v1 = vconst.i64x2 [1 2]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; bras %r1, 20 ; data.u128 0x00000000000000010000000000000002 ; vl %v24, 0(%r1)
|
|
; br %r14
|
|
|
|
;; All-zero i32x4 constant. The expected output is the same single
;; `vgbm %v24, 0` recorded for the other all-zero cases in this file.
function %vconst_i32x4_zero() -> i32x4 {
|
|
block0:
|
|
v1 = vconst.i32x4 [0 0 0 0]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; vgbm %v24, 0
|
|
; br %r14
|
|
|
|
;; All four lanes hold 32767. The expected output uses the 32-bit
;; immediate-replicate form `vrepif %v24, 32767`; compare
;; %vconst_i32x4_splat3 for the 32768 case.
function %vconst_i32x4_splat1() -> i32x4 {
|
|
block0:
|
|
v1 = vconst.i32x4 [32767 32767 32767 32767]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; vrepif %v24, 32767
|
|
; br %r14
|
|
|
|
;; All four lanes hold -32768. The expected output uses
;; `vrepif %v24, -32768`; compare %vconst_i32x4_splat4 for the -32769 case.
function %vconst_i32x4_splat2() -> i32x4 {
|
|
block0:
|
|
v1 = vconst.i32x4 [-32768 -32768 -32768 -32768]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; vrepif %v24, -32768
|
|
; br %r14
|
|
|
|
;; All four lanes hold 32768, one past the value in %vconst_i32x4_splat1.
;; The expected output embeds a 32-bit literal (0x00008000) after a `bras`
;; and replicates it into every lane with `vlrepf`.
function %vconst_i32x4_splat3() -> i32x4 {
|
|
block0:
|
|
v1 = vconst.i32x4 [32768 32768 32768 32768]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; bras %r1, 8 ; data.u32 0x00008000 ; vlrepf %v24, 0(%r1)
|
|
; br %r14
|
|
|
|
;; All four lanes hold -32769, one below the value in %vconst_i32x4_splat2.
;; The expected output embeds a 32-bit literal (0xffff7fff) after a `bras`
;; and replicates it with `vlrepf`.
function %vconst_i32x4_splat4() -> i32x4 {
|
|
block0:
|
|
v1 = vconst.i32x4 [-32769 -32769 -32769 -32769]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; bras %r1, 8 ; data.u32 0xffff7fff ; vlrepf %v24, 0(%r1)
|
|
; br %r14
|
|
|
|
;; [1 2 1 2] is not a 32-bit splat, but the pattern repeats every 64 bits.
;; The expected output takes advantage of that: it embeds the 64-bit literal
;; 0x0000000100000002 and replicates it with `vlrepg` instead of loading a
;; full 128-bit constant.
function %vconst_i32x4_splat_i64() -> i32x4 {
|
|
block0:
|
|
v1 = vconst.i32x4 [1 2 1 2]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; bras %r1, 12 ; data.u64 0x0000000100000002 ; vlrepg %v24, 0(%r1)
|
|
; br %r14
|
|
|
|
;; Four distinct lanes ([1 2 3 4]) with no repeating sub-pattern. The
;; expected output loads a full 128-bit literal with `vl`; in the recorded
;; data.u128 the lanes appear in order 1, 2, 3, 4 from most- to
;; least-significant.
function %vconst_i32x4_mixed() -> i32x4 {
|
|
block0:
|
|
v1 = vconst.i32x4 [1 2 3 4]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; bras %r1, 20 ; data.u128 0x00000001000000020000000300000004 ; vl %v24, 0(%r1)
|
|
; br %r14
|
|
|
|
;; All-zero i16x8 constant. The expected output is the same single
;; `vgbm %v24, 0` recorded for the other all-zero cases in this file.
function %vconst_i16x8_zero() -> i16x8 {
|
|
block0:
|
|
v1 = vconst.i16x8 [0 0 0 0 0 0 0 0]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; vgbm %v24, 0
|
|
; br %r14
|
|
|
|
;; All eight lanes hold 32767. The expected output uses the 16-bit
;; immediate-replicate form `vrepih %v24, 32767`.
function %vconst_i16x8_splat1() -> i16x8 {
|
|
block0:
|
|
v1 = vconst.i16x8 [32767 32767 32767 32767 32767 32767 32767 32767]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; vrepih %v24, 32767
|
|
; br %r14
|
|
|
|
;; All eight lanes hold -32768. The expected output uses
;; `vrepih %v24, -32768`.
function %vconst_i16x8_splat2() -> i16x8 {
|
|
block0:
|
|
v1 = vconst.i16x8 [-32768 -32768 -32768 -32768 -32768 -32768 -32768 -32768]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; vrepih %v24, -32768
|
|
; br %r14
|
|
|
|
;; Eight distinct lanes (1 through 8). The expected output loads a full
;; 128-bit literal with `vl`; in the recorded data.u128 the lanes appear in
;; ascending order from most- to least-significant halfword.
function %vconst_i16x8_mixed() -> i16x8 {
|
|
block0:
|
|
v1 = vconst.i16x8 [1 2 3 4 5 6 7 8]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; bras %r1, 20 ; data.u128 0x00010002000300040005000600070008 ; vl %v24, 0(%r1)
|
|
; br %r14
|
|
|
|
;; All-zero i8x16 constant. The expected output is the same single
;; `vgbm %v24, 0` recorded for the other all-zero cases in this file.
function %vconst_i8x16_zero() -> i8x16 {
|
|
block0:
|
|
v1 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; vgbm %v24, 0
|
|
; br %r14
|
|
|
|
;; All sixteen lanes hold 127. The expected output uses the 8-bit
;; immediate-replicate form `vrepib %v24, 127`.
function %vconst_i8x16_splat1() -> i8x16 {
|
|
block0:
|
|
v1 = vconst.i8x16 [127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; vrepib %v24, 127
|
|
; br %r14
|
|
|
|
;; All sixteen lanes hold -128. Note that the recorded expected output prints
;; the immediate as `vrepib %v24, 128`, not -128, unlike the wider-lane
;; splat2 cases, which print a negative immediate.
;; NOTE(review): presumably the same 8-bit pattern shown unsigned by the
;; instruction printer -- confirm before "correcting" the expectation.
function %vconst_i8x16_splat2() -> i8x16 {
|
|
block0:
|
|
v1 = vconst.i8x16 [-128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; vrepib %v24, 128
|
|
; br %r14
|
|
|
|
;; Sixteen distinct lanes (1 through 16). The expected output loads a full
;; 128-bit literal with `vl`; the recorded data.u128
;; (0x0102030405060708090a0b0c0d0e0f10) lists the lanes in ascending order
;; from most- to least-significant byte.
function %vconst_i8x16_mixed() -> i8x16 {
|
|
block0:
|
|
v1 = vconst.i8x16 [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
|
|
return v1
|
|
}
|
|
|
|
; block0:
|
|
; bras %r1, 20 ; data.u128 0x0102030405060708090a0b0c0d0e0f10 ; vl %v24, 0(%r1)
|
|
; br %r14
|
|
|