This implements the s390x back-end portion of the solution for https://github.com/bytecodealliance/wasmtime/issues/4566.

We now support both big- and little-endian vector lane order in code generation. The order used for a function is determined by the function's ABI: if it uses a Wasmtime ABI, it uses little-endian lane order, and big-endian lane order otherwise. (This ensures that all raw_bitcast instructions generated by both wasmtime and other cranelift frontends can always be implemented as a no-op.)

Lane order affects the implementation of a number of operations:
- Vector immediates
- Vector memory load / store (in big- and little-endian variants)
- Operations explicitly using lane numbers (insertlane, extractlane, shuffle, swizzle)
- Operations implicitly using lane numbers (iadd_pairwise, narrow/widen, promote/demote, fcvt_low, vhigh_bits)

In addition, when calling a function that uses a different lane order, we need to lane-swap all vector values passed or returned in registers.

A small number of changes to common code were also needed:
- Ensure we always select a Wasmtime calling convention on s390x in crates/cranelift (func_signature).
- Fix vector immediates for filetests/runtests. In PR #4427, I attempted to fix this by byte-swapping the V128 value, but with the new scheme we would instead need to perform a per-lane byte swap (see the sketch below). Since we do not know the actual type in write_to_slice and read_from_slice, this isn't easily possible. Revert this part of PR #4427 again, and instead just mark the memory buffer as little-endian when emitting the trampoline; the back-end will then emit correct code to load the constant.
- Change a runtest in simd-bitselect-to-vselect.clif to no longer make little-endian lane order assumptions.
- Remove runtests in simd-swizzle.clif that make little-endian lane order assumptions by relying on implicit type conversion when using a non-i16x8 swizzle result type (this feature should probably be removed anyway).

Tested with both wasmtime and cg_clif.
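To make the per-lane byte swap point concrete, here is a minimal standalone sketch (plain Rust, not Cranelift code; the helper names are made up for illustration). It contrasts byte-swapping a whole V128 value, as PR #4427 did, with the per-lane swap the new lane-order scheme would require, which depends on knowing the lane width:

fn swap_whole_v128(bytes: [u8; 16]) -> [u8; 16] {
    // Reverses all 16 bytes at once: both the lane order and the byte order
    // within each lane are flipped.
    let mut out = bytes;
    out.reverse();
    out
}

fn swap_per_lane(bytes: [u8; 16], lane_bytes: usize) -> [u8; 16] {
    // Reverses the bytes within each lane while keeping the lanes in place.
    // The lane width (1, 2, 4, or 8 bytes) must be known -- exactly the
    // information that write_to_slice / read_from_slice do not have.
    let mut out = bytes;
    for lane in out.chunks_exact_mut(lane_bytes) {
        lane.reverse();
    }
    out
}

fn main() {
    let v: [u8; 16] = *b"0123456789abcdef";
    // Whole-value swap: "fedcba9876543210"
    println!("{}", String::from_utf8_lossy(&swap_whole_v128(v)));
    // Per-lane swap for 4-byte (i32x4) lanes: "32107654ba98fedc"
    println!("{}", String::from_utf8_lossy(&swap_per_lane(v, 4)));
}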
test compile precise-output
target s390x arch13

function %uload8x8_big(i64) -> i16x8 {
block0(v0: i64):
  v1 = uload8x8 big v0
  return v1
}

; block0:
; ld %f3, 0(%r2)
; vuplhb %v24, %v3
; br %r14

function %uload16x4_big(i64) -> i32x4 {
block0(v0: i64):
  v1 = uload16x4 big v0
  return v1
}

; block0:
; ld %f3, 0(%r2)
; vuplhh %v24, %v3
; br %r14

function %uload32x2_big(i64) -> i64x2 {
block0(v0: i64):
  v1 = uload32x2 big v0
  return v1
}

; block0:
; ld %f3, 0(%r2)
; vuplhf %v24, %v3
; br %r14

function %sload8x8_big(i64) -> i16x8 {
block0(v0: i64):
  v1 = sload8x8 big v0
  return v1
}

; block0:
; ld %f3, 0(%r2)
; vuphb %v24, %v3
; br %r14

function %sload16x4_big(i64) -> i32x4 {
block0(v0: i64):
  v1 = sload16x4 big v0
  return v1
}

; block0:
; ld %f3, 0(%r2)
; vuphh %v24, %v3
; br %r14

function %sload32x2_big(i64) -> i64x2 {
block0(v0: i64):
  v1 = sload32x2 big v0
  return v1
}

; block0:
; ld %f3, 0(%r2)
; vuphf %v24, %v3
; br %r14

function %load_i8x16_big(i64) -> i8x16 {
block0(v0: i64):
  v1 = load.i8x16 big v0
  return v1
}

; block0:
; vl %v24, 0(%r2)
; br %r14

function %load_i16x8_big(i64) -> i16x8 {
block0(v0: i64):
  v1 = load.i16x8 big v0
  return v1
}

; block0:
; vl %v24, 0(%r2)
; br %r14

function %load_i32x4_big(i64) -> i32x4 {
block0(v0: i64):
  v1 = load.i32x4 big v0
  return v1
}

; block0:
; vl %v24, 0(%r2)
; br %r14

function %load_i64x2_big(i64) -> i64x2 {
block0(v0: i64):
  v1 = load.i64x2 big v0
  return v1
}

; block0:
; vl %v24, 0(%r2)
; br %r14

function %load_i128_big(i64) -> i128 {
block0(v0: i64):
  v1 = load.i128 big v0
  return v1
}

; block0:
; vl %v5, 0(%r3)
; vst %v5, 0(%r2)
; br %r14

function %load_f32x4_big(i64) -> f32x4 {
block0(v0: i64):
  v1 = load.f32x4 big v0
  return v1
}

; block0:
; vl %v24, 0(%r2)
; br %r14

function %load_f64x2_big(i64) -> f64x2 {
block0(v0: i64):
  v1 = load.f64x2 big v0
  return v1
}

; block0:
; vl %v24, 0(%r2)
; br %r14

function %store_i8x16_big(i8x16, i64) {
block0(v0: i8x16, v1: i64):
  store.i8x16 big v0, v1
  return
}

; block0:
; vst %v24, 0(%r2)
; br %r14

function %store_i16x8_big(i16x8, i64) {
block0(v0: i16x8, v1: i64):
  store.i16x8 big v0, v1
  return
}

; block0:
; vst %v24, 0(%r2)
; br %r14

function %store_i32x4_big(i32x4, i64) {
block0(v0: i32x4, v1: i64):
  store.i32x4 big v0, v1
  return
}

; block0:
; vst %v24, 0(%r2)
; br %r14

function %store_i64x2_big(i64x2, i64) {
block0(v0: i64x2, v1: i64):
  store.i64x2 big v0, v1
  return
}

; block0:
; vst %v24, 0(%r2)
; br %r14

function %store_i128_big(i128, i64) {
block0(v0: i128, v1: i64):
  store.i128 big v0, v1
  return
}

; block0:
; vl %v0, 0(%r2)
; vst %v0, 0(%r3)
; br %r14

function %store_f32x4_big(f32x4, i64) {
block0(v0: f32x4, v1: i64):
  store.f32x4 big v0, v1
  return
}

; block0:
; vst %v24, 0(%r2)
; br %r14

function %store_f64x2_big(f64x2, i64) {
block0(v0: f64x2, v1: i64):
  store.f64x2 big v0, v1
  return
}

; block0:
; vst %v24, 0(%r2)
; br %r14

function %uload8x8_little(i64) -> i16x8 {
block0(v0: i64):
  v1 = uload8x8 little v0
  return v1
}

; block0:
; ld %f3, 0(%r2)
; vuplhb %v24, %v3
; br %r14

function %uload16x4_little(i64) -> i32x4 {
block0(v0: i64):
  v1 = uload16x4 little v0
  return v1
}

; block0:
; ld %f3, 0(%r2)
; verllh %v5, %v3, 8
; vuplhh %v24, %v5
; br %r14

function %uload32x2_little(i64) -> i64x2 {
block0(v0: i64):
  v1 = uload32x2 little v0
  return v1
}

; block0:
; vlebrg %v3, 0(%r2), 0
; verllg %v5, %v3, 32
; vuplhf %v24, %v5
; br %r14

function %sload8x8_little(i64) -> i16x8 {
block0(v0: i64):
  v1 = sload8x8 little v0
  return v1
}

; block0:
; ld %f3, 0(%r2)
; vuphb %v24, %v3
; br %r14

function %sload16x4_little(i64) -> i32x4 {
block0(v0: i64):
  v1 = sload16x4 little v0
  return v1
}

; block0:
; ld %f3, 0(%r2)
; verllh %v5, %v3, 8
; vuphh %v24, %v5
; br %r14

function %sload32x2_little(i64) -> i64x2 {
block0(v0: i64):
  v1 = sload32x2 little v0
  return v1
}

; block0:
; vlebrg %v3, 0(%r2), 0
; verllg %v5, %v3, 32
; vuphf %v24, %v5
; br %r14

function %load_i8x16_little(i64) -> i8x16 {
block0(v0: i64):
  v1 = load.i8x16 little v0
  return v1
}

; block0:
; vl %v24, 0(%r2)
; br %r14

function %load_i16x8_little(i64) -> i16x8 {
block0(v0: i64):
  v1 = load.i16x8 little v0
  return v1
}

; block0:
; vlbrh %v24, 0(%r2)
; br %r14

function %load_i32x4_little(i64) -> i32x4 {
block0(v0: i64):
  v1 = load.i32x4 little v0
  return v1
}

; block0:
; vlbrf %v24, 0(%r2)
; br %r14

function %load_i64x2_little(i64) -> i64x2 {
block0(v0: i64):
  v1 = load.i64x2 little v0
  return v1
}

; block0:
; vlbrg %v24, 0(%r2)
; br %r14

function %load_i128_little(i64) -> i128 {
block0(v0: i64):
  v1 = load.i128 little v0
  return v1
}

; block0:
; vlbrq %v5, 0(%r3)
; vst %v5, 0(%r2)
; br %r14

function %load_f32x4_little(i64) -> f32x4 {
block0(v0: i64):
  v1 = load.f32x4 little v0
  return v1
}

; block0:
; vlbrf %v24, 0(%r2)
; br %r14

function %load_f64x2_little(i64) -> f64x2 {
block0(v0: i64):
  v1 = load.f64x2 little v0
  return v1
}

; block0:
; vlbrg %v24, 0(%r2)
; br %r14

function %store_i8x16_little(i8x16, i64) {
block0(v0: i8x16, v1: i64):
  store.i8x16 little v0, v1
  return
}

; block0:
; vst %v24, 0(%r2)
; br %r14

function %store_i16x8_little(i16x8, i64) {
block0(v0: i16x8, v1: i64):
  store.i16x8 little v0, v1
  return
}

; block0:
; vstbrh %v24, 0(%r2)
; br %r14

function %store_i32x4_little(i32x4, i64) {
block0(v0: i32x4, v1: i64):
  store.i32x4 little v0, v1
  return
}

; block0:
; vstbrf %v24, 0(%r2)
; br %r14

function %store_i64x2_little(i64x2, i64) {
block0(v0: i64x2, v1: i64):
  store.i64x2 little v0, v1
  return
}

; block0:
; vstbrg %v24, 0(%r2)
; br %r14

function %store_i128_little(i128, i64) {
block0(v0: i128, v1: i64):
  store.i128 little v0, v1
  return
}

; block0:
; vl %v0, 0(%r2)
; vstbrq %v0, 0(%r3)
; br %r14

function %store_f32x4_little(f32x4, i64) {
block0(v0: f32x4, v1: i64):
  store.f32x4 little v0, v1
  return
}

; block0:
; vstbrf %v24, 0(%r2)
; br %r14

function %store_f64x2_little(f64x2, i64) {
block0(v0: f64x2, v1: i64):
  store.f64x2 little v0, v1
  return
}

; block0:
; vstbrg %v24, 0(%r2)
; br %r14