wasmtime/cranelift/filetests/filetests/isa/s390x/vec-conversions-le-lane.clif
Ulrich Weigand 67870d1518 s390x: Support both big- and little-endian vector lane order (#4682)
This implements the s390x back-end portion of the solution for
https://github.com/bytecodealliance/wasmtime/issues/4566

We now support both big- and little-endian vector lane order
in code generation.  The order used for a function is determined
by the function's ABI: if it uses a Wasmtime ABI, it will use
little-endian lane order, and big-endian lane order otherwise.
(This ensures that all raw_bitcast instructions generated by
both Wasmtime and other Cranelift frontends can always be
implemented as a no-op.)
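
As an illustration (a hypothetical sketch, not taken from the test
file below), a raw_bitcast between two vector types within a single
function can leave the value in its register untouched:

    function %bitcast_noop(i32x4) -> i64x2 wasmtime_system_v {
    block0(v0: i32x4):
      v1 = raw_bitcast.i64x2 v0
      return v1
    }

Since the argument and the result both live in %v24 and share the
function's lane order, the body should lower to nothing beyond the
return branch.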

Lane order affects the implementation of a number of operations:
- Vector immediates
- Vector memory load / store (in big- and little-endian variants)
- Operations explicitly using lane numbers
  (insertlane, extractlane, shuffle, swizzle); see the extractlane
  sketch after this list
- Operations implicitly using lane numbers
  (iadd_pairwise, narrow/widen, promote/demote, fcvt_low, vhigh_bits)
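
For example, with little-endian lane order, CLIF lane N of a vector
with L lanes maps to hardware element L-1-N. A hypothetical sketch
(not part of the test file below):

    function %extract_lane0(i32x4) -> i32 wasmtime_system_v {
    block0(v0: i32x4):
      v1 = extractlane v0, 0
      return v1
    }

Here lane 0 is the rightmost hardware element, so the back-end should
extract element 3 (e.g. a vlgvf of element 3), whereas big-endian lane
order would use element 0.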

In addition, when calling a function using a different lane order,
we need to lane-swap all vector values passed or returned in registers.
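
For instance (a hypothetical sketch, with an assumed callee %callee),
calling a plain SystemV-ABI function from a Wasmtime-ABI function
with an i64x2 argument and result:

    function %cross_abi(i64x2) -> i64x2 wasmtime_system_v {
        fn0 = %callee(i64x2) -> i64x2 system_v
    block0(v0: i64x2):
        v1 = call fn0(v0)
        return v1
    }

The value in %v24 needs its two doublewords swapped before the call
and again after it returns; for i64x2 a single vpdi can do that swap.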

A small number of changes to common code were also needed:

- Ensure we always select a Wasmtime calling convention on s390x
  in crates/cranelift (func_signature).

- Fix vector immediates for filetests/runtests.  In PR #4427,
  I attempted to fix this by byte-swapping the V128 value, but
  with the new scheme, we'd instead need to perform a per-lane
  byte swap.  Since we do not know the actual type in write_to_slice
  and read_from_slice, this isn't easily possible.

  Revert this part of PR #4427 again, and instead just mark the
  memory buffer as little-endian when emitting the trampoline;
  the back-end will then emit correct code to load the constant
  (see the sketch after this list).

- Change a runtest in simd-bitselect-to-vselect.clif to no longer
  make little-endian lane order assumptions.

- Remove runtests in simd-swizzle.clif that make little-endian
  lane order assumptions by relying on implicit type conversion
  when using a non-i16x8 swizzle result type (this feature should
  probably be removed anyway).
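
To illustrate the vector-immediate fix above: marking the buffer as
little-endian means the trampoline loads each constant with an
explicitly little-endian load, which the back-end already lowers
correctly for whichever lane order the function uses. A hypothetical
sketch (not from the test file below):

    function %load_v128_le(i64) -> i32x4 wasmtime_system_v {
    block0(v0: i64):
      v1 = load.i32x4 little v0
      return v1
    }

On a big-endian target such a load should become a byte-reversing
load (or a plain load plus a permute), so the loaded constant ends up
with the right value in every lane.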

Tested with both wasmtime and cg_clif.
2022-08-11 12:10:46 -07:00

test compile precise-output
target s390x

function %snarrow_i64x2_i32x4(i64x2, i64x2) -> i32x4 wasmtime_system_v {
block0(v0: i64x2, v1: i64x2):
  v2 = snarrow.i64x2 v0, v1
  return v2
}

; block0:
;   vpksg %v24, %v25, %v24
;   br %r14

function %snarrow_i32x4_i16x8(i32x4, i32x4) -> i16x8 wasmtime_system_v {
block0(v0: i32x4, v1: i32x4):
  v2 = snarrow.i32x4 v0, v1
  return v2
}

; block0:
;   vpksf %v24, %v25, %v24
;   br %r14

function %snarrow_i16x8_i8x16(i16x8, i16x8) -> i8x16 wasmtime_system_v {
block0(v0: i16x8, v1: i16x8):
  v2 = snarrow.i16x8 v0, v1
  return v2
}

; block0:
;   vpksh %v24, %v25, %v24
;   br %r14

function %unarrow_i64x2_i32x4(i64x2, i64x2) -> i32x4 wasmtime_system_v {
block0(v0: i64x2, v1: i64x2):
  v2 = unarrow.i64x2 v0, v1
  return v2
}

; block0:
;   vgbm %v5, 0
;   vmxg %v7, %v24, %v5
;   vmxg %v17, %v25, %v5
;   vpklsg %v24, %v17, %v7
;   br %r14

function %unarrow_i32x4_i16x8(i32x4, i32x4) -> i16x8 wasmtime_system_v {
block0(v0: i32x4, v1: i32x4):
  v2 = unarrow.i32x4 v0, v1
  return v2
}

; block0:
;   vgbm %v5, 0
;   vmxf %v7, %v24, %v5
;   vmxf %v17, %v25, %v5
;   vpklsf %v24, %v17, %v7
;   br %r14

function %unarrow_i16x8_i8x16(i16x8, i16x8) -> i8x16 wasmtime_system_v {
block0(v0: i16x8, v1: i16x8):
  v2 = unarrow.i16x8 v0, v1
  return v2
}

; block0:
;   vgbm %v5, 0
;   vmxh %v7, %v24, %v5
;   vmxh %v17, %v25, %v5
;   vpklsh %v24, %v17, %v7
;   br %r14

function %uunarrow_i64x2_i32x4(i64x2, i64x2) -> i32x4 wasmtime_system_v {
block0(v0: i64x2, v1: i64x2):
  v2 = uunarrow.i64x2 v0, v1
  return v2
}

; block0:
;   vpklsg %v24, %v25, %v24
;   br %r14

function %uunarrow_i32x4_i16x8(i32x4, i32x4) -> i16x8 wasmtime_system_v {
block0(v0: i32x4, v1: i32x4):
  v2 = uunarrow.i32x4 v0, v1
  return v2
}

; block0:
;   vpklsf %v24, %v25, %v24
;   br %r14

function %uunarrow_i16x8_i8x16(i16x8, i16x8) -> i8x16 wasmtime_system_v {
block0(v0: i16x8, v1: i16x8):
  v2 = uunarrow.i16x8 v0, v1
  return v2
}

; block0:
;   vpklsh %v24, %v25, %v24
;   br %r14

function %swiden_low_i32x4_i64x2(i32x4) -> i64x2 wasmtime_system_v {
block0(v0: i32x4):
  v1 = swiden_low.i32x4 v0
  return v1
}

; block0:
;   vuplf %v24, %v24
;   br %r14

function %swiden_low_i16x8_i32x4(i16x8) -> i32x4 wasmtime_system_v {
block0(v0: i16x8):
  v1 = swiden_low.i16x8 v0
  return v1
}

; block0:
;   vuplh %v24, %v24
;   br %r14

function %swiden_low_i8x16_i16x8(i8x16) -> i16x8 wasmtime_system_v {
block0(v0: i8x16):
  v1 = swiden_low.i8x16 v0
  return v1
}

; block0:
;   vuplb %v24, %v24
;   br %r14

function %swiden_high_i32x4_i64x2(i32x4) -> i64x2 wasmtime_system_v {
block0(v0: i32x4):
  v1 = swiden_high.i32x4 v0
  return v1
}

; block0:
;   vuphf %v24, %v24
;   br %r14

function %swiden_high_i16x8_i32x4(i16x8) -> i32x4 wasmtime_system_v {
block0(v0: i16x8):
  v1 = swiden_high.i16x8 v0
  return v1
}

; block0:
;   vuphh %v24, %v24
;   br %r14

function %swiden_high_i8x16_i16x8(i8x16) -> i16x8 wasmtime_system_v {
block0(v0: i8x16):
  v1 = swiden_high.i8x16 v0
  return v1
}

; block0:
;   vuphb %v24, %v24
;   br %r14

function %uwiden_low_i32x4_i64x2(i32x4) -> i64x2 wasmtime_system_v {
block0(v0: i32x4):
  v1 = uwiden_low.i32x4 v0
  return v1
}

; block0:
;   vupllf %v24, %v24
;   br %r14

function %uwiden_low_i16x8_i32x4(i16x8) -> i32x4 wasmtime_system_v {
block0(v0: i16x8):
  v1 = uwiden_low.i16x8 v0
  return v1
}

; block0:
;   vupllh %v24, %v24
;   br %r14

function %uwiden_low_i8x16_i16x8(i8x16) -> i16x8 wasmtime_system_v {
block0(v0: i8x16):
  v1 = uwiden_low.i8x16 v0
  return v1
}

; block0:
;   vupllb %v24, %v24
;   br %r14

function %uwiden_high_i32x4_i64x2(i32x4) -> i64x2 wasmtime_system_v {
block0(v0: i32x4):
  v1 = uwiden_high.i32x4 v0
  return v1
}

; block0:
;   vuplhf %v24, %v24
;   br %r14

function %uwiden_high_i16x8_i32x4(i16x8) -> i32x4 wasmtime_system_v {
block0(v0: i16x8):
  v1 = uwiden_high.i16x8 v0
  return v1
}

; block0:
;   vuplhh %v24, %v24
;   br %r14

function %uwiden_high_i8x16_i16x8(i8x16) -> i16x8 wasmtime_system_v {
block0(v0: i8x16):
  v1 = uwiden_high.i8x16 v0
  return v1
}

; block0:
;   vuplhb %v24, %v24
;   br %r14
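
Note how little-endian lane order shows up in the expected output
above: snarrow takes its low lanes from v0, but in little-endian lane
order those are the rightmost hardware elements, so the inputs reach
vpksg in swapped order (%v25 before %v24). As a hedged contrast, the
same function without the Wasmtime ABI (and thus with big-endian lane
order) would be expected to pack in source order instead:

    function %snarrow_be(i64x2, i64x2) -> i32x4 {
    block0(v0: i64x2, v1: i64x2):
      v2 = snarrow.i64x2 v0, v1
      return v2
    }
    ; expected (assumption): vpksg %v24, %v24, %v25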