s390x: Support both big- and little-endian vector lane order (#4682)

This implements the s390x back-end portion of the solution for
https://github.com/bytecodealliance/wasmtime/issues/4566

We now support both big- and little-endian vector lane order
in code generation.  The order used for a function is determined
by the function's ABI: if it uses a Wasmtime ABI, it will use
little-endian lane order, and big-endian lane order otherwise.
(This ensures that all raw_bitcast instructions generated by
either wasmtime or any other cranelift frontend can always be
implemented as no-ops.)

Lane order affects the implementation of a number of operations:
- Vector immediates
- Vector memory load / store (in big- and little-endian variants)
- Operations explicitly using lane numbers
  (insertlane, extractlane, shuffle, swizzle)
- Operations implicitly using lane numbers
  (iadd_pairwise, narrow/widen, promote/demote, fcvt_low, vhigh_bits)

In addition, when calling a function whose lane order differs from
the caller's, we need to lane-swap all vector values passed or
returned in registers.

A small number of changes to common code were also needed:

- Ensure we always select a Wasmtime calling convention on s390x
  in crates/cranelift (func_signature).

- Fix vector immediates for filetests/runtests.  In PR #4427,
  I attempted to fix this by byte-swapping the V128 value, but
  with the new scheme, we'd instead need to perform a per-lane
  byte swap.  Since we do not know the actual type in write_to_slice
  and read_from_slice, this isn't easily possible.

  Revert this part of PR #4427 again, and instead just mark the
  memory buffer as little-endian when emitting the trampoline;
  the back-end will then emit correct code to load the constant.

- Change a runtest in simd-bitselect-to-vselect.clif to no longer
  make little-endian lane order assumptions.

- Remove runtests in simd-swizzle.clif that make little-endian
  lane order assumptions by relying on implicit type conversion
  when using a non-i16x8 swizzle result type (this feature should
  probably be removed anyway).

Tested with both wasmtime and cg_clif.
This commit is contained in:
Ulrich Weigand
2022-08-11 21:10:46 +02:00
committed by GitHub
parent c1c48b4386
commit 67870d1518
29 changed files with 6584 additions and 593 deletions

View File

@@ -28,8 +28,9 @@ mod emit_tests;
// Instructions (top level): definition
pub use crate::isa::s390x::lower::isle::generated_code::{
ALUOp, CmpOp, FPUOp1, FPUOp2, FPUOp3, FpuRoundMode, FpuRoundOp, MInst as Inst, RxSBGOp,
ShiftOp, SymbolReloc, UnaryOp, VecBinaryOp, VecFloatCmpOp, VecIntCmpOp, VecShiftOp, VecUnaryOp,
ALUOp, CmpOp, FPUOp1, FPUOp2, FPUOp3, FpuRoundMode, FpuRoundOp, LaneOrder, MInst as Inst,
RxSBGOp, ShiftOp, SymbolReloc, UnaryOp, VecBinaryOp, VecFloatCmpOp, VecIntCmpOp, VecShiftOp,
VecUnaryOp,
};
/// Additional information for (direct) Call instructions, left out of line to lower the size of
@@ -245,7 +246,19 @@ impl Inst {
// These are all part of VXRS_EXT2
Inst::VecLoadRev { .. }
| Inst::VecLoadByte16Rev { .. }
| Inst::VecLoadByte32Rev { .. }
| Inst::VecLoadByte64Rev { .. }
| Inst::VecLoadElt16Rev { .. }
| Inst::VecLoadElt32Rev { .. }
| Inst::VecLoadElt64Rev { .. }
| Inst::VecStoreRev { .. }
| Inst::VecStoreByte16Rev { .. }
| Inst::VecStoreByte32Rev { .. }
| Inst::VecStoreByte64Rev { .. }
| Inst::VecStoreElt16Rev { .. }
| Inst::VecStoreElt32Rev { .. }
| Inst::VecStoreElt64Rev { .. }
| Inst::VecLoadReplicateRev { .. }
| Inst::VecLoadLaneRev { .. }
| Inst::VecLoadLaneRevUndef { .. }
@@ -762,6 +775,30 @@ fn s390x_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandC
collector.reg_def(rd);
memarg_operands(mem, collector);
}
&Inst::VecLoadByte16Rev { rd, ref mem, .. } => {
collector.reg_def(rd);
memarg_operands(mem, collector);
}
&Inst::VecLoadByte32Rev { rd, ref mem, .. } => {
collector.reg_def(rd);
memarg_operands(mem, collector);
}
&Inst::VecLoadByte64Rev { rd, ref mem, .. } => {
collector.reg_def(rd);
memarg_operands(mem, collector);
}
&Inst::VecLoadElt16Rev { rd, ref mem, .. } => {
collector.reg_def(rd);
memarg_operands(mem, collector);
}
&Inst::VecLoadElt32Rev { rd, ref mem, .. } => {
collector.reg_def(rd);
memarg_operands(mem, collector);
}
&Inst::VecLoadElt64Rev { rd, ref mem, .. } => {
collector.reg_def(rd);
memarg_operands(mem, collector);
}
&Inst::VecStore { rd, ref mem, .. } => {
collector.reg_use(rd);
memarg_operands(mem, collector);
@@ -770,6 +807,30 @@ fn s390x_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandC
collector.reg_use(rd);
memarg_operands(mem, collector);
}
&Inst::VecStoreByte16Rev { rd, ref mem, .. } => {
collector.reg_use(rd);
memarg_operands(mem, collector);
}
&Inst::VecStoreByte32Rev { rd, ref mem, .. } => {
collector.reg_use(rd);
memarg_operands(mem, collector);
}
&Inst::VecStoreByte64Rev { rd, ref mem, .. } => {
collector.reg_use(rd);
memarg_operands(mem, collector);
}
&Inst::VecStoreElt16Rev { rd, ref mem, .. } => {
collector.reg_use(rd);
memarg_operands(mem, collector);
}
&Inst::VecStoreElt32Rev { rd, ref mem, .. } => {
collector.reg_use(rd);
memarg_operands(mem, collector);
}
&Inst::VecStoreElt64Rev { rd, ref mem, .. } => {
collector.reg_use(rd);
memarg_operands(mem, collector);
}
&Inst::VecLoadReplicate { rd, ref mem, .. } => {
collector.reg_def(rd);
memarg_operands(mem, collector);
@@ -2476,10 +2537,23 @@ impl Inst {
op, rm, rn, tmp, rn, rm
)
}
&Inst::VecLoad { rd, ref mem } | &Inst::VecLoadRev { rd, ref mem } => {
&Inst::VecLoad { rd, ref mem }
| &Inst::VecLoadRev { rd, ref mem }
| &Inst::VecLoadByte16Rev { rd, ref mem }
| &Inst::VecLoadByte32Rev { rd, ref mem }
| &Inst::VecLoadByte64Rev { rd, ref mem }
| &Inst::VecLoadElt16Rev { rd, ref mem }
| &Inst::VecLoadElt32Rev { rd, ref mem }
| &Inst::VecLoadElt64Rev { rd, ref mem } => {
let opcode = match self {
&Inst::VecLoad { .. } => "vl",
&Inst::VecLoadRev { .. } => "vlbrq",
&Inst::VecLoadByte16Rev { .. } => "vlbrh",
&Inst::VecLoadByte32Rev { .. } => "vlbrf",
&Inst::VecLoadByte64Rev { .. } => "vlbrg",
&Inst::VecLoadElt16Rev { .. } => "vlerh",
&Inst::VecLoadElt32Rev { .. } => "vlerf",
&Inst::VecLoadElt64Rev { .. } => "vlerg",
_ => unreachable!(),
};
@@ -2489,10 +2563,23 @@ impl Inst {
let mem = mem.pretty_print_default();
format!("{}{} {}, {}", mem_str, opcode, rd, mem)
}
&Inst::VecStore { rd, ref mem } | &Inst::VecStoreRev { rd, ref mem } => {
&Inst::VecStore { rd, ref mem }
| &Inst::VecStoreRev { rd, ref mem }
| &Inst::VecStoreByte16Rev { rd, ref mem }
| &Inst::VecStoreByte32Rev { rd, ref mem }
| &Inst::VecStoreByte64Rev { rd, ref mem }
| &Inst::VecStoreElt16Rev { rd, ref mem }
| &Inst::VecStoreElt32Rev { rd, ref mem }
| &Inst::VecStoreElt64Rev { rd, ref mem } => {
let opcode = match self {
&Inst::VecStore { .. } => "vst",
&Inst::VecStoreRev { .. } => "vstbrq",
&Inst::VecStoreByte16Rev { .. } => "vstbrh",
&Inst::VecStoreByte32Rev { .. } => "vstbrf",
&Inst::VecStoreByte64Rev { .. } => "vstbrg",
&Inst::VecStoreElt16Rev { .. } => "vsterh",
&Inst::VecStoreElt32Rev { .. } => "vsterf",
&Inst::VecStoreElt64Rev { .. } => "vsterg",
_ => unreachable!(),
};