s390x: Support both big- and little-endian vector lane order (#4682)
This implements the s390x back-end portion of the solution for https://github.com/bytecodealliance/wasmtime/issues/4566

We now support both big- and little-endian vector lane order in code generation. The order used for a function is determined by the function's ABI: if it uses a Wasmtime ABI, it uses little-endian lane order, and big-endian lane order otherwise. (This ensures that all raw_bitcast instructions generated by both wasmtime and other cranelift frontends can always be implemented as a no-op.)

Lane order affects the implementation of a number of operations:

- Vector immediates
- Vector memory loads / stores (in big- and little-endian variants)
- Operations explicitly using lane numbers (insertlane, extractlane, shuffle, swizzle)
- Operations implicitly using lane numbers (iadd_pairwise, narrow/widen, promote/demote, fcvt_low, vhigh_bits)

In addition, when calling a function that uses a different lane order, we need to lane-swap all vector values passed or returned in registers.

A small number of changes to common code were also needed:

- Ensure we always select a Wasmtime calling convention on s390x in crates/cranelift (func_signature).
- Fix vector immediates for filetests/runtests. In PR #4427, I attempted to fix this by byte-swapping the V128 value, but with the new scheme we would instead need to perform a per-lane byte swap. Since we do not know the actual type in write_to_slice and read_from_slice, this is not easily possible. Revert this part of PR #4427, and instead just mark the memory buffer as little-endian when emitting the trampoline; the back-end will then emit correct code to load the constant.
- Change a runtest in simd-bitselect-to-vselect.clif to no longer make little-endian lane order assumptions.
- Remove runtests in simd-swizzle.clif that make little-endian lane order assumptions by relying on implicit type conversion when using a non-i16x8 swizzle result type (a feature that should probably be removed anyway).

Tested with both wasmtime and cg_clif.
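For intuition, here is a minimal sketch of the lane-mapping rule (illustrative only; LaneOrder is the enum this commit adds, but the helper below is hypothetical, not the back-end's actual API). s390x vector instructions number lanes starting from the most-significant element, which matches big-endian lane order directly, so under little-endian lane order a CLIF lane index i maps to physical lane lane_count - 1 - i:

    // Hypothetical sketch, not the actual back-end code.
    enum LaneOrder {
        BigEndian,
        LittleEndian,
    }

    /// Map a CLIF lane index to the physical s390x vector lane.
    fn physical_lane(order: LaneOrder, lane_count: u8, clif_lane: u8) -> u8 {
        assert!(clif_lane < lane_count);
        match order {
            // s390x numbers lanes from the most-significant element,
            // so big-endian lane order is the identity mapping.
            LaneOrder::BigEndian => clif_lane,
            // Little-endian lane order reverses the numbering.
            LaneOrder::LittleEndian => lane_count - 1 - clif_lane,
        }
    }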
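The call-boundary rule can be sketched the same way (again a hedged illustration, not the actual lowering): a swap is needed exactly when caller and callee disagree on lane order, e.g. a Wasmtime-ABI function (little-endian lanes) calling a SystemV-ABI function (big-endian lanes), and on s390x it would be a single vector-permute instruction rather than the scalar loop modeled here:

    /// Illustrative model of the lane swap applied to each vector value
    /// passed or returned in a register: lane i becomes lane N - 1 - i.
    fn lane_swap<T: Copy, const N: usize>(v: [T; N]) -> [T; N] {
        let mut out = v;
        for i in 0..N {
            out[i] = v[N - 1 - i];
        }
        out
    }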
@@ -28,8 +28,9 @@ mod emit_tests;
 // Instructions (top level): definition

 pub use crate::isa::s390x::lower::isle::generated_code::{
-    ALUOp, CmpOp, FPUOp1, FPUOp2, FPUOp3, FpuRoundMode, FpuRoundOp, MInst as Inst, RxSBGOp,
-    ShiftOp, SymbolReloc, UnaryOp, VecBinaryOp, VecFloatCmpOp, VecIntCmpOp, VecShiftOp, VecUnaryOp,
+    ALUOp, CmpOp, FPUOp1, FPUOp2, FPUOp3, FpuRoundMode, FpuRoundOp, LaneOrder, MInst as Inst,
+    RxSBGOp, ShiftOp, SymbolReloc, UnaryOp, VecBinaryOp, VecFloatCmpOp, VecIntCmpOp, VecShiftOp,
+    VecUnaryOp,
 };

 /// Additional information for (direct) Call instructions, left out of line to lower the size of
@@ -245,7 +246,19 @@ impl Inst {

             // These are all part of VXRS_EXT2
             Inst::VecLoadRev { .. }
+            | Inst::VecLoadByte16Rev { .. }
+            | Inst::VecLoadByte32Rev { .. }
+            | Inst::VecLoadByte64Rev { .. }
+            | Inst::VecLoadElt16Rev { .. }
+            | Inst::VecLoadElt32Rev { .. }
+            | Inst::VecLoadElt64Rev { .. }
             | Inst::VecStoreRev { .. }
+            | Inst::VecStoreByte16Rev { .. }
+            | Inst::VecStoreByte32Rev { .. }
+            | Inst::VecStoreByte64Rev { .. }
+            | Inst::VecStoreElt16Rev { .. }
+            | Inst::VecStoreElt32Rev { .. }
+            | Inst::VecStoreElt64Rev { .. }
             | Inst::VecLoadReplicateRev { .. }
             | Inst::VecLoadLaneRev { .. }
             | Inst::VecLoadLaneRevUndef { .. }
@@ -762,6 +775,30 @@ fn s390x_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandC
             collector.reg_def(rd);
             memarg_operands(mem, collector);
         }
+        &Inst::VecLoadByte16Rev { rd, ref mem, .. } => {
+            collector.reg_def(rd);
+            memarg_operands(mem, collector);
+        }
+        &Inst::VecLoadByte32Rev { rd, ref mem, .. } => {
+            collector.reg_def(rd);
+            memarg_operands(mem, collector);
+        }
+        &Inst::VecLoadByte64Rev { rd, ref mem, .. } => {
+            collector.reg_def(rd);
+            memarg_operands(mem, collector);
+        }
+        &Inst::VecLoadElt16Rev { rd, ref mem, .. } => {
+            collector.reg_def(rd);
+            memarg_operands(mem, collector);
+        }
+        &Inst::VecLoadElt32Rev { rd, ref mem, .. } => {
+            collector.reg_def(rd);
+            memarg_operands(mem, collector);
+        }
+        &Inst::VecLoadElt64Rev { rd, ref mem, .. } => {
+            collector.reg_def(rd);
+            memarg_operands(mem, collector);
+        }
         &Inst::VecStore { rd, ref mem, .. } => {
             collector.reg_use(rd);
             memarg_operands(mem, collector);
@@ -770,6 +807,30 @@ fn s390x_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandC
             collector.reg_use(rd);
             memarg_operands(mem, collector);
         }
+        &Inst::VecStoreByte16Rev { rd, ref mem, .. } => {
+            collector.reg_use(rd);
+            memarg_operands(mem, collector);
+        }
+        &Inst::VecStoreByte32Rev { rd, ref mem, .. } => {
+            collector.reg_use(rd);
+            memarg_operands(mem, collector);
+        }
+        &Inst::VecStoreByte64Rev { rd, ref mem, .. } => {
+            collector.reg_use(rd);
+            memarg_operands(mem, collector);
+        }
+        &Inst::VecStoreElt16Rev { rd, ref mem, .. } => {
+            collector.reg_use(rd);
+            memarg_operands(mem, collector);
+        }
+        &Inst::VecStoreElt32Rev { rd, ref mem, .. } => {
+            collector.reg_use(rd);
+            memarg_operands(mem, collector);
+        }
+        &Inst::VecStoreElt64Rev { rd, ref mem, .. } => {
+            collector.reg_use(rd);
+            memarg_operands(mem, collector);
+        }
         &Inst::VecLoadReplicate { rd, ref mem, .. } => {
             collector.reg_def(rd);
             memarg_operands(mem, collector);
@@ -2476,10 +2537,23 @@ impl Inst {
                     op, rm, rn, tmp, rn, rm
                 )
             }
-            &Inst::VecLoad { rd, ref mem } | &Inst::VecLoadRev { rd, ref mem } => {
+            &Inst::VecLoad { rd, ref mem }
+            | &Inst::VecLoadRev { rd, ref mem }
+            | &Inst::VecLoadByte16Rev { rd, ref mem }
+            | &Inst::VecLoadByte32Rev { rd, ref mem }
+            | &Inst::VecLoadByte64Rev { rd, ref mem }
+            | &Inst::VecLoadElt16Rev { rd, ref mem }
+            | &Inst::VecLoadElt32Rev { rd, ref mem }
+            | &Inst::VecLoadElt64Rev { rd, ref mem } => {
                 let opcode = match self {
                     &Inst::VecLoad { .. } => "vl",
                     &Inst::VecLoadRev { .. } => "vlbrq",
+                    &Inst::VecLoadByte16Rev { .. } => "vlbrh",
+                    &Inst::VecLoadByte32Rev { .. } => "vlbrf",
+                    &Inst::VecLoadByte64Rev { .. } => "vlbrg",
+                    &Inst::VecLoadElt16Rev { .. } => "vlerh",
+                    &Inst::VecLoadElt32Rev { .. } => "vlerf",
+                    &Inst::VecLoadElt64Rev { .. } => "vlerg",
                     _ => unreachable!(),
                 };

@@ -2489,10 +2563,23 @@ impl Inst {
                 let mem = mem.pretty_print_default();
                 format!("{}{} {}, {}", mem_str, opcode, rd, mem)
             }
-            &Inst::VecStore { rd, ref mem } | &Inst::VecStoreRev { rd, ref mem } => {
+            &Inst::VecStore { rd, ref mem }
+            | &Inst::VecStoreRev { rd, ref mem }
+            | &Inst::VecStoreByte16Rev { rd, ref mem }
+            | &Inst::VecStoreByte32Rev { rd, ref mem }
+            | &Inst::VecStoreByte64Rev { rd, ref mem }
+            | &Inst::VecStoreElt16Rev { rd, ref mem }
+            | &Inst::VecStoreElt32Rev { rd, ref mem }
+            | &Inst::VecStoreElt64Rev { rd, ref mem } => {
                 let opcode = match self {
                     &Inst::VecStore { .. } => "vst",
                     &Inst::VecStoreRev { .. } => "vstbrq",
+                    &Inst::VecStoreByte16Rev { .. } => "vstbrh",
+                    &Inst::VecStoreByte32Rev { .. } => "vstbrf",
+                    &Inst::VecStoreByte64Rev { .. } => "vstbrg",
+                    &Inst::VecStoreElt16Rev { .. } => "vsterh",
+                    &Inst::VecStoreElt32Rev { .. } => "vsterf",
+                    &Inst::VecStoreElt64Rev { .. } => "vsterg",
                     _ => unreachable!(),
                 };
