s390x: Support both big- and little-endian vector lane order (#4682)

This implements the s390x back-end portion of the solution for
https://github.com/bytecodealliance/wasmtime/issues/4566

We now support both big- and little-endian vector lane order
in code generation.  The order used for a function is determined
by the function's ABI: a function with a Wasmtime calling
convention uses little-endian lane order; any other function
uses big-endian lane order.  (This ensures that all raw_bitcast
instructions generated both by wasmtime and by other cranelift
frontends can always be implemented as a no-op.)
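
Concretely, the choice reduces to a single predicate on the
calling convention; this is exactly the lane_order_for_call_conv
helper added in the diff below:

    fn lane_order_for_call_conv(call_conv: CallConv) -> LaneOrder {
        if call_conv.extends_wasmtime() {
            LaneOrder::LittleEndian
        } else {
            LaneOrder::BigEndian
        }
    }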

Lane order affects the implementation of a number of operations:
- Vector immediates
- Vector memory load / store (in big- and little-endian variants)
- Operations explicitly using lane numbers
  (insertlane, extractlane, shuffle, swizzle); see the
  index-remapping sketch after this list
- Operations implicitly using lane numbers
  (iadd_pairwise, narrow/widen, promote/demote, fcvt_low, vhigh_bits)
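
For the explicit-lane-number operations, the CLIF lane index
must be remapped to the machine's lane numbering (which counts
from the most-significant element) whenever the function uses
little-endian lane order.  A sketch of the remapping, matching
the be_lane_idx helper in the diff:

    // Map a CLIF lane index to the hardware's big-endian numbering.
    fn be_lane_idx(lane_count: u8, lane_order: LaneOrder, idx: u8) -> u8 {
        match lane_order {
            LaneOrder::LittleEndian => lane_count - 1 - idx,
            LaneOrder::BigEndian => idx,
        }
    }

For example, extracting lane 1 of an i32x4 value addresses
hardware lane 4 - 1 - 1 = 2 under little-endian lane order.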

In addition, when calling a function using a different lane order,
we need to lane-swap all vector values passed or returned in registers.
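
Conceptually, this fix-up is a full lane reversal of each
128-bit vector value.  A scalar model of the transformation
(the back-end emits a vector permute rather than this loop;
the same loop also underlies be_vec_const in the diff):

    // Reverse the order of lane_count lanes of lane_bits bits each
    // within a 128-bit value (lane_bits is at most 64 for vectors).
    fn lane_swap(n: u128, lane_count: u32, lane_bits: u32) -> u128 {
        let lane_mask = (1u128 << lane_bits) - 1;
        let (mut src, mut dst) = (n, 0u128);
        for _ in 0..lane_count {
            dst = (dst << lane_bits) | (src & lane_mask);
            src >>= lane_bits;
        }
        dst
    }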

A small number of changes to common code were also needed:

- Ensure we always select a Wasmtime calling convention on s390x
  in crates/cranelift (func_signature).

- Fix vector immediates for filetests/runtests.  In PR #4427,
  I attempted to fix this by byte-swapping the V128 value, but
  with the new scheme, we'd instead need to perform a per-lane
  byte swap.  Since we do not know the actual type in write_to_slice
and read_from_slice, this isn't easily possible (see the
illustration at the end of this list).

  Revert this part of PR #4427 again, and instead just mark the
  memory buffer as little-endian when emitting the trampoline;
  the back-end will then emit correct code to load the constant.

- Change a runtest in simd-bitselect-to-vselect.clif to no longer
  make little-endian lane order assumptions.

- Remove runtests in simd-swizzle.clif that make little-endian
  lane order assumptions by relying on implicit type conversion
  when using a non-i16x8 swizzle result type (this feature should
  probably be removed anyway).
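
To illustrate the write_to_slice / read_from_slice point above:
for an i32x4 constant, byte-swapping the whole V128 value is not
the same as byte-swapping each lane, and the per-lane variant
cannot be computed without knowing the lane width.  A minimal
demonstration:

    // i32x4 constant [1, 2, 3, 4] as its 16 little-endian bytes.
    let le = 0x00000004_00000003_00000002_00000001u128.to_le_bytes();

    // Whole-value byte swap: reverses the lane order as well as
    // the bytes within each lane.
    let mut whole = le;
    whole.reverse();

    // Per-lane byte swap: keeps the lane order, but requires the
    // lane width (4 bytes here) to be known.
    let mut per_lane = le;
    for lane in per_lane.chunks_mut(4) {
        lane.reverse();
    }

    assert_ne!(whole, per_lane);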

Tested with both wasmtime and cg_clif.
Author:  Ulrich Weigand
Date:    2022-08-11 21:10:46 +02:00
Commit:  67870d1518 (parent: c1c48b4386)
Changes: 29 changed files with 6584 additions and 593 deletions

File: cranelift/codegen/src/isa/s390x/lower/isle.rs

@@ -6,8 +6,8 @@ pub mod generated_code;
 // Types that the generated ISLE code uses via `use super::*`.
 use crate::isa::s390x::abi::{S390xMachineDeps, REG_SAVE_AREA_SIZE};
 use crate::isa::s390x::inst::{
-    gpr, stack_reg, writable_gpr, zero_reg, CallIndInfo, CallInfo, Cond, Inst as MInst, MemArg,
-    MemArgPair, SymbolReloc, UImm12, UImm16Shifted, UImm32Shifted,
+    gpr, stack_reg, writable_gpr, zero_reg, CallIndInfo, CallInfo, Cond, Inst as MInst, LaneOrder,
+    MemArg, MemArgPair, SymbolReloc, UImm12, UImm16Shifted, UImm32Shifted,
 };
 use crate::isa::s390x::settings::Flags as IsaFlags;
 use crate::machinst::isle::*;
@@ -102,6 +102,10 @@ where
         ABISig::from_func_sig::<S390xMachineDeps>(sig, self.flags).unwrap()
     }
 
+    fn abi_lane_order(&mut self, abi: &ABISig) -> LaneOrder {
+        lane_order_for_call_conv(abi.call_conv())
+    }
+
     fn abi_accumulate_outgoing_args_size(&mut self, abi: &ABISig) -> Unit {
         let off = abi.sized_stack_arg_space() + abi.sized_stack_ret_space();
         self.lower_ctx
@@ -405,9 +409,36 @@ where
         UImm16Shifted::maybe_from_u64(n)
     }
 
+    #[inline]
+    fn lane_order(&mut self) -> Option<LaneOrder> {
+        Some(lane_order_for_call_conv(self.lower_ctx.abi().call_conv()))
+    }
+
     #[inline]
     fn be_lane_idx(&mut self, ty: Type, idx: u8) -> u8 {
-        ty.lane_count() as u8 - 1 - idx
+        match self.lane_order().unwrap() {
+            LaneOrder::LittleEndian => ty.lane_count() as u8 - 1 - idx,
+            LaneOrder::BigEndian => idx,
+        }
     }
 
+    #[inline]
+    fn be_vec_const(&mut self, ty: Type, n: u128) -> u128 {
+        match self.lane_order().unwrap() {
+            LaneOrder::LittleEndian => n,
+            LaneOrder::BigEndian => {
+                let lane_count = ty.lane_count();
+                let lane_bits = ty.lane_bits();
+                let lane_mask = (1u128 << lane_bits) - 1;
+                let mut n_le = n;
+                let mut n_be = 0u128;
+                for _ in 0..lane_count {
+                    n_be = (n_be << lane_bits) | (n_le & lane_mask);
+                    n_le = n_le >> lane_bits;
+                }
+                n_be
+            }
+        }
+    }
+
     #[inline]
@@ -419,17 +450,19 @@ where
     #[inline]
     fn shuffle_mask_from_u128(&mut self, idx: u128) -> (u128, u16) {
-        let bytes = idx.to_be_bytes();
+        let bytes = match self.lane_order().unwrap() {
+            LaneOrder::LittleEndian => idx.to_be_bytes().map(|x| {
+                if x < 16 {
+                    15 - x
+                } else if x < 32 {
+                    47 - x
+                } else {
+                    128
+                }
+            }),
+            LaneOrder::BigEndian => idx.to_le_bytes().map(|x| if x < 32 { x } else { 128 }),
+        };
         let and_mask = bytes.iter().fold(0, |acc, &x| (acc << 1) | (x < 32) as u16);
-        let bytes = bytes.map(|x| {
-            if x < 16 {
-                15 - x
-            } else if x < 32 {
-                47 - x
-            } else {
-                128
-            }
-        });
         let permute_mask = u128::from_be_bytes(bytes);
         (permute_mask, and_mask)
     }
 
@@ -813,6 +846,16 @@ where
     }
 }
 
+/// Lane order to be used for a given calling convention.
+#[inline]
+fn lane_order_for_call_conv(call_conv: CallConv) -> LaneOrder {
+    if call_conv.extends_wasmtime() {
+        LaneOrder::LittleEndian
+    } else {
+        LaneOrder::BigEndian
+    }
+}
+
 /// Zero-extend the low `from_bits` bits of `value` to a full u64.
 #[inline]
 fn zero_extend_to_u64(value: u64, from_bits: u8) -> u64 {