s390x: Support both big- and little-endian vector lane order (#4682)
This implements the s390x back-end portion of the solution for https://github.com/bytecodealliance/wasmtime/issues/4566.

We now support both big- and little-endian vector lane order in code generation. The order used for a function is determined by the function's ABI: functions using a Wasmtime ABI use little-endian lane order; all others use big-endian lane order. (This ensures that all raw_bitcast instructions generated by both wasmtime and other cranelift frontends can always be implemented as a no-op.)

Lane order affects the implementation of a number of operations:
- Vector immediates
- Vector memory load / store (in big- and little-endian variants)
- Operations explicitly using lane numbers (insertlane, extractlane, shuffle, swizzle)
- Operations implicitly using lane numbers (iadd_pairwise, narrow/widen, promote/demote, fcvt_low, vhigh_bits)

In addition, when calling a function that uses a different lane order, we need to lane-swap all vector values passed or returned in registers.

A small number of changes to common code were also needed:
- Ensure we always select a Wasmtime calling convention on s390x in crates/cranelift (func_signature).
- Fix vector immediates for filetests/runtests. In PR #4427, I attempted to fix this by byte-swapping the V128 value, but with the new scheme we would instead need to perform a per-lane byte swap. Since we do not know the actual type in write_to_slice and read_from_slice, this is not easily possible. Revert that part of PR #4427, and instead just mark the memory buffer as little-endian when emitting the trampoline; the back-end will then emit correct code to load the constant.
- Change a runtest in simd-bitselect-to-vselect.clif to no longer make little-endian lane-order assumptions.
- Remove runtests in simd-swizzle.clif that make little-endian lane-order assumptions by relying on implicit type conversion when using a non-i16x8 swizzle result type (a feature that should probably be removed anyway).

Tested with both wasmtime and cg_clif.
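To make the lane-numbering issue concrete before the diff: the s390x vector ISA numbers element 0 as the most significant (leftmost) element, so under little-endian lane order a CLIF lane index must be mirrored before it can be used in a vector instruction. The following standalone sketch restates that mapping with illustrative names (hw_element_number is hypothetical; the real helper is the be_lane_idx change in the diff below):

    /// Vector lane order, mirroring the LaneOrder type used in the patch.
    #[derive(Clone, Copy, PartialEq, Eq)]
    enum LaneOrder {
        LittleEndian,
        BigEndian,
    }

    /// Illustrative mapping from a CLIF lane index to the hardware element
    /// number (the hardware calls the most significant element number 0).
    fn hw_element_number(order: LaneOrder, lane_count: u8, idx: u8) -> u8 {
        match order {
            LaneOrder::LittleEndian => lane_count - 1 - idx,
            LaneOrder::BigEndian => idx,
        }
    }

    fn main() {
        // In an i32x4 vector, CLIF lane 0 is hardware element 3 under
        // little-endian lane order, but element 0 under big-endian order.
        assert_eq!(hw_element_number(LaneOrder::LittleEndian, 4, 0), 3);
        assert_eq!(hw_element_number(LaneOrder::BigEndian, 4, 0), 0);
    }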
@@ -6,8 +6,8 @@ pub mod generated_code;
 // Types that the generated ISLE code uses via `use super::*`.
 use crate::isa::s390x::abi::{S390xMachineDeps, REG_SAVE_AREA_SIZE};
 use crate::isa::s390x::inst::{
-    gpr, stack_reg, writable_gpr, zero_reg, CallIndInfo, CallInfo, Cond, Inst as MInst, MemArg,
-    MemArgPair, SymbolReloc, UImm12, UImm16Shifted, UImm32Shifted,
+    gpr, stack_reg, writable_gpr, zero_reg, CallIndInfo, CallInfo, Cond, Inst as MInst, LaneOrder,
+    MemArg, MemArgPair, SymbolReloc, UImm12, UImm16Shifted, UImm32Shifted,
 };
 use crate::isa::s390x::settings::Flags as IsaFlags;
 use crate::machinst::isle::*;
@@ -102,6 +102,10 @@ where
         ABISig::from_func_sig::<S390xMachineDeps>(sig, self.flags).unwrap()
     }
 
+    fn abi_lane_order(&mut self, abi: &ABISig) -> LaneOrder {
+        lane_order_for_call_conv(abi.call_conv())
+    }
+
     fn abi_accumulate_outgoing_args_size(&mut self, abi: &ABISig) -> Unit {
         let off = abi.sized_stack_arg_space() + abi.sized_stack_ret_space();
         self.lower_ctx
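The abi_lane_order helper above exposes the callee's lane order to the ISLE lowering rules. Conceptually, a lane swap around a call is needed exactly when caller and callee disagree; this one-line sketch (reusing the illustrative LaneOrder enum from the sketch above, not the patch's actual lowering code) states the decision:

    /// Vector values passed or returned in registers must be lane-swapped
    /// exactly when the caller's and callee's lane orders differ.
    fn call_needs_lane_swap(caller: LaneOrder, callee: LaneOrder) -> bool {
        caller != callee
    }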
@@ -405,9 +409,36 @@ where
         UImm16Shifted::maybe_from_u64(n)
     }
 
+    #[inline]
+    fn lane_order(&mut self) -> Option<LaneOrder> {
+        Some(lane_order_for_call_conv(self.lower_ctx.abi().call_conv()))
+    }
+
     #[inline]
     fn be_lane_idx(&mut self, ty: Type, idx: u8) -> u8 {
-        ty.lane_count() as u8 - 1 - idx
+        match self.lane_order().unwrap() {
+            LaneOrder::LittleEndian => ty.lane_count() as u8 - 1 - idx,
+            LaneOrder::BigEndian => idx,
+        }
     }
 
+    #[inline]
+    fn be_vec_const(&mut self, ty: Type, n: u128) -> u128 {
+        match self.lane_order().unwrap() {
+            LaneOrder::LittleEndian => n,
+            LaneOrder::BigEndian => {
+                let lane_count = ty.lane_count();
+                let lane_bits = ty.lane_bits();
+                let lane_mask = (1u128 << lane_bits) - 1;
+                let mut n_le = n;
+                let mut n_be = 0u128;
+                for _ in 0..lane_count {
+                    n_be = (n_be << lane_bits) | (n_le & lane_mask);
+                    n_le = n_le >> lane_bits;
+                }
+                n_be
+            }
+        }
+    }
+
     #[inline]
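To see what the new be_vec_const computes, here is a worked example (a standalone sketch of the same loop, extracted from the method for illustration): each iteration pops the lowest lane off the little-endian constant and pushes it onto the big-endian result, so whole lanes trade places while the bytes inside each lane stay put.

    /// Reverse the lane order of a 128-bit vector constant; the byte order
    /// within each lane is unchanged. Assumes lane_bits < 128.
    fn swap_lanes(n: u128, lane_count: u32, lane_bits: u32) -> u128 {
        let lane_mask = (1u128 << lane_bits) - 1;
        let (mut n_le, mut n_be) = (n, 0u128);
        for _ in 0..lane_count {
            n_be = (n_be << lane_bits) | (n_le & lane_mask);
            n_le >>= lane_bits;
        }
        n_be
    }

    fn main() {
        // An i64x2 constant: the two 64-bit lanes swap positions.
        let n = 0x2222_2222_2222_2222_1111_1111_1111_1111u128;
        assert_eq!(
            swap_lanes(n, 2, 64),
            0x1111_1111_1111_1111_2222_2222_2222_2222u128
        );
    }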
@@ -419,17 +450,19 @@ where
 
     #[inline]
     fn shuffle_mask_from_u128(&mut self, idx: u128) -> (u128, u16) {
-        let bytes = idx.to_be_bytes();
+        let bytes = match self.lane_order().unwrap() {
+            LaneOrder::LittleEndian => idx.to_be_bytes().map(|x| {
+                if x < 16 {
+                    15 - x
+                } else if x < 32 {
+                    47 - x
+                } else {
+                    128
+                }
+            }),
+            LaneOrder::BigEndian => idx.to_le_bytes().map(|x| if x < 32 { x } else { 128 }),
+        };
         let and_mask = bytes.iter().fold(0, |acc, &x| (acc << 1) | (x < 32) as u16);
-        let bytes = bytes.map(|x| {
-            if x < 16 {
-                15 - x
-            } else if x < 32 {
-                47 - x
-            } else {
-                128
-            }
-        });
         let permute_mask = u128::from_be_bytes(bytes);
         (permute_mask, and_mask)
     }
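A note on the two masks computed above (my reading of the code, not text from the patch): bytes holds one hardware byte index per result byte, with 128 as an out-of-range sentinel, and and_mask records one validity bit per result byte, most significant byte first. The lowering can then AND the permute result with that mask so that sentinel bytes become zero, which the permute alone would not guarantee since selector bytes are interpreted modulo the 32-byte source table. A small standalone check of the fold that builds the 16-bit mask:

    /// Each of the 16 result bytes contributes one bit, MSB first; indices
    /// of 32 or more (the sentinel) yield a zero bit.
    fn and_mask(bytes: &[u8; 16]) -> u16 {
        bytes.iter().fold(0, |acc, &x| (acc << 1) | (x < 32) as u16)
    }

    fn main() {
        // An identity shuffle keeps every byte, so all 16 bits are set.
        let identity: [u8; 16] = core::array::from_fn(|i| i as u8);
        assert_eq!(and_mask(&identity), 0xffff);
        // Marking the last result byte out of range clears bit 0.
        let mut partial = identity;
        partial[15] = 128;
        assert_eq!(and_mask(&partial), 0xfffe);
    }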
@@ -813,6 +846,16 @@ where
     }
 }
 
+/// Lane order to be used for a given calling convention.
+#[inline]
+fn lane_order_for_call_conv(call_conv: CallConv) -> LaneOrder {
+    if call_conv.extends_wasmtime() {
+        LaneOrder::LittleEndian
+    } else {
+        LaneOrder::BigEndian
+    }
+}
+
 /// Zero-extend the low `from_bits` bits of `value` to a full u64.
 #[inline]
 fn zero_extend_to_u64(value: u64, from_bits: u8) -> u64 {
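Finally, a usage note on lane_order_for_call_conv: Wasmtime calling conventions get little-endian lane order, matching wasm's expectations, while the default conventions used by frontends such as cg_clif keep the natural big-endian order. Two hypothetical spot checks (assuming the CallConv variants Cranelift had at the time; matches! avoids assuming LaneOrder implements PartialEq):

    // Wasmtime ABIs select little-endian lane order...
    assert!(matches!(
        lane_order_for_call_conv(CallConv::WasmtimeSystemV),
        LaneOrder::LittleEndian
    ));
    // ...while the plain SystemV ABI keeps big-endian lane order.
    assert!(matches!(
        lane_order_for_call_conv(CallConv::SystemV),
        LaneOrder::BigEndian
    ));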