s390x: Support both big- and little-endian vector lane order (#4682)

This implements the s390x back-end portion of the solution for
https://github.com/bytecodealliance/wasmtime/issues/4566

We now support both big- and little-endian vector lane order
in code generation.  The order used for a function is determined
by the function's ABI: a function with a Wasmtime calling
convention uses little-endian lane order; any other function
uses big-endian lane order.  (This ensures that all raw_bitcast
instructions generated both by wasmtime and by other cranelift
frontends can always be implemented as a no-op.)
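
Concretely, the choice reduces to a single predicate on the
calling convention; this is exactly the lane_order_for_call_conv
helper added in the diff below:

    fn lane_order_for_call_conv(call_conv: CallConv) -> LaneOrder {
        if call_conv.extends_wasmtime() {
            LaneOrder::LittleEndian
        } else {
            LaneOrder::BigEndian
        }
    }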

Lane order affects the implementation of a number of operations:
- Vector immediates
- Vector memory load / store (in big- and little-endian variants)
- Operations explicitly using lane numbers
  (insertlane, extractlane, shuffle, swizzle); see the
  index-remapping sketch after this list
- Operations implicitly using lane numbers
  (iadd_pairwise, narrow/widen, promote/demote, fcvt_low, vhigh_bits)
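
For the explicit-lane-number operations, the CLIF lane index
must be remapped to the machine's lane numbering (which counts
from the most-significant element) whenever the function uses
little-endian lane order.  A sketch of the remapping, matching
the be_lane_idx helper in the diff:

    // Map a CLIF lane index to the hardware's big-endian numbering.
    fn be_lane_idx(lane_count: u8, lane_order: LaneOrder, idx: u8) -> u8 {
        match lane_order {
            LaneOrder::LittleEndian => lane_count - 1 - idx,
            LaneOrder::BigEndian => idx,
        }
    }

For example, extracting lane 1 of an i32x4 value addresses
hardware lane 4 - 1 - 1 = 2 under little-endian lane order.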

In addition, when calling a function using a different lane order,
we need to lane-swap all vector values passed or returned in registers.
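
Conceptually, this fix-up is a full lane reversal of each
128-bit vector value.  A scalar model of the transformation
(the back-end emits a vector permute rather than this loop;
the same loop also underlies be_vec_const in the diff):

    // Reverse the order of lane_count lanes of lane_bits bits each
    // within a 128-bit value (lane_bits is at most 64 for vectors).
    fn lane_swap(n: u128, lane_count: u32, lane_bits: u32) -> u128 {
        let lane_mask = (1u128 << lane_bits) - 1;
        let (mut src, mut dst) = (n, 0u128);
        for _ in 0..lane_count {
            dst = (dst << lane_bits) | (src & lane_mask);
            src >>= lane_bits;
        }
        dst
    }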

A small number of changes to common code were also needed:

- Ensure we always select a Wasmtime calling convention on s390x
  in crates/cranelift (func_signature).

- Fix vector immediates for filetests/runtests.  In PR #4427,
  I attempted to fix this by byte-swapping the V128 value, but
  with the new scheme, we'd instead need to perform a per-lane
  byte swap.  Since we do not know the actual type in write_to_slice
and read_from_slice, this isn't easily possible (see the
illustration at the end of this list).

  Revert this part of PR #4427 again, and instead just mark the
  memory buffer as little-endian when emitting the trampoline;
  the back-end will then emit correct code to load the constant.

- Change a runtest in simd-bitselect-to-vselect.clif to no longer
  make little-endian lane order assumptions.

- Remove runtests in simd-swizzle.clif that make little-endian
  lane order assumptions by relying on implicit type conversion
  when using a non-i16x8 swizzle result type (this feature should
  probably be removed anyway).
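
To illustrate the write_to_slice / read_from_slice point above:
for an i32x4 constant, byte-swapping the whole V128 value is not
the same as byte-swapping each lane, and the per-lane variant
cannot be computed without knowing the lane width.  A minimal
demonstration:

    // i32x4 constant [1, 2, 3, 4] as its 16 little-endian bytes.
    let le = 0x00000004_00000003_00000002_00000001u128.to_le_bytes();

    // Whole-value byte swap: reverses the lane order as well as
    // the bytes within each lane.
    let mut whole = le;
    whole.reverse();

    // Per-lane byte swap: keeps the lane order, but requires the
    // lane width (4 bytes here) to be known.
    let mut per_lane = le;
    for lane in per_lane.chunks_mut(4) {
        lane.reverse();
    }

    assert_ne!(whole, per_lane);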

Tested with both wasmtime and cg_clif.
Author:  Ulrich Weigand
Date:    2022-08-11 21:10:46 +02:00
Commit:  67870d1518 (parent: c1c48b4386)
Changes: 29 changed files with 6584 additions and 593 deletions

File: cranelift/codegen/src/isa/s390x/lower/isle.rs

@@ -6,8 +6,8 @@ pub mod generated_code;
 // Types that the generated ISLE code uses via `use super::*`.
 use crate::isa::s390x::abi::{S390xMachineDeps, REG_SAVE_AREA_SIZE};
 use crate::isa::s390x::inst::{
-    gpr, stack_reg, writable_gpr, zero_reg, CallIndInfo, CallInfo, Cond, Inst as MInst, MemArg,
-    MemArgPair, SymbolReloc, UImm12, UImm16Shifted, UImm32Shifted,
+    gpr, stack_reg, writable_gpr, zero_reg, CallIndInfo, CallInfo, Cond, Inst as MInst, LaneOrder,
+    MemArg, MemArgPair, SymbolReloc, UImm12, UImm16Shifted, UImm32Shifted,
 };
 use crate::isa::s390x::settings::Flags as IsaFlags;
 use crate::machinst::isle::*;
@@ -102,6 +102,10 @@ where
         ABISig::from_func_sig::<S390xMachineDeps>(sig, self.flags).unwrap()
     }
 
+    fn abi_lane_order(&mut self, abi: &ABISig) -> LaneOrder {
+        lane_order_for_call_conv(abi.call_conv())
+    }
+
     fn abi_accumulate_outgoing_args_size(&mut self, abi: &ABISig) -> Unit {
         let off = abi.sized_stack_arg_space() + abi.sized_stack_ret_space();
         self.lower_ctx
@@ -405,9 +409,36 @@ where
         UImm16Shifted::maybe_from_u64(n)
     }
 
+    #[inline]
+    fn lane_order(&mut self) -> Option<LaneOrder> {
+        Some(lane_order_for_call_conv(self.lower_ctx.abi().call_conv()))
+    }
+
     #[inline]
     fn be_lane_idx(&mut self, ty: Type, idx: u8) -> u8 {
-        ty.lane_count() as u8 - 1 - idx
+        match self.lane_order().unwrap() {
+            LaneOrder::LittleEndian => ty.lane_count() as u8 - 1 - idx,
+            LaneOrder::BigEndian => idx,
+        }
     }
 
+    #[inline]
+    fn be_vec_const(&mut self, ty: Type, n: u128) -> u128 {
+        match self.lane_order().unwrap() {
+            LaneOrder::LittleEndian => n,
+            LaneOrder::BigEndian => {
+                let lane_count = ty.lane_count();
+                let lane_bits = ty.lane_bits();
+                let lane_mask = (1u128 << lane_bits) - 1;
+                let mut n_le = n;
+                let mut n_be = 0u128;
+                for _ in 0..lane_count {
+                    n_be = (n_be << lane_bits) | (n_le & lane_mask);
+                    n_le = n_le >> lane_bits;
+                }
+                n_be
+            }
+        }
+    }
+
     #[inline]
@@ -419,17 +450,19 @@ where
     #[inline]
     fn shuffle_mask_from_u128(&mut self, idx: u128) -> (u128, u16) {
-        let bytes = idx.to_be_bytes();
+        let bytes = match self.lane_order().unwrap() {
+            LaneOrder::LittleEndian => idx.to_be_bytes().map(|x| {
+                if x < 16 {
+                    15 - x
+                } else if x < 32 {
+                    47 - x
+                } else {
+                    128
+                }
+            }),
+            LaneOrder::BigEndian => idx.to_le_bytes().map(|x| if x < 32 { x } else { 128 }),
+        };
         let and_mask = bytes.iter().fold(0, |acc, &x| (acc << 1) | (x < 32) as u16);
-        let bytes = bytes.map(|x| {
-            if x < 16 {
-                15 - x
-            } else if x < 32 {
-                47 - x
-            } else {
-                128
-            }
-        });
         let permute_mask = u128::from_be_bytes(bytes);
         (permute_mask, and_mask)
     }
 
@@ -813,6 +846,16 @@ where
     }
 }
 
+/// Lane order to be used for a given calling convention.
+#[inline]
+fn lane_order_for_call_conv(call_conv: CallConv) -> LaneOrder {
+    if call_conv.extends_wasmtime() {
+        LaneOrder::LittleEndian
+    } else {
+        LaneOrder::BigEndian
+    }
+}
+
 /// Zero-extend the low `from_bits` bits of `value` to a full u64.
 #[inline]
 fn zero_extend_to_u64(value: u64, from_bits: u8) -> u64 {