s390x: Support both big- and little-endian vector lane order (#4682)

This implements the s390x back-end portion of the solution for
https://github.com/bytecodealliance/wasmtime/issues/4566

We now support both big- and little-endian vector lane order
in code generation.  The order used for a function is determined
by the function's ABI: functions using a Wasmtime ABI use
little-endian lane order, and all others use big-endian lane order.
(This ensures that all raw_bitcast instructions generated both by
Wasmtime and by other Cranelift frontends can always be implemented
as a no-op.)
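
As an illustration only (not the actual backend code; the real hook is
the lane_order extern constructor in the s390x ISLE glue), the choice
can be thought of as:

    // Hypothetical sketch -- names here are illustrative assumptions.
    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    enum LaneOrder {
        LittleEndian,
        BigEndian,
    }

    fn lane_order_for_abi(is_wasmtime_abi: bool) -> LaneOrder {
        if is_wasmtime_abi {
            // Wasmtime ABIs follow Wasm's little-endian lane numbering,
            // keeping raw_bitcast a no-op for Wasm-generated code.
            LaneOrder::LittleEndian
        } else {
            // Other ABIs (e.g. cg_clif) keep the natural big-endian
            // lane order of the s390x vector registers.
            LaneOrder::BigEndian
        }
    }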

Lane order affects the implementation of a number of operations:
- Vector immediates
- Vector memory load / store (in big- and little-endian variants)
- Operations explicitly using lane numbers
  (insertlane, extractlane, shuffle, swizzle)
- Operations implicitly using lane numbers
  (iadd_pairwise, narrow/widen, promote/demote, fcvt_low, vhigh_bits)

In addition, when calling a function using a different lane order,
we need to lane-swap all vector values passed or returned in registers.
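
Conceptually (a sketch, not the backend implementation -- the actual
lowering is the vec_elt_rev ISLE helper), the swap reverses the order
of the lanes while leaving each lane's bytes intact:

    /// Illustrative model of the ABI lane swap: lane i moves to lane
    /// (N - 1 - i); the contents of each element are unchanged.
    fn lane_swap<T, const N: usize>(v: [T; N]) -> [T; N] {
        let mut out = v;
        out.reverse();
        out
    }

    // e.g. an i32x4 value [a, b, c, d] passed to a callee using the
    // other lane order is handed over as [d, c, b, a].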

A small number of changes to common code were also needed:

- Ensure we always select a Wasmtime calling convention on s390x
  in crates/cranelift (func_signature).

- Fix vector immediates for filetests/runtests.  In PR #4427,
I attempted to fix this by byte-swapping the V128 value, but
with the new scheme, we'd instead need to perform a per-lane
byte swap (see the sketch after this list).  Since we do not know
the actual type in write_to_slice and read_from_slice, this isn't
easily possible.

  Revert this part of PR #4427 again, and instead just mark the
  memory buffer as little-endian when emitting the trampoline;
  the back-end will then emit correct code to load the constant.

- Change a runtest in simd-bitselect-to-vselect.clif to no longer
  make little-endian lane order assumptions.

- Remove runtests in simd-swizzle.clif that make little-endian
  lane order assumptions by relying on implicit type conversion
when using a non-i8x16 swizzle result type (this feature should
  probably be removed anyway).
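
To illustrate the per-lane vs. whole-value byte swap issue above (a
sketch assuming an i32x4 constant; not code from this change):

    fn swap_u128(bytes: [u8; 16]) -> [u8; 16] {
        // Whole-value byte swap, as attempted in PR #4427.
        u128::from_le_bytes(bytes).to_be_bytes()
    }

    fn swap_per_lane_i32x4(bytes: [u8; 16]) -> [u8; 16] {
        // Per-lane byte swap; requires knowing the lane width, which
        // write_to_slice / read_from_slice do not have.
        let mut out = bytes;
        for lane in out.chunks_exact_mut(4) {
            lane.reverse();
        }
        out
    }

    // For [1, 2, 3, 4] as an i32x4 the two results differ: the
    // whole-value swap also reverses the order of the lanes, while
    // the per-lane swap does not.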

Tested with both wasmtime and cg_clif.
Author:    Ulrich Weigand
Date:      2022-08-11 21:10:46 +02:00
Committer: GitHub
Parent:    c1c48b4386
Commit:    67870d1518

29 changed files with 6584 additions and 593 deletions


@@ -91,8 +91,8 @@ impl DataValue {
DataValue::I128(i) => dst[..16].copy_from_slice(&i.to_ne_bytes()[..]),
DataValue::F32(f) => dst[..4].copy_from_slice(&f.bits().to_ne_bytes()[..]),
DataValue::F64(f) => dst[..8].copy_from_slice(&f.bits().to_ne_bytes()[..]),
DataValue::V128(v) => dst[..16].copy_from_slice(&u128::from_le_bytes(*v).to_ne_bytes()),
DataValue::V64(v) => dst[..8].copy_from_slice(&u64::from_le_bytes(*v).to_ne_bytes()),
DataValue::V128(v) => dst[..16].copy_from_slice(&v[..]),
DataValue::V64(v) => dst[..8].copy_from_slice(&v[..]),
_ => unimplemented!(),
};
}
@@ -124,11 +124,9 @@ impl DataValue {
}
_ if ty.is_vector() => {
if ty.bytes() == 16 {
DataValue::V128(
u128::from_ne_bytes(src[..16].try_into().unwrap()).to_le_bytes(),
)
DataValue::V128(src[..16].try_into().unwrap())
} else if ty.bytes() == 8 {
DataValue::V64(u64::from_ne_bytes(src[..8].try_into().unwrap()).to_le_bytes())
DataValue::V64(src[..8].try_into().unwrap())
} else {
unimplemented!()
}


@@ -635,6 +635,36 @@
(rd WritableReg)
(mem MemArg))
;; 8x16-bit byte-reversed vector load instruction.
(VecLoadByte16Rev
(rd WritableReg)
(mem MemArg))
;; 4x32-bit byte-reversed vector load instruction.
(VecLoadByte32Rev
(rd WritableReg)
(mem MemArg))
;; 2x64-bit byte-reversed vector load instruction.
(VecLoadByte64Rev
(rd WritableReg)
(mem MemArg))
;; 8x16-bit element-reversed vector load instruction.
(VecLoadElt16Rev
(rd WritableReg)
(mem MemArg))
;; 4x32-bit element-reversed vector load instruction.
(VecLoadElt32Rev
(rd WritableReg)
(mem MemArg))
;; 2x64-bit element-reversed vector load instruction.
(VecLoadElt64Rev
(rd WritableReg)
(mem MemArg))
;; 128-bit vector store instruction.
(VecStore
(rd Reg)
@@ -645,6 +675,36 @@
(rd Reg)
(mem MemArg))
;; 8x16-bit byte-reversed vector store instruction.
(VecStoreByte16Rev
(rd Reg)
(mem MemArg))
;; 4x32-bit byte-reversed vector store instruction.
(VecStoreByte32Rev
(rd Reg)
(mem MemArg))
;; 2x64-bit byte-reversed vector store instruction.
(VecStoreByte64Rev
(rd Reg)
(mem MemArg))
;; 8x16-bit element-reversed vector store instruction.
(VecStoreElt16Rev
(rd Reg)
(mem MemArg))
;; 4x32-bit element-reversed vector store instruction.
(VecStoreElt32Rev
(rd Reg)
(mem MemArg))
;; 2x64-bit element-reversed vector store instruction.
(VecStoreElt64Rev
(rd Reg)
(mem MemArg))
;; 128-bit vector load replicated element instruction.
(VecLoadReplicate
(size u32)
@@ -1350,6 +1410,51 @@
(extern extractor allow_div_traps allow_div_traps)
;; Helpers for SIMD lane number operations ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; There are two ways to map vector types onto the SIMD vector registers
;; supported by the ISA, differing by the way lanes are numbered. In
;; little-endian lane order, lane 0 of a multi-lane vector value resides
;; in the least-significant parts of a vector register (when interpreted
;; as holding a single $I128 value); in big-endian lane order, lane 0
;; instead resides in the most-significant parts of the register.
;;
;; As long as it is used consistently, Cranelift output may use either
;; lane order to implement CLIF semantics.  However, depending on the
;; particular use case, one or the other order will lead to more
;; efficient code.  Therefore this back end supports both code
;; generation options.
;;
;; Note that the ISA instructions use immediate lane numbers according
;; to big-endian lane order; so when using little-endian lane order,
;; immediate lane numbers have to be translated.
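;;
;; For example, consider an $I32X4 value with lanes <a, b, c, d> held in
;; a vector register viewed as a single $I128: in big-endian lane order,
;; lane 0 (a) occupies bits 127..96; in little-endian lane order, lane 0
;; occupies bits 31..0.  Hence, with little-endian lane order, CLIF lane
;; i corresponds to ISA element number (lane_count - 1 - i), e.g. lane 1
;; of an $I32X4 becomes ISA element 2.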
(type LaneOrder
(enum
(LittleEndian)
(BigEndian)))
;; Return the lane order to be used when compiling the current function.
;; This will be a property of the function ABI.  Functions using the
;; Wasmtime ABI will use little-endian lane order; functions using
;; other ABIs will use big-endian lane order.
(decl pure lane_order () LaneOrder)
(extern constructor lane_order lane_order)
;; Check whether two lane order values are equal.
(decl pure lane_order_equal (LaneOrder LaneOrder) bool)
(rule (lane_order_equal (LaneOrder.LittleEndian) (LaneOrder.LittleEndian)) $true)
(rule (lane_order_equal (LaneOrder.LittleEndian) (LaneOrder.BigEndian)) $false)
(rule (lane_order_equal (LaneOrder.BigEndian) (LaneOrder.LittleEndian)) $false)
(rule (lane_order_equal (LaneOrder.BigEndian) (LaneOrder.BigEndian)) $true)
;; Convert a CLIF immediate lane index value to big-endian lane order.
(decl be_lane_idx (Type u8) u8)
(extern constructor be_lane_idx be_lane_idx)
;; Convert a CLIF immediate vector constant to big-endian lane order.
(decl be_vec_const (Type u128) u128)
(extern constructor be_vec_const be_vec_const)
;; Helpers for register numbers and types ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Hard-coded registers.
@@ -1441,11 +1546,6 @@
(u32_pair (u16_pair (u8_pair i j) (u8_pair k l))
(u16_pair (u8_pair m n) (u8_pair o p)))))
;; Convert a little-endian lane index to a big-endian lane index.
(decl be_lane_idx (Type u8) u8)
(extern constructor be_lane_idx be_lane_idx)
;; Construct a VGBM mask to set all bits in one lane of a vector.
(decl lane_byte_mask (Type u8) u16)
@@ -2298,6 +2398,48 @@
(_ Unit (emit (MInst.VecLoadRev dst addr))))
dst))
;; Helper for emitting `MInst.VecLoadByte16Rev` instructions.
(decl vec_load_byte16rev (Type MemArg) Reg)
(rule (vec_load_byte16rev ty addr)
(let ((dst WritableReg (temp_writable_reg ty))
(_ Unit (emit (MInst.VecLoadByte16Rev dst addr))))
dst))
;; Helper for emitting `MInst.VecLoadByte32Rev` instructions.
(decl vec_load_byte32rev (Type MemArg) Reg)
(rule (vec_load_byte32rev ty addr)
(let ((dst WritableReg (temp_writable_reg ty))
(_ Unit (emit (MInst.VecLoadByte32Rev dst addr))))
dst))
;; Helper for emitting `MInst.VecLoadByte64Rev` instructions.
(decl vec_load_byte64rev (Type MemArg) Reg)
(rule (vec_load_byte64rev ty addr)
(let ((dst WritableReg (temp_writable_reg ty))
(_ Unit (emit (MInst.VecLoadByte64Rev dst addr))))
dst))
;; Helper for emitting `MInst.VecLoadElt16Rev` instructions.
(decl vec_load_elt16rev (Type MemArg) Reg)
(rule (vec_load_elt16rev ty addr)
(let ((dst WritableReg (temp_writable_reg ty))
(_ Unit (emit (MInst.VecLoadElt16Rev dst addr))))
dst))
;; Helper for emitting `MInst.VecLoadElt32Rev` instructions.
(decl vec_load_elt32rev (Type MemArg) Reg)
(rule (vec_load_elt32rev ty addr)
(let ((dst WritableReg (temp_writable_reg ty))
(_ Unit (emit (MInst.VecLoadElt32Rev dst addr))))
dst))
;; Helper for emitting `MInst.VecLoadElt64Rev` instructions.
(decl vec_load_elt64rev (Type MemArg) Reg)
(rule (vec_load_elt64rev ty addr)
(let ((dst WritableReg (temp_writable_reg ty))
(_ Unit (emit (MInst.VecLoadElt64Rev dst addr))))
dst))
;; Helper for emitting `MInst.VecStore` instructions.
(decl vec_store (Reg MemArg) SideEffectNoResult)
(rule (vec_store src addr)
@@ -2308,6 +2450,36 @@
(rule (vec_storerev src addr)
(SideEffectNoResult.Inst (MInst.VecStoreRev src addr)))
;; Helper for emitting `MInst.VecStoreByte16Rev` instructions.
(decl vec_store_byte16rev (Reg MemArg) SideEffectNoResult)
(rule (vec_store_byte16rev src addr)
(SideEffectNoResult.Inst (MInst.VecStoreByte16Rev src addr)))
;; Helper for emitting `MInst.VecStoreByte32Rev` instructions.
(decl vec_store_byte32rev (Reg MemArg) SideEffectNoResult)
(rule (vec_store_byte32rev src addr)
(SideEffectNoResult.Inst (MInst.VecStoreByte32Rev src addr)))
;; Helper for emitting `MInst.VecStoreByte64Rev` instructions.
(decl vec_store_byte64rev (Reg MemArg) SideEffectNoResult)
(rule (vec_store_byte64rev src addr)
(SideEffectNoResult.Inst (MInst.VecStoreByte64Rev src addr)))
;; Helper for emitting `MInst.VecStoreElt16Rev` instructions.
(decl vec_store_elt16rev (Reg MemArg) SideEffectNoResult)
(rule (vec_store_elt16rev src addr)
(SideEffectNoResult.Inst (MInst.VecStoreElt16Rev src addr)))
;; Helper for emitting `MInst.VecStoreElt32Rev` instructions.
(decl vec_store_elt32rev (Reg MemArg) SideEffectNoResult)
(rule (vec_store_elt32rev src addr)
(SideEffectNoResult.Inst (MInst.VecStoreElt32Rev src addr)))
;; Helper for emitting `MInst.VecStoreElt64Rev` instructions.
(decl vec_store_elt64rev (Reg MemArg) SideEffectNoResult)
(rule (vec_store_elt64rev src addr)
(SideEffectNoResult.Inst (MInst.VecStoreElt64Rev src addr)))
;; Helper for emitting `MInst.VecLoadReplicate` instructions.
(decl vec_load_replicate (Type MemArg) Reg)
(rule (vec_load_replicate (ty_vec128 ty @ (multi_lane size _)) addr)
@@ -2660,6 +2832,34 @@
(rule (emit_arg_load $F64 mem) (vec_load_lane_undef $F64X2 mem 0))
(rule (emit_arg_load (vr128_ty ty) mem) (vec_load ty mem))
;; Helper to perform a lane swap within a register.
(decl vec_elt_rev (Type Reg) Reg)
(rule (vec_elt_rev (multi_lane 64 2) reg)
(vec_permute_dw_imm $I64X2 reg 1 reg 0))
(rule (vec_elt_rev (multi_lane 32 4) reg)
(let ((rev Reg (vec_permute_dw_imm $I64X2 reg 1 reg 0)))
(vec_rot_imm $I64X2 rev 32)))
(rule (vec_elt_rev (multi_lane 16 8) reg)
(let ((rev Reg (vec_permute_dw_imm $I64X2 reg 1 reg 0)))
(vec_rot_imm $I32X4 (vec_rot_imm $I64X2 rev 32) 16)))
(rule (vec_elt_rev (multi_lane 8 16) reg)
(let ((rev Reg (vec_permute_dw_imm $I64X2 reg 1 reg 0)))
(vec_rot_imm $I16X8 (vec_rot_imm $I32X4 (vec_rot_imm $I64X2 rev 32) 16) 8)))
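;; For example, for (multi_lane 32 4) the doubleword permute turns
;; <a, b, c, d> into <c, d, a, b>, and the 32-bit rotate within each
;; doubleword then yields <d, c, b, a>.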
;; When passing a vector value in a register to a function whose ABI uses
;; a different lane order than the current function, we need to swap lanes.
;; The first operand is the lane order used by the callee.
(decl abi_vec_elt_rev (LaneOrder Type Reg) Reg)
(rule (abi_vec_elt_rev _ (gpr32_ty ty) reg) reg)
(rule (abi_vec_elt_rev _ (gpr64_ty ty) reg) reg)
(rule (abi_vec_elt_rev _ (ty_scalar_float ty) reg) reg)
(rule (abi_vec_elt_rev callee_lane_order _ reg)
(if-let $true (lane_order_equal callee_lane_order (lane_order)))
reg)
(rule (abi_vec_elt_rev callee_lane_order (vr128_ty ty) reg)
(if-let $false (lane_order_equal callee_lane_order (lane_order)))
(vec_elt_rev ty reg))
;; Helpers to emit a memory copy (MVC or memcpy libcall).
(decl emit_memcpy (MemArg MemArg u64) Unit)
(rule (emit_memcpy dst src (len_minus_one len))
@@ -2688,34 +2888,34 @@
;; Copy a single argument/return value to its slots.
;; For oversized arguments, set the slot to the buffer address.
(decl copy_to_arg (i64 ABIArg Value) Unit)
(rule (copy_to_arg base (abi_arg_only_slot slot) val)
(copy_val_to_arg_slot base slot val))
(rule (copy_to_arg base (abi_arg_struct_pointer slot offset _) _)
(decl copy_to_arg (LaneOrder i64 ABIArg Value) Unit)
(rule (copy_to_arg lo base (abi_arg_only_slot slot) val)
(copy_val_to_arg_slot lo base slot val))
(rule (copy_to_arg _ base (abi_arg_struct_pointer slot offset _) _)
(let ((ptr Reg (load_addr (memarg_stack_off base offset))))
(copy_reg_to_arg_slot base slot ptr)))
(rule (copy_to_arg base (abi_arg_implicit_pointer slot offset _) _)
(rule (copy_to_arg _ base (abi_arg_implicit_pointer slot offset _) _)
(let ((ptr Reg (load_addr (memarg_stack_off base offset))))
(copy_reg_to_arg_slot base slot ptr)))
;; Copy a single argument/return value from its slots.
(decl copy_from_arg (i64 ABIArg) ValueRegs)
(rule (copy_from_arg base (abi_arg_only_slot slot))
(value_reg (copy_reg_from_arg_slot base slot)))
(decl copy_from_arg (LaneOrder i64 ABIArg) ValueRegs)
(rule (copy_from_arg lo base (abi_arg_only_slot slot))
(value_reg (copy_reg_from_arg_slot lo base slot)))
;; Copy one component of an argument/return value to its slot.
(decl copy_val_to_arg_slot (i64 ABIArgSlot Value) Unit)
(rule (copy_val_to_arg_slot _ (ABIArgSlot.Reg reg ty (ArgumentExtension.None)) val)
(emit_mov ty (real_reg_to_writable_reg reg) val))
(rule (copy_val_to_arg_slot _ (ABIArgSlot.Reg reg _ (ArgumentExtension.Uext)) val)
(decl copy_val_to_arg_slot (LaneOrder i64 ABIArgSlot Value) Unit)
(rule (copy_val_to_arg_slot lo _ (ABIArgSlot.Reg reg ty (ArgumentExtension.None)) val)
(emit_mov ty (real_reg_to_writable_reg reg) (abi_vec_elt_rev lo ty val)))
(rule (copy_val_to_arg_slot _ _ (ABIArgSlot.Reg reg _ (ArgumentExtension.Uext)) val)
(emit_put_in_reg_zext64 (real_reg_to_writable_reg reg) val))
(rule (copy_val_to_arg_slot _ (ABIArgSlot.Reg reg _ (ArgumentExtension.Sext)) val)
(rule (copy_val_to_arg_slot _ _ (ABIArgSlot.Reg reg _ (ArgumentExtension.Sext)) val)
(emit_put_in_reg_sext64 (real_reg_to_writable_reg reg) val))
(rule (copy_val_to_arg_slot base (ABIArgSlot.Stack offset ty (ArgumentExtension.None)) val)
(rule (copy_val_to_arg_slot _ base (ABIArgSlot.Stack offset ty (ArgumentExtension.None)) val)
(emit_arg_store ty val (memarg_stack_off base offset)))
(rule (copy_val_to_arg_slot base (ABIArgSlot.Stack offset _ (ArgumentExtension.Uext)) val)
(rule (copy_val_to_arg_slot _ base (ABIArgSlot.Stack offset _ (ArgumentExtension.Uext)) val)
(emit_arg_store $I64 (put_in_reg_zext64 val) (memarg_stack_off base offset)))
(rule (copy_val_to_arg_slot base (ABIArgSlot.Stack offset _ (ArgumentExtension.Sext)) val)
(rule (copy_val_to_arg_slot _ base (ABIArgSlot.Stack offset _ (ArgumentExtension.Sext)) val)
(emit_arg_store $I64 (put_in_reg_sext64 val) (memarg_stack_off base offset)))
;; Copy one component of an argument/return value to its slot, where the
@@ -2727,10 +2927,10 @@
(emit_arg_store (abi_ext_ty ext ty) src (memarg_stack_off base offset)))
;; Copy one component of an argument/return value from its slot.
(decl copy_reg_from_arg_slot (i64 ABIArgSlot) Reg)
(rule (copy_reg_from_arg_slot _ (ABIArgSlot.Reg reg ty ext))
(copy_reg (abi_ext_ty ext ty) (real_reg_to_reg reg)))
(rule (copy_reg_from_arg_slot base (ABIArgSlot.Stack offset ty ext))
(decl copy_reg_from_arg_slot (LaneOrder i64 ABIArgSlot) Reg)
(rule (copy_reg_from_arg_slot lo _ (ABIArgSlot.Reg reg ty ext))
(abi_vec_elt_rev lo ty (copy_reg (abi_ext_ty ext ty) (real_reg_to_reg reg))))
(rule (copy_reg_from_arg_slot _ base (ABIArgSlot.Stack offset ty ext))
(emit_arg_load (abi_ext_ty ext ty) (memarg_stack_off base offset)))
;; Helper to compute the type of an implicitly extended argument/return value.
@@ -2882,13 +3082,6 @@
(rule (ty_ext64 $I32) $I64)
(rule (ty_ext64 $I64) $I64)
;; 128-bit vector type with lane type `Type`.
(decl ty_vec128_from_lane_ty (Type) Type)
(rule (ty_vec128_from_lane_ty $I8) $I8X16)
(rule (ty_vec128_from_lane_ty $I16) $I16X8)
(rule (ty_vec128_from_lane_ty $I32) $I32X4)
(rule (ty_vec128_from_lane_ty $I64) $I64X2)
;; Zero-extend a register from a smaller `Type` into a 32-bit destination. (Non-SSA form.)
;; This handles both integer and boolean input types.
(decl emit_zext32_reg (WritableReg Type Reg) Unit)
@@ -3440,6 +3633,9 @@
(decl abi_accumulate_outgoing_args_size (ABISig) Unit)
(extern constructor abi_accumulate_outgoing_args_size abi_accumulate_outgoing_args_size)
(decl abi_lane_order (ABISig) LaneOrder)
(extern constructor abi_lane_order abi_lane_order)
;; Helpers for generating calls to library routines ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -3525,6 +3721,72 @@
(decl vec_unpacku_high (Type Reg) Reg)
(rule (vec_unpacku_high ty x) (vec_rr ty (vecop_unpacku_high ty) x))
;; Versions of pack using current lane order semantics.
;; First source operand contains values that will end up in the
;; lower-numbered lanes of the result, second operand contains
;; values that will end up in the higher-numbered lanes.
(decl vec_pack_lane_order (Type Reg Reg) Reg)
(rule (vec_pack_lane_order ty x y)
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_pack ty x y))
(rule (vec_pack_lane_order ty x y)
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_pack ty y x))
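;; For example, when narrowing x (providing result lanes 0..3) and y
;; (providing result lanes 4..7): with big-endian lane order x must be
;; the left (most-significant) pack operand, while with little-endian
;; lane order it must be the right one, hence the swapped operands.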
(decl vec_pack_ssat_lane_order (Type Reg Reg) Reg)
(rule (vec_pack_ssat_lane_order ty x y)
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_pack_ssat ty x y))
(rule (vec_pack_ssat_lane_order ty x y)
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_pack_ssat ty y x))
(decl vec_pack_usat_lane_order (Type Reg Reg) Reg)
(rule (vec_pack_usat_lane_order ty x y)
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_pack_usat ty x y))
(rule (vec_pack_usat_lane_order ty x y)
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_pack_usat ty y x))
;; Versions of unpack using current lane order semantics.
;; unpack_low will consume values from the lower-numbered
;; lanes of the input, and unpack_high will consume values
;; from higher-numbered lanes.
(decl vec_unpacks_low_lane_order (Type Reg) Reg)
(rule (vec_unpacks_low_lane_order ty x)
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_unpacks_high ty x))
(rule (vec_unpacks_low_lane_order ty x)
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_unpacks_low ty x))
(decl vec_unpacks_high_lane_order (Type Reg) Reg)
(rule (vec_unpacks_high_lane_order ty x)
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_unpacks_low ty x))
(rule (vec_unpacks_high_lane_order ty x)
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_unpacks_high ty x))
(decl vec_unpacku_low_lane_order (Type Reg) Reg)
(rule (vec_unpacku_low_lane_order ty x)
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_unpacku_high ty x))
(rule (vec_unpacku_low_lane_order ty x)
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_unpacku_low ty x))
(decl vec_unpacku_high_lane_order (Type Reg) Reg)
(rule (vec_unpacku_high_lane_order ty x)
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_unpacku_low ty x))
(rule (vec_unpacku_high_lane_order ty x)
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_unpacku_high ty x))
;; Helpers for generating vector merge instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -3546,6 +3808,30 @@
(decl vec_merge_high (Type Reg Reg) Reg)
(rule (vec_merge_high ty x y) (vec_rrr ty (vecop_merge_high ty) x y))
;; Versions of merge using current lane order semantics.
;; merge_low will consume values from the lower-numbered
;; lanes of the inputs, and merge_high will consume values
;; from higher-numbered lanes. In both cases, values from
;; the first input will end up in even-numbered lanes, and
;; values from the second input will end up in odd-numbered
;; lanes of the output.
(decl vec_merge_low_lane_order (Type Reg Reg) Reg)
(rule (vec_merge_low_lane_order ty x y)
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_merge_high ty x y))
(rule (vec_merge_low_lane_order ty x y)
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_merge_low ty y x))
(decl vec_merge_high_lane_order (Type Reg Reg) Reg)
(rule (vec_merge_high_lane_order ty x y)
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_merge_low ty x y))
(rule (vec_merge_high_lane_order ty x y)
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_merge_high ty y x))
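;; For example, for an $I32X4 with x = <x0, x1, x2, x3> and
;; y = <y0, y1, y2, y3> (lane-order numbering), merge_low_lane_order
;; yields <x0, y0, x1, y1> and merge_high_lane_order yields
;; <x2, y2, x3, y3>.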
;; Helpers for generating `clz` and `ctz` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;


@@ -2839,24 +2839,50 @@ impl MachInstEmit for Inst {
inst.emit(&[], sink, emit_info, state);
}
&Inst::VecLoad { rd, ref mem } | &Inst::VecLoadRev { rd, ref mem } => {
&Inst::VecLoad { rd, ref mem }
| &Inst::VecLoadRev { rd, ref mem }
| &Inst::VecLoadByte16Rev { rd, ref mem }
| &Inst::VecLoadByte32Rev { rd, ref mem }
| &Inst::VecLoadByte64Rev { rd, ref mem }
| &Inst::VecLoadElt16Rev { rd, ref mem }
| &Inst::VecLoadElt32Rev { rd, ref mem }
| &Inst::VecLoadElt64Rev { rd, ref mem } => {
let rd = allocs.next_writable(rd);
let mem = mem.with_allocs(&mut allocs);
let (opcode, m3) = match self {
&Inst::VecLoad { .. } => (0xe706, 0), // VL
&Inst::VecLoadRev { .. } => (0xe606, 4), // VLBRQ
&Inst::VecLoad { .. } => (0xe706, 0), // VL
&Inst::VecLoadRev { .. } => (0xe606, 4), // VLBRQ
&Inst::VecLoadByte16Rev { .. } => (0xe606, 1), // VLBRH
&Inst::VecLoadByte32Rev { .. } => (0xe606, 2), // VLBRF
&Inst::VecLoadByte64Rev { .. } => (0xe606, 3), // VLBRG
&Inst::VecLoadElt16Rev { .. } => (0xe607, 1), // VLERH
&Inst::VecLoadElt32Rev { .. } => (0xe607, 2), // VLERF
&Inst::VecLoadElt64Rev { .. } => (0xe607, 3), // VLERG
_ => unreachable!(),
};
mem_vrx_emit(rd.to_reg(), &mem, opcode, m3, true, sink, emit_info, state);
}
&Inst::VecStore { rd, ref mem } | &Inst::VecStoreRev { rd, ref mem } => {
&Inst::VecStore { rd, ref mem }
| &Inst::VecStoreRev { rd, ref mem }
| &Inst::VecStoreByte16Rev { rd, ref mem }
| &Inst::VecStoreByte32Rev { rd, ref mem }
| &Inst::VecStoreByte64Rev { rd, ref mem }
| &Inst::VecStoreElt16Rev { rd, ref mem }
| &Inst::VecStoreElt32Rev { rd, ref mem }
| &Inst::VecStoreElt64Rev { rd, ref mem } => {
let rd = allocs.next(rd);
let mem = mem.with_allocs(&mut allocs);
let (opcode, m3) = match self {
&Inst::VecStore { .. } => (0xe70e, 0), // VST
&Inst::VecStoreRev { .. } => (0xe60e, 4), // VSTBRQ
&Inst::VecStore { .. } => (0xe70e, 0), // VST
&Inst::VecStoreRev { .. } => (0xe60e, 4), // VSTBRQ
&Inst::VecStoreByte16Rev { .. } => (0xe60e, 1), // VSTBRH
&Inst::VecStoreByte32Rev { .. } => (0xe60e, 2), // VSTBRF
&Inst::VecStoreByte64Rev { .. } => (0xe60e, 3), // VSTBRG
&Inst::VecStoreElt16Rev { .. } => (0xe60f, 1), // VSTERH
&Inst::VecStoreElt32Rev { .. } => (0xe60f, 2), // VSTERF
&Inst::VecStoreElt64Rev { .. } => (0xe60f, 3), // VSTERG
_ => unreachable!(),
};
mem_vrx_emit(rd, &mem, opcode, m3, true, sink, emit_info, state);


@@ -10091,6 +10091,240 @@ fn test_s390x_binemit() {
"E61230004806",
"vlbrq %v17, 0(%r2,%r3)",
));
insns.push((
Inst::VecLoadByte16Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E61020001806",
"vlbrh %v17, 0(%r2)",
));
insns.push((
Inst::VecLoadByte16Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::maybe_from_u64(4095).unwrap(),
flags: MemFlags::trusted(),
},
},
"E6102FFF1806",
"vlbrh %v17, 4095(%r2)",
));
insns.push((
Inst::VecLoadByte16Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(3),
index: gpr(2),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E61230001806",
"vlbrh %v17, 0(%r2,%r3)",
));
insns.push((
Inst::VecLoadByte32Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E61020002806",
"vlbrf %v17, 0(%r2)",
));
insns.push((
Inst::VecLoadByte32Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::maybe_from_u64(4095).unwrap(),
flags: MemFlags::trusted(),
},
},
"E6102FFF2806",
"vlbrf %v17, 4095(%r2)",
));
insns.push((
Inst::VecLoadByte32Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(3),
index: gpr(2),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E61230002806",
"vlbrf %v17, 0(%r2,%r3)",
));
insns.push((
Inst::VecLoadByte64Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E61020003806",
"vlbrg %v17, 0(%r2)",
));
insns.push((
Inst::VecLoadByte64Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::maybe_from_u64(4095).unwrap(),
flags: MemFlags::trusted(),
},
},
"E6102FFF3806",
"vlbrg %v17, 4095(%r2)",
));
insns.push((
Inst::VecLoadByte64Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(3),
index: gpr(2),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E61230003806",
"vlbrg %v17, 0(%r2,%r3)",
));
insns.push((
Inst::VecLoadElt16Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E61020001807",
"vlerh %v17, 0(%r2)",
));
insns.push((
Inst::VecLoadElt16Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::maybe_from_u64(4095).unwrap(),
flags: MemFlags::trusted(),
},
},
"E6102FFF1807",
"vlerh %v17, 4095(%r2)",
));
insns.push((
Inst::VecLoadElt16Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(3),
index: gpr(2),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E61230001807",
"vlerh %v17, 0(%r2,%r3)",
));
insns.push((
Inst::VecLoadElt32Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E61020002807",
"vlerf %v17, 0(%r2)",
));
insns.push((
Inst::VecLoadElt32Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::maybe_from_u64(4095).unwrap(),
flags: MemFlags::trusted(),
},
},
"E6102FFF2807",
"vlerf %v17, 4095(%r2)",
));
insns.push((
Inst::VecLoadElt32Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(3),
index: gpr(2),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E61230002807",
"vlerf %v17, 0(%r2,%r3)",
));
insns.push((
Inst::VecLoadElt64Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E61020003807",
"vlerg %v17, 0(%r2)",
));
insns.push((
Inst::VecLoadElt64Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::maybe_from_u64(4095).unwrap(),
flags: MemFlags::trusted(),
},
},
"E6102FFF3807",
"vlerg %v17, 4095(%r2)",
));
insns.push((
Inst::VecLoadElt64Rev {
rd: writable_vr(17),
mem: MemArg::BXD12 {
base: gpr(3),
index: gpr(2),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E61230003807",
"vlerg %v17, 0(%r2,%r3)",
));
insns.push((
Inst::VecStore {
rd: vr(17),
@@ -10169,6 +10403,240 @@ fn test_s390x_binemit() {
"E6123000480E",
"vstbrq %v17, 0(%r2,%r3)",
));
insns.push((
Inst::VecStoreByte16Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E6102000180E",
"vstbrh %v17, 0(%r2)",
));
insns.push((
Inst::VecStoreByte16Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::maybe_from_u64(4095).unwrap(),
flags: MemFlags::trusted(),
},
},
"E6102FFF180E",
"vstbrh %v17, 4095(%r2)",
));
insns.push((
Inst::VecStoreByte16Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(3),
index: gpr(2),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E6123000180E",
"vstbrh %v17, 0(%r2,%r3)",
));
insns.push((
Inst::VecStoreByte32Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E6102000280E",
"vstbrf %v17, 0(%r2)",
));
insns.push((
Inst::VecStoreByte32Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::maybe_from_u64(4095).unwrap(),
flags: MemFlags::trusted(),
},
},
"E6102FFF280E",
"vstbrf %v17, 4095(%r2)",
));
insns.push((
Inst::VecStoreByte32Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(3),
index: gpr(2),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E6123000280E",
"vstbrf %v17, 0(%r2,%r3)",
));
insns.push((
Inst::VecStoreByte64Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E6102000380E",
"vstbrg %v17, 0(%r2)",
));
insns.push((
Inst::VecStoreByte64Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::maybe_from_u64(4095).unwrap(),
flags: MemFlags::trusted(),
},
},
"E6102FFF380E",
"vstbrg %v17, 4095(%r2)",
));
insns.push((
Inst::VecStoreByte64Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(3),
index: gpr(2),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E6123000380E",
"vstbrg %v17, 0(%r2,%r3)",
));
insns.push((
Inst::VecStoreElt16Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E6102000180F",
"vsterh %v17, 0(%r2)",
));
insns.push((
Inst::VecStoreElt16Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::maybe_from_u64(4095).unwrap(),
flags: MemFlags::trusted(),
},
},
"E6102FFF180F",
"vsterh %v17, 4095(%r2)",
));
insns.push((
Inst::VecStoreElt16Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(3),
index: gpr(2),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E6123000180F",
"vsterh %v17, 0(%r2,%r3)",
));
insns.push((
Inst::VecStoreElt32Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E6102000280F",
"vsterf %v17, 0(%r2)",
));
insns.push((
Inst::VecStoreElt32Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::maybe_from_u64(4095).unwrap(),
flags: MemFlags::trusted(),
},
},
"E6102FFF280F",
"vsterf %v17, 4095(%r2)",
));
insns.push((
Inst::VecStoreElt32Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(3),
index: gpr(2),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E6123000280F",
"vsterf %v17, 0(%r2,%r3)",
));
insns.push((
Inst::VecStoreElt64Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E6102000380F",
"vsterg %v17, 0(%r2)",
));
insns.push((
Inst::VecStoreElt64Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(2),
index: zero_reg(),
disp: UImm12::maybe_from_u64(4095).unwrap(),
flags: MemFlags::trusted(),
},
},
"E6102FFF380F",
"vsterg %v17, 4095(%r2)",
));
insns.push((
Inst::VecStoreElt64Rev {
rd: vr(17),
mem: MemArg::BXD12 {
base: gpr(3),
index: gpr(2),
disp: UImm12::zero(),
flags: MemFlags::trusted(),
},
},
"E6123000380F",
"vsterg %v17, 0(%r2,%r3)",
));
insns.push((
Inst::VecLoadReplicate {
size: 8,


@@ -28,8 +28,9 @@ mod emit_tests;
// Instructions (top level): definition
pub use crate::isa::s390x::lower::isle::generated_code::{
ALUOp, CmpOp, FPUOp1, FPUOp2, FPUOp3, FpuRoundMode, FpuRoundOp, MInst as Inst, RxSBGOp,
ShiftOp, SymbolReloc, UnaryOp, VecBinaryOp, VecFloatCmpOp, VecIntCmpOp, VecShiftOp, VecUnaryOp,
ALUOp, CmpOp, FPUOp1, FPUOp2, FPUOp3, FpuRoundMode, FpuRoundOp, LaneOrder, MInst as Inst,
RxSBGOp, ShiftOp, SymbolReloc, UnaryOp, VecBinaryOp, VecFloatCmpOp, VecIntCmpOp, VecShiftOp,
VecUnaryOp,
};
/// Additional information for (direct) Call instructions, left out of line to lower the size of
@@ -245,7 +246,19 @@ impl Inst {
// These are all part of VXRS_EXT2
Inst::VecLoadRev { .. }
| Inst::VecLoadByte16Rev { .. }
| Inst::VecLoadByte32Rev { .. }
| Inst::VecLoadByte64Rev { .. }
| Inst::VecLoadElt16Rev { .. }
| Inst::VecLoadElt32Rev { .. }
| Inst::VecLoadElt64Rev { .. }
| Inst::VecStoreRev { .. }
| Inst::VecStoreByte16Rev { .. }
| Inst::VecStoreByte32Rev { .. }
| Inst::VecStoreByte64Rev { .. }
| Inst::VecStoreElt16Rev { .. }
| Inst::VecStoreElt32Rev { .. }
| Inst::VecStoreElt64Rev { .. }
| Inst::VecLoadReplicateRev { .. }
| Inst::VecLoadLaneRev { .. }
| Inst::VecLoadLaneRevUndef { .. }
@@ -762,6 +775,30 @@ fn s390x_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandC
collector.reg_def(rd);
memarg_operands(mem, collector);
}
&Inst::VecLoadByte16Rev { rd, ref mem, .. } => {
collector.reg_def(rd);
memarg_operands(mem, collector);
}
&Inst::VecLoadByte32Rev { rd, ref mem, .. } => {
collector.reg_def(rd);
memarg_operands(mem, collector);
}
&Inst::VecLoadByte64Rev { rd, ref mem, .. } => {
collector.reg_def(rd);
memarg_operands(mem, collector);
}
&Inst::VecLoadElt16Rev { rd, ref mem, .. } => {
collector.reg_def(rd);
memarg_operands(mem, collector);
}
&Inst::VecLoadElt32Rev { rd, ref mem, .. } => {
collector.reg_def(rd);
memarg_operands(mem, collector);
}
&Inst::VecLoadElt64Rev { rd, ref mem, .. } => {
collector.reg_def(rd);
memarg_operands(mem, collector);
}
&Inst::VecStore { rd, ref mem, .. } => {
collector.reg_use(rd);
memarg_operands(mem, collector);
@@ -770,6 +807,30 @@ fn s390x_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandC
collector.reg_use(rd);
memarg_operands(mem, collector);
}
&Inst::VecStoreByte16Rev { rd, ref mem, .. } => {
collector.reg_use(rd);
memarg_operands(mem, collector);
}
&Inst::VecStoreByte32Rev { rd, ref mem, .. } => {
collector.reg_use(rd);
memarg_operands(mem, collector);
}
&Inst::VecStoreByte64Rev { rd, ref mem, .. } => {
collector.reg_use(rd);
memarg_operands(mem, collector);
}
&Inst::VecStoreElt16Rev { rd, ref mem, .. } => {
collector.reg_use(rd);
memarg_operands(mem, collector);
}
&Inst::VecStoreElt32Rev { rd, ref mem, .. } => {
collector.reg_use(rd);
memarg_operands(mem, collector);
}
&Inst::VecStoreElt64Rev { rd, ref mem, .. } => {
collector.reg_use(rd);
memarg_operands(mem, collector);
}
&Inst::VecLoadReplicate { rd, ref mem, .. } => {
collector.reg_def(rd);
memarg_operands(mem, collector);
@@ -2476,10 +2537,23 @@ impl Inst {
op, rm, rn, tmp, rn, rm
)
}
&Inst::VecLoad { rd, ref mem } | &Inst::VecLoadRev { rd, ref mem } => {
&Inst::VecLoad { rd, ref mem }
| &Inst::VecLoadRev { rd, ref mem }
| &Inst::VecLoadByte16Rev { rd, ref mem }
| &Inst::VecLoadByte32Rev { rd, ref mem }
| &Inst::VecLoadByte64Rev { rd, ref mem }
| &Inst::VecLoadElt16Rev { rd, ref mem }
| &Inst::VecLoadElt32Rev { rd, ref mem }
| &Inst::VecLoadElt64Rev { rd, ref mem } => {
let opcode = match self {
&Inst::VecLoad { .. } => "vl",
&Inst::VecLoadRev { .. } => "vlbrq",
&Inst::VecLoadByte16Rev { .. } => "vlbrh",
&Inst::VecLoadByte32Rev { .. } => "vlbrf",
&Inst::VecLoadByte64Rev { .. } => "vlbrg",
&Inst::VecLoadElt16Rev { .. } => "vlerh",
&Inst::VecLoadElt32Rev { .. } => "vlerf",
&Inst::VecLoadElt64Rev { .. } => "vlerg",
_ => unreachable!(),
};
@@ -2489,10 +2563,23 @@ impl Inst {
let mem = mem.pretty_print_default();
format!("{}{} {}, {}", mem_str, opcode, rd, mem)
}
&Inst::VecStore { rd, ref mem } | &Inst::VecStoreRev { rd, ref mem } => {
&Inst::VecStore { rd, ref mem }
| &Inst::VecStoreRev { rd, ref mem }
| &Inst::VecStoreByte16Rev { rd, ref mem }
| &Inst::VecStoreByte32Rev { rd, ref mem }
| &Inst::VecStoreByte64Rev { rd, ref mem }
| &Inst::VecStoreElt16Rev { rd, ref mem }
| &Inst::VecStoreElt32Rev { rd, ref mem }
| &Inst::VecStoreElt64Rev { rd, ref mem } => {
let opcode = match self {
&Inst::VecStore { .. } => "vst",
&Inst::VecStoreRev { .. } => "vstbrq",
&Inst::VecStoreByte16Rev { .. } => "vstbrh",
&Inst::VecStoreByte32Rev { .. } => "vstbrf",
&Inst::VecStoreByte64Rev { .. } => "vstbrg",
&Inst::VecStoreElt16Rev { .. } => "vsterh",
&Inst::VecStoreElt32Rev { .. } => "vsterf",
&Inst::VecStoreElt64Rev { .. } => "vsterg",
_ => unreachable!(),
};


@@ -39,7 +39,7 @@
;;;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty (vconst (u128_from_constant x))))
(vec_imm ty x))
(vec_imm ty (be_vec_const ty x)))
;;;; Rules for `null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -148,9 +148,9 @@
;; Lane-wise integer pairwise addition for 8-/16-/32-bit vector registers.
(rule (lower (has_type ty @ (multi_lane bits _) (iadd_pairwise x y)))
(let ((size Reg (vec_imm_splat $I8X16 (u32_as_u64 bits))))
(vec_pack (vec_widen_type ty)
(vec_add ty y (vec_lshr_by_byte y size))
(vec_add ty x (vec_lshr_by_byte x size)))))
(vec_pack_lane_order (vec_widen_type ty)
(vec_add ty x (vec_lshr_by_byte x size))
(vec_add ty y (vec_lshr_by_byte y size)))))
;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -929,9 +929,14 @@
(put_in_reg_zext64 x))
;; 128-bit target types.
(rule (lower (has_type (vr128_ty _ty) (uextend x @ (value_type src_ty))))
(let ((ty Type (ty_vec128_from_lane_ty src_ty)))
(vec_insert_lane ty (vec_imm ty 0) x (be_lane_idx ty 0) (zero_reg))))
(rule (lower (has_type (vr128_ty ty) (uextend x @ (value_type $I8))))
(vec_insert_lane $I8X16 (vec_imm ty 0) x 15 (zero_reg)))
(rule (lower (has_type (vr128_ty ty) (uextend x @ (value_type $I16))))
(vec_insert_lane $I16X8 (vec_imm ty 0) x 7 (zero_reg)))
(rule (lower (has_type (vr128_ty ty) (uextend x @ (value_type $I32))))
(vec_insert_lane $I32X4 (vec_imm ty 0) x 3 (zero_reg)))
(rule (lower (has_type (vr128_ty ty) (uextend x @ (value_type $I64))))
(vec_insert_lane $I64X2 (vec_imm ty 0) x 1 (zero_reg)))
;;;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -953,44 +958,44 @@
;;;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (snarrow x @ (value_type (ty_vec128 ty)) y))
(vec_pack_ssat ty y x))
(vec_pack_ssat_lane_order ty x y))
;;;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (uunarrow x @ (value_type (ty_vec128 ty)) y))
(vec_pack_usat ty y x))
(vec_pack_usat_lane_order ty x y))
;;;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (unarrow x @ (value_type (ty_vec128 ty)) y))
(let ((zero Reg (vec_imm ty 0)))
(vec_pack_usat ty (vec_smax ty y zero) (vec_smax ty x zero))))
(vec_pack_usat_lane_order ty (vec_smax ty x zero) (vec_smax ty y zero))))
;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (swiden_low x @ (value_type (ty_vec128 ty))))
(vec_unpacks_low ty x))
(vec_unpacks_low_lane_order ty x))
;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (swiden_high x @ (value_type (ty_vec128 ty))))
(vec_unpacks_high ty x))
(vec_unpacks_high_lane_order ty x))
;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (uwiden_low x @ (value_type (ty_vec128 ty))))
(vec_unpacku_low ty x))
(vec_unpacku_low_lane_order ty x))
;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (uwiden_high x @ (value_type (ty_vec128 ty))))
(vec_unpacku_high ty x))
(vec_unpacku_high_lane_order ty x))
;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1583,7 +1588,7 @@
;; Promote a register.
(rule (lower (has_type $F64X2 (fvpromote_low x @ (value_type $F32X4))))
(fpromote_reg $F64X2 $F32X4 (vec_merge_low $I32X4 x x)))
(fpromote_reg $F64X2 $F32X4 (vec_merge_low_lane_order $I32X4 x x)))
;;;; Rules for `fdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1598,9 +1603,8 @@
;; Demote a register.
(rule (lower (has_type $F32X4 (fvdemote x @ (value_type $F64X2))))
(let ((dst Reg (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.Current) x)))
(vec_permute $F32X4 dst (vec_imm $F32X4 0)
(vec_imm $I8X16 (imm8x16 16 16 16 16 16 16 16 16
0 1 2 3 8 9 10 11)))))
(vec_pack_lane_order $I64X2 (vec_lshr_imm $I64X2 dst 32)
(vec_imm $I64X2 0))))
;;;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1690,7 +1694,7 @@
;; Convert the low half of a $I32X4 to a $F64X2.
(rule (lower (has_type $F64X2 (fcvt_low_from_sint x @ (value_type $I32X4))))
(fcvt_from_sint_reg $F64X2 (FpuRoundMode.ToNearestTiesToEven)
(vec_unpacks_low $I32X4 x)))
(vec_unpacks_low_lane_order $I32X4 x)))
;;;; Rules for `fcvt_to_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1845,7 +1849,12 @@
;;;; Rules for `raw_bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Raw bitcast is always a no-op.
;; FIXME: There are two flavors of raw_bitcast, which are currently not
;; distinguished in CLIF IR. Those generated by Wasmtime assume little-endian
;; lane order, and those generated elsewhere assume big-endian lane order.
;; Raw bitcast is a no-op if the current lane order matches the assumed lane order.
;; However, due to our choice of lane order depending on the current function
;; ABI, every bitcast we currently see here is indeed a no-op.
(rule (lower (raw_bitcast x)) x)
@@ -2352,9 +2361,20 @@
;;;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; We need to modify the lane mask at runtime in two ways:
;; - convert from little-endian to big-endian lane numbering
;; - handle mask elements outside the range 0..15 by zeroing the lane
;; When using big-endian lane order, the lane mask is mostly correct, but we
;; need to handle mask elements outside the range 0..15 by zeroing the lane.
;;
;; To do so efficiently, we compute:
;; permute-lane-element := umin (16, swizzle-lane-element)
;; and pass a zero vector as second operand to the permute instruction.
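;; For example, a mask element of 5 stays 5 and selects byte 5 of x,
;; while a mask element of 200 is clamped to 16, which selects the first
;; byte of the zero vector and thus zeroes the lane.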
(rule (lower (has_type (ty_vec128 ty) (swizzle x y)))
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_permute ty x (vec_imm ty 0)
(vec_umin $I8X16 (vec_imm_splat $I8X16 16) y)))
;; When using little-endian lane order, in addition to zeroing (as above),
;; we need to convert from little-endian to big-endian lane numbering.
;;
;; To do so efficiently, we compute:
;; permute-lane-element := umax (239, ~ swizzle-lane-element)
@@ -2368,6 +2388,7 @@
;; to implement the required swizzle semantics.
(rule (lower (has_type (ty_vec128 ty) (swizzle x y)))
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_permute ty (vec_imm ty 0) x
(vec_umax $I8X16 (vec_imm_splat $I8X16 239)
(vec_not $I8X16 y))))
@@ -2485,18 +2506,36 @@
(rule (lower (has_type $F64 (load flags @ (littleendian) addr offset)))
(vec_load_lane_little_undef $F64X2 (lower_address flags addr offset) 0))
;; Load 128-bit big-endian vector values.
;; Load 128-bit big-endian vector values, BE lane order - direct load.
(rule (lower (has_type (vr128_ty ty) (load flags @ (bigendian) addr offset)))
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_load ty (lower_address flags addr offset)))
;; Load 128-bit little-endian vector values (z15 instruction).
(rule (lower (has_type (and (vxrs_ext2_enabled) (vr128_ty ty))
(load flags @ (littleendian) addr offset)))
;; Load 128-bit little-endian vector values, BE lane order - byte-reversed load.
(rule (lower (has_type (vr128_ty ty) (load flags @ (littleendian) addr offset)))
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_load_byte_rev ty flags addr offset))
;; Load 128-bit big-endian vector values, LE lane order - element-reversed load.
(rule (lower (has_type (vr128_ty ty) (load flags @ (bigendian) addr offset)))
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_load_elt_rev ty flags addr offset))
;; Load 128-bit little-endian vector values, LE lane order - fully-reversed load.
(rule (lower (has_type (vr128_ty ty) (load flags @ (littleendian) addr offset)))
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_load_full_rev ty flags addr offset))
;; Helper to perform a 128-bit full-vector byte-reversed load.
(decl vec_load_full_rev (Type MemFlags Value Offset32) Reg)
;; Full-vector byte-reversed load via single instruction on z15.
(rule (vec_load_full_rev (and (vxrs_ext2_enabled) (vr128_ty ty)) flags addr offset)
(vec_loadrev ty (lower_address flags addr offset)))
;; Load 128-bit little-endian vector values (via GPRs on z14).
(rule (lower (has_type (and (vxrs_ext2_disabled) (vr128_ty ty))
(load flags @ (littleendian) addr offset)))
;; Full-vector byte-reversed load via GPRs on z14.
(rule (vec_load_full_rev (and (vxrs_ext2_disabled) (vr128_ty ty)) flags addr offset)
(let ((lo_addr MemArg (lower_address_bias flags addr offset 0))
(hi_addr MemArg (lower_address_bias flags addr offset 8))
(lo_val Reg (loadrev64 lo_addr))
@@ -2504,6 +2543,75 @@
(mov_to_vec128 ty hi_val lo_val)))
;; Helper to perform an element-wise byte-reversed load.
(decl vec_load_byte_rev (Type MemFlags Value Offset32) Reg)
;; Element-wise byte-reversed 1x128-bit load is a full byte-reversed load.
(rule (vec_load_byte_rev $I128 flags addr offset)
(vec_load_full_rev $I128 flags addr offset))
;; Element-wise byte-reversed 16x8-bit load is a direct load.
(rule (vec_load_byte_rev ty @ (multi_lane 8 16) flags addr offset)
(vec_load ty (lower_address flags addr offset)))
;; Element-wise byte-reversed load via single instruction on z15.
(rule (vec_load_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 64 2))
flags addr offset)
(vec_load_byte64rev ty (lower_address flags addr offset)))
(rule (vec_load_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 32 4))
flags addr offset)
(vec_load_byte32rev ty (lower_address flags addr offset)))
(rule (vec_load_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 16 8))
flags addr offset)
(vec_load_byte16rev ty (lower_address flags addr offset)))
;; Element-wise byte-reversed load as element-swapped byte-reversed load on z14.
(rule (vec_load_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 64 2))
flags addr offset)
(vec_elt_rev ty (vec_load_full_rev ty flags addr offset)))
(rule (vec_load_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 32 4))
flags addr offset)
(vec_elt_rev ty (vec_load_full_rev ty flags addr offset)))
(rule (vec_load_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 16 8))
flags addr offset)
(vec_elt_rev ty (vec_load_full_rev ty flags addr offset)))
;; Helper to perform an element-reversed load.
(decl vec_load_elt_rev (Type MemFlags Value Offset32) Reg)
;; Element-reversed 1x128-bit load is a direct load.
(rule (vec_load_elt_rev $I128 flags addr offset)
(vec_load $I128 (lower_address flags addr offset)))
;; Element-reversed 16x8-bit load is a full byte-reversed load.
(rule (vec_load_elt_rev ty @ (multi_lane 8 16) flags addr offset)
(vec_load_full_rev ty flags addr offset))
;; Element-reversed load via single instruction on z15.
(rule (vec_load_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 64 2))
flags addr offset)
(vec_load_elt64rev ty (lower_address flags addr offset)))
(rule (vec_load_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 32 4))
flags addr offset)
(vec_load_elt32rev ty (lower_address flags addr offset)))
(rule (vec_load_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 16 8))
flags addr offset)
(vec_load_elt16rev ty (lower_address flags addr offset)))
;; Element-reversed load as element-swapped direct load on z14.
(rule (vec_load_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 64 2))
flags addr offset)
(vec_elt_rev ty (vec_load ty (lower_address flags addr offset))))
(rule (vec_load_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 32 4))
flags addr offset)
(vec_elt_rev ty (vec_load ty (lower_address flags addr offset))))
(rule (vec_load_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 16 8))
flags addr offset)
(vec_elt_rev ty (vec_load ty (lower_address flags addr offset))))
;;;; Rules for `uload8` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 16- or 32-bit target types.
@@ -2606,65 +2714,77 @@
;;;; Rules for `uloadNxM` and `sloadNxM` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Unsigned 8->16 bit extension, big-endian source value.
(rule (lower (has_type $I16X8 (uload8x8 flags @ (bigendian) addr offset)))
(vec_unpacku_high $I8X16
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))
;; Unsigned 8->16 bit extension.
(rule (lower (has_type $I16X8 (uload8x8 flags addr offset)))
(vec_unpacku_high $I8X16 (load_v64 $I8X16 flags addr offset)))
;; Unsigned 8->16 bit extension, little-endian source value.
(rule (lower (has_type $I16X8 (uload8x8 flags @ (littleendian) addr offset)))
(vec_unpacku_high $I8X16
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))
;; Signed 8->16 bit extension.
(rule (lower (has_type $I16X8 (sload8x8 flags addr offset)))
(vec_unpacks_high $I8X16 (load_v64 $I8X16 flags addr offset)))
;; Signed 8->16 bit extension, big-endian source value.
(rule (lower (has_type $I16X8 (sload8x8 flags @ (bigendian) addr offset)))
(vec_unpacks_high $I8X16
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))
;; Unsigned 16->32 bit extension.
(rule (lower (has_type $I32X4 (uload16x4 flags addr offset)))
(vec_unpacku_high $I16X8 (load_v64 $I16X8 flags addr offset)))
;; Signed 8->16 bit extension, little-endian source value.
(rule (lower (has_type $I16X8 (sload8x8 flags @ (littleendian) addr offset)))
(vec_unpacks_high $I8X16
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))
;; Signed 16->32 bit extension.
(rule (lower (has_type $I32X4 (sload16x4 flags addr offset)))
(vec_unpacks_high $I16X8 (load_v64 $I16X8 flags addr offset)))
;; Unsigned 16->32 bit extension, big-endian source value.
(rule (lower (has_type $I32X4 (uload16x4 flags @ (bigendian) addr offset)))
(vec_unpacku_high $I16X8
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))
;; Unsigned 32->64 bit extension.
(rule (lower (has_type $I64X2 (uload32x2 flags addr offset)))
(vec_unpacku_high $I32X4 (load_v64 $I32X4 flags addr offset)))
;; Unsigned 16->32 bit extension, little-endian source value.
(rule (lower (has_type $I32X4 (uload16x4 flags @ (littleendian) addr offset)))
(vec_unpacku_high $I16X8
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))
;; Signed 32->64 bit extension.
(rule (lower (has_type $I64X2 (sload32x2 flags addr offset)))
(vec_unpacks_high $I32X4 (load_v64 $I32X4 flags addr offset)))
;; Signed 16->32 bit extension, big-endian source value.
(rule (lower (has_type $I32X4 (sload16x4 flags @ (bigendian) addr offset)))
(vec_unpacks_high $I16X8
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))
;; Signed 16->32 bit extension, little-endian source value.
(rule (lower (has_type $I32X4 (sload16x4 flags @ (littleendian) addr offset)))
(vec_unpacks_high $I16X8
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))
;; Helper to load a 64-bit half-size vector from memory.
(decl load_v64 (Type MemFlags Value Offset32) Reg)
;; Unsigned 32->64 bit extension, big-endian source value.
(rule (lower (has_type $I64X2 (uload32x2 flags @ (bigendian) addr offset)))
(vec_unpacku_high $I32X4
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))
;; Any big-endian source value, BE lane order.
(rule (load_v64 _ flags @ (bigendian) addr offset)
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))
;; Unsigned 32->64 bit extension, little-endian source value.
(rule (lower (has_type $I64X2 (uload32x2 flags @ (littleendian) addr offset)))
(vec_unpacku_high $I32X4
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))
;; Any little-endian source value, LE lane order.
(rule (load_v64 _ flags @ (littleendian) addr offset)
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))
;; Signed 32->64 bit extension, big-endian source value.
(rule (lower (has_type $I64X2 (sload32x2 flags @ (bigendian) addr offset)))
(vec_unpacks_high $I32X4
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))
;; Big-endian or little-endian 8x8-bit source value, BE lane order.
(rule (load_v64 (multi_lane 8 16) flags addr offset)
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))
;; Signed 32->64 bit extension, little-endian source value.
(rule (lower (has_type $I64X2 (sload32x2 flags @ (littleendian) addr offset)))
(vec_unpacks_high $I32X4
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))
;; Big-endian or little-endian 8x8-bit source value, LE lane order.
(rule (load_v64 (multi_lane 8 16) flags addr offset)
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))
;; Little-endian 4x16-bit source value, BE lane order.
(rule (load_v64 (multi_lane 16 8) flags @ (littleendian) addr offset)
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_rot_imm $I16X8
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0) 8))
;; Big-endian 4x16-bit source value, LE lane order.
(rule (load_v64 (multi_lane 16 8) flags @ (bigendian) addr offset)
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_rot_imm $I16X8
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0) 8))
;; Little-endian 2x32-bit source value, BE lane order.
(rule (load_v64 (multi_lane 32 4) flags @ (littleendian) addr offset)
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_rot_imm $I64X2
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0) 32))
;; Big-endian 2x32-bit source value, LE lane order.
(rule (load_v64 (multi_lane 32 4) flags @ (bigendian) addr offset)
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_rot_imm $I64X2
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0) 32))
;;;; Rules for `store` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2716,25 +2836,114 @@
(side_effect (vec_store_lane_little $F64X2 val
(lower_address flags addr offset) 0)))
;; Store 128-bit big-endian vector type.
;; Store 128-bit big-endian vector type, BE lane order - direct store.
(rule (lower (store flags @ (bigendian)
val @ (value_type (vr128_ty ty)) addr offset))
(if-let (LaneOrder.BigEndian) (lane_order))
(side_effect (vec_store val (lower_address flags addr offset))))
;; Store 128-bit little-endian vector type (z15 instruction).
;; Store 128-bit little-endian vector type, BE lane order - byte-reversed store.
(rule (lower (store flags @ (littleendian)
val @ (value_type (and (vr128_ty ty) (vxrs_ext2_enabled))) addr offset))
(side_effect (vec_storerev val (lower_address flags addr offset))))
val @ (value_type (vr128_ty ty)) addr offset))
(if-let (LaneOrder.BigEndian) (lane_order))
(side_effect (vec_store_byte_rev ty val flags addr offset)))
;; Store 128-bit little-endian vector type (via GPRs on z14).
;; Store 128-bit big-endian vector type, LE lane order - element-reversed store.
(rule (lower (store flags @ (bigendian)
val @ (value_type (vr128_ty ty)) addr offset))
(if-let (LaneOrder.LittleEndian) (lane_order))
(side_effect (vec_store_elt_rev ty val flags addr offset)))
;; Store 128-bit little-endian vector type, LE lane order - fully-reversed store.
(rule (lower (store flags @ (littleendian)
val @ (value_type (and (vr128_ty ty) (vxrs_ext2_disabled))) addr offset))
val @ (value_type (vr128_ty ty)) addr offset))
(if-let (LaneOrder.LittleEndian) (lane_order))
(side_effect (vec_store_full_rev ty val flags addr offset)))
;; Helper to perform a 128-bit full-vector byte-reversed store.
(decl vec_store_full_rev (Type Reg MemFlags Value Offset32) SideEffectNoResult)
;; Full-vector byte-reversed store via single instruction on z15.
(rule (vec_store_full_rev (vxrs_ext2_enabled) val flags addr offset)
(vec_storerev val (lower_address flags addr offset)))
;; Full-vector byte-reversed store via GPRs on z14.
(rule (vec_store_full_rev (vxrs_ext2_disabled) val flags addr offset)
(let ((lo_addr MemArg (lower_address_bias flags addr offset 0))
(hi_addr MemArg (lower_address_bias flags addr offset 8))
(lo_val Reg (vec_extract_lane $I64X2 val 1 (zero_reg)))
(hi_val Reg (vec_extract_lane $I64X2 val 0 (zero_reg))))
(side_effect (side_effect_concat (storerev64 lo_val lo_addr)
(storerev64 hi_val hi_addr)))))
(side_effect_concat (storerev64 lo_val lo_addr)
(storerev64 hi_val hi_addr))))
;; Helper to perform an element-wise byte-reversed store.
(decl vec_store_byte_rev (Type Reg MemFlags Value Offset32) SideEffectNoResult)
;; Element-wise byte-reversed 1x128-bit store is a full byte-reversed store.
(rule (vec_store_byte_rev $I128 val flags addr offset)
(vec_store_full_rev $I128 val flags addr offset))
;; Element-wise byte-reversed 16x8-bit store is a direct store.
(rule (vec_store_byte_rev (multi_lane 8 16) val flags addr offset)
(vec_store val (lower_address flags addr offset)))
;; Element-wise byte-reversed store via single instruction on z15.
(rule (vec_store_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 64 2))
val flags addr offset)
(vec_store_byte64rev val (lower_address flags addr offset)))
(rule (vec_store_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 32 4))
val flags addr offset)
(vec_store_byte32rev val (lower_address flags addr offset)))
(rule (vec_store_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 16 8))
val flags addr offset)
(vec_store_byte16rev val (lower_address flags addr offset)))
;; Element-wise byte-reversed store as element-swapped byte-reversed store on z14.
(rule (vec_store_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 64 2))
val flags addr offset)
(vec_store_full_rev ty (vec_elt_rev ty val) flags addr offset))
(rule (vec_store_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 32 4))
val flags addr offset)
(vec_store_full_rev ty (vec_elt_rev ty val) flags addr offset))
(rule (vec_store_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 16 8))
val flags addr offset)
(vec_store_full_rev ty (vec_elt_rev ty val) flags addr offset))
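;; The z14 rules above compose two existing primitives: swap the elements in
;; the register (vec_elt_rev), then do a fully byte-reversed store.  Worked
;; example for 32x4 lanes A B C D: the element swap yields D C B A in the
;; register, and the fully byte-reversed store then writes rev(A) rev(B)
;; rev(C) rev(D) to memory - exactly a per-element byte-reversed store.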
;; Helper to perform an element-reversed store.
(decl vec_store_elt_rev (Type Reg MemFlags Value Offset32) SideEffectNoResult)
;; Element-reversed 1x128-bit store is a direct store.
(rule (vec_store_elt_rev $I128 val flags addr offset)
(vec_store val (lower_address flags addr offset)))
;; Element-reversed 16x8-bit store is a full byte-reversed store.
(rule (vec_store_elt_rev ty @ (multi_lane 8 16) val flags addr offset)
(vec_store_full_rev ty val flags addr offset))
;; Element-reversed store via single instruction on z15.
(rule (vec_store_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 64 2))
val flags addr offset)
(vec_store_elt64rev val (lower_address flags addr offset)))
(rule (vec_store_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 32 4))
val flags addr offset)
(vec_store_elt32rev val (lower_address flags addr offset)))
(rule (vec_store_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 16 8))
val flags addr offset)
(vec_store_elt16rev val (lower_address flags addr offset)))
;; Element-reversed store as element-swapped direct store on z14.
(rule (vec_store_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 64 2))
val flags addr offset)
(vec_store (vec_elt_rev ty val) (lower_address flags addr offset)))
(rule (vec_store_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 32 4))
val flags addr offset)
(vec_store (vec_elt_rev ty val) (lower_address flags addr offset)))
(rule (vec_store_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 16 8))
val flags addr offset)
(vec_store (vec_elt_rev ty val) (lower_address flags addr offset)))
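;; Likewise, on z14 an element-reversed store is just an element swap in the
;; register followed by a direct store.  For 16x8-bit lanes the dedicated rule
;; above uses the fully byte-reversed store instead, since reversing sixteen
;; one-byte elements is the same as reversing all sixteen bytes.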
;;;; Rules for 8-bit integer stores ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -3591,24 +3800,48 @@
;;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (vhigh_bits x @ (value_type (multi_lane 8 16))))
(if-let (LaneOrder.LittleEndian) (lane_order))
(let ((mask Reg (vec_imm $I8X16 (imm8x16 0 8 16 24 32 40 48 56
64 72 80 88 96 104 112 120))))
(vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))
(rule (lower (vhigh_bits x @ (value_type (multi_lane 8 16))))
(if-let (LaneOrder.BigEndian) (lane_order))
(let ((mask Reg (vec_imm $I8X16 (imm8x16 120 112 104 96 88 80 72 64
56 48 40 32 24 16 8 0))))
(vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))
(rule (lower (vhigh_bits x @ (value_type (multi_lane 16 8))))
(if-let (LaneOrder.LittleEndian) (lane_order))
(let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128
0 16 32 48 64 80 96 112))))
(vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))
(rule (lower (vhigh_bits x @ (value_type (multi_lane 16 8))))
(if-let (LaneOrder.BigEndian) (lane_order))
(let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128
112 96 80 64 48 32 16 0))))
(vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))
(rule (lower (vhigh_bits x @ (value_type (multi_lane 32 4))))
(if-let (LaneOrder.LittleEndian) (lane_order))
(let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128
128 128 128 128 0 32 64 96))))
(vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))
(rule (lower (vhigh_bits x @ (value_type (multi_lane 32 4))))
(if-let (LaneOrder.BigEndian) (lane_order))
(let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128
128 128 128 128 96 64 32 0))))
(vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))
(rule (lower (vhigh_bits x @ (value_type (multi_lane 64 2))))
(if-let (LaneOrder.LittleEndian) (lane_order))
(let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128
128 128 128 128 128 128 0 64))))
(vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))
(rule (lower (vhigh_bits x @ (value_type (multi_lane 64 2))))
(if-let (LaneOrder.BigEndian) (lane_order))
(let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128
128 128 128 128 128 128 64 0))))
(vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))
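;; A note on the masks above: vec_bitpermute (presumably VBPERM) treats each
;; mask byte as a big-endian bit index into x, so multiples of 8/16/32/64 pick
;; out the sign bit of each lane, while 128 selects a constant zero bit used
;; as padding for types with fewer than 16 lanes.  The little- and big-endian
;; variants list the same bit indices, merely in opposite lane order.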
;;;; Rules for `is_null` and `is_invalid` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -3870,7 +4103,8 @@
(decl lower_call_args_slots (ABISig Range ValueSlice) InstOutput)
(rule (lower_call_args_slots abi (range_empty) _) (output_none))
(rule (lower_call_args_slots abi (range_unwrap head tail) args)
(let ((_ Unit (copy_to_arg 0 (abi_get_arg abi head)
(let ((_ Unit (copy_to_arg (abi_lane_order abi)
0 (abi_get_arg abi head)
(value_slice_get args head))))
(lower_call_args_slots abi tail args)))
@@ -3886,7 +4120,9 @@
(decl lower_call_rets (ABISig Range InstOutputBuilder) InstOutput)
(rule (lower_call_rets abi (range_empty) builder) (output_builder_finish builder))
(rule (lower_call_rets abi (range_unwrap head tail) builder)
(let ((ret ValueRegs (copy_from_arg (abi_sized_stack_arg_space abi) (abi_get_ret abi head)))
(let ((ret ValueRegs (copy_from_arg (abi_lane_order abi)
(abi_sized_stack_arg_space abi)
(abi_get_ret abi head)))
(_ Unit (output_builder_push builder ret)))
(lower_call_rets abi tail builder)))

View File

@@ -6,8 +6,8 @@ pub mod generated_code;
// Types that the generated ISLE code uses via `use super::*`.
use crate::isa::s390x::abi::{S390xMachineDeps, REG_SAVE_AREA_SIZE};
use crate::isa::s390x::inst::{
gpr, stack_reg, writable_gpr, zero_reg, CallIndInfo, CallInfo, Cond, Inst as MInst, MemArg,
MemArgPair, SymbolReloc, UImm12, UImm16Shifted, UImm32Shifted,
gpr, stack_reg, writable_gpr, zero_reg, CallIndInfo, CallInfo, Cond, Inst as MInst, LaneOrder,
MemArg, MemArgPair, SymbolReloc, UImm12, UImm16Shifted, UImm32Shifted,
};
use crate::isa::s390x::settings::Flags as IsaFlags;
use crate::machinst::isle::*;
@@ -102,6 +102,10 @@ where
ABISig::from_func_sig::<S390xMachineDeps>(sig, self.flags).unwrap()
}
fn abi_lane_order(&mut self, abi: &ABISig) -> LaneOrder {
lane_order_for_call_conv(abi.call_conv())
}
fn abi_accumulate_outgoing_args_size(&mut self, abi: &ABISig) -> Unit {
let off = abi.sized_stack_arg_space() + abi.sized_stack_ret_space();
self.lower_ctx
@@ -405,9 +409,36 @@ where
UImm16Shifted::maybe_from_u64(n)
}
#[inline]
fn lane_order(&mut self) -> Option<LaneOrder> {
Some(lane_order_for_call_conv(self.lower_ctx.abi().call_conv()))
}
#[inline]
fn be_lane_idx(&mut self, ty: Type, idx: u8) -> u8 {
ty.lane_count() as u8 - 1 - idx
match self.lane_order().unwrap() {
LaneOrder::LittleEndian => ty.lane_count() as u8 - 1 - idx,
LaneOrder::BigEndian => idx,
}
}
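// be_lane_idx above: e.g. for I32X4 under little-endian lane order, CLIF
// lane 0 maps to big-endian (hardware) lane 3; under big-endian lane order
// the index is passed through unchanged.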
#[inline]
fn be_vec_const(&mut self, ty: Type, n: u128) -> u128 {
match self.lane_order().unwrap() {
LaneOrder::LittleEndian => n,
LaneOrder::BigEndian => {
let lane_count = ty.lane_count();
let lane_bits = ty.lane_bits();
let lane_mask = (1u128 << lane_bits) - 1;
let mut n_le = n;
let mut n_be = 0u128;
for _ in 0..lane_count {
n_be = (n_be << lane_bits) | (n_le & lane_mask);
n_le = n_le >> lane_bits;
}
n_be
}
}
}
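// be_vec_const above, worked example (assuming I32X4): the little-endian
// lane-order constant 0x00000004_00000003_00000002_00000001 (lanes 1, 2, 3, 4)
// is reassembled lane by lane into 0x00000001_00000002_00000003_00000004,
// i.e. lane 0 becomes the most significant lane, as big-endian lane order
// requires.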
#[inline]
@@ -419,17 +450,19 @@ where
#[inline]
fn shuffle_mask_from_u128(&mut self, idx: u128) -> (u128, u16) {
let bytes = idx.to_be_bytes();
let bytes = match self.lane_order().unwrap() {
LaneOrder::LittleEndian => idx.to_be_bytes().map(|x| {
if x < 16 {
15 - x
} else if x < 32 {
47 - x
} else {
128
}
}),
LaneOrder::BigEndian => idx.to_le_bytes().map(|x| if x < 32 { x } else { 128 }),
};
let and_mask = bytes.iter().fold(0, |acc, &x| (acc << 1) | (x < 32) as u16);
let bytes = bytes.map(|x| {
if x < 16 {
15 - x
} else if x < 32 {
47 - x
} else {
128
}
});
let permute_mask = u128::from_be_bytes(bytes);
(permute_mask, and_mask)
}
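// shuffle_mask_from_u128 above: the two branches read the CLIF mask bytes in
// opposite order (to_be_bytes vs. to_le_bytes) to match the lane order, and
// for little-endian lane order additionally remap byte index x to 15 - x
// (first input) or 47 - x (second input) to translate into the big-endian
// byte numbering of the permute instruction (presumably VPERM).  Indices
// outside 0..32 become 128, and and_mask records which result bytes selected
// a real input byte so the remaining bytes can be zeroed afterwards.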
@@ -813,6 +846,16 @@ where
}
}
/// Lane order to be used for a given calling convention.
#[inline]
fn lane_order_for_call_conv(call_conv: CallConv) -> LaneOrder {
if call_conv.extends_wasmtime() {
LaneOrder::LittleEndian
} else {
LaneOrder::BigEndian
}
}
/// Zero-extend the low `from_bits` bits of `value` to a full u64.
#[inline]
fn zero_extend_to_u64(value: u64, from_bits: u8) -> u64 {