s390x: Support both big- and little-endian vector lane order (#4682)

This implements the s390x back-end portion of the solution for
https://github.com/bytecodealliance/wasmtime/issues/4566

We now support both big- and little-endian vector lane order
in code generation.  The order used for a function is determined
by the function's ABI: if it uses a Wasmtime ABI, the function
uses little-endian lane order; otherwise, it uses big-endian
lane order.  (This ensures that all raw_bitcast instructions
generated by both Wasmtime and other Cranelift frontends can
always be implemented as no-ops.)
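
As a rough sketch, the selection can be pictured as follows.
This is illustrative only: the helper name and the exact
CallConv variant are assumptions, while LaneOrder mirrors the
enum the ISLE rules below match on.

    use cranelift_codegen::isa::CallConv;

    /// Lane order used by s390x vector lowering; matched in ISLE
    /// as LaneOrder.BigEndian / LaneOrder.LittleEndian.
    pub enum LaneOrder {
        BigEndian,
        LittleEndian,
    }

    /// Hypothetical helper: derive lane order from the ABI.
    /// Wasmtime ABIs get little-endian lane order; anything else
    /// (e.g. plain SystemV, as used by cg_clif) keeps the
    /// traditional big-endian order.
    fn lane_order_for_call_conv(call_conv: CallConv) -> LaneOrder {
        match call_conv {
            CallConv::WasmtimeSystemV => LaneOrder::LittleEndian,
            _ => LaneOrder::BigEndian,
        }
    }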

Lane order affects the implementation of a number of operations
(a sketch of the underlying index mapping follows this list):
- Vector immediates
- Vector memory load / store (in big- and little-endian variants)
- Operations explicitly using lane numbers
  (insertlane, extractlane, shuffle, swizzle)
- Operations implicitly using lane numbers
  (iadd_pairwise, narrow/widen, promote/demote, fcvt_low, vhigh_bits)
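
For the operations above, the change ultimately reduces to one
index translation between CLIF's little-endian lane numbering
and the machine's big-endian element numbering.  A minimal
sketch of that mapping (the name follows the be_lane_idx term
seen in the ISLE rules below; the Rust signature is
illustrative):

    /// Map a little-endian lane index to the corresponding
    /// big-endian element number: lane 0 is the rightmost
    /// element, i.e. element lane_count - 1.
    fn be_lane_idx(lane_count: u8, lane: u8) -> u8 {
        assert!(lane < lane_count);
        lane_count - 1 - lane
    }

    // Example: in an I8X16 vector, little-endian lane 0 is
    // big-endian element 15; in an I64X2 vector, little-endian
    // lane 1 is big-endian element 0.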

In addition, when calling a function that uses a different lane
order, we need to lane-swap all vector values passed or returned
in registers.
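
A sketch of that rule (vec_elt_rev names the element-reversal
expansion the backend already provides; the stub types exist
only to keep the sketch self-contained):

    #[derive(Clone, Copy, PartialEq)]
    enum LaneOrder { BigEndian, LittleEndian }

    // Illustrative stand-ins for backend types.
    #[derive(Clone, Copy)]
    struct Reg(u8);
    #[derive(Clone, Copy)]
    struct Type;

    /// Stub: the real backend emits an element reversal here.
    fn vec_elt_rev(_ty: Type, v: Reg) -> Reg { v }

    /// Hypothetical helper: lane-swap a vector value that
    /// crosses an ABI boundary with a different lane order.
    fn copy_vector_across_abi(
        caller: LaneOrder, callee: LaneOrder, ty: Type, v: Reg,
    ) -> Reg {
        if caller != callee { vec_elt_rev(ty, v) } else { v }
    }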

A small number of changes to common code were also needed:

- Ensure we always select a Wasmtime calling convention on s390x
  in crates/cranelift (func_signature).

- Fix vector immediates for filetests/runtests.  In PR #4427,
  I attempted to fix this by byte-swapping the V128 value, but
  with the new scheme, we'd instead need to perform a per-lane
  byte swap.  Since we do not know the actual type in write_to_slice
  and read_from_slice, this isn't easily possible.

  Revert that part of PR #4427, and instead just mark the
  memory buffer as little-endian when emitting the trampoline;
  the back-end will then emit correct code to load the constant
  (see the sketch after this list).

- Change a runtest in simd-bitselect-to-vselect.clif to no longer
  make little-endian lane order assumptions.

- Remove runtests in simd-swizzle.clif that make little-endian
  lane order assumptions by relying on implicit type conversion
  when using a non-i16x8 swizzle result type (this feature should
  probably be removed anyway).
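
Returning to the trampoline fix above: CLIF's per-access
endianness flag does the heavy lifting there.  A minimal sketch
of loading a V128 value from an explicitly little-endian buffer
(the helper and its wiring are illustrative; MemFlags's
set_endianness is the existing CLIF mechanism being relied on):

    use cranelift_codegen::ir::{self, InstBuilder, MemFlags};
    use cranelift_frontend::FunctionBuilder;

    /// Hypothetical helper: load a V128 out of a little-endian
    /// buffer.  Because the MemFlags say "little-endian", the
    /// s390x back-end emits the appropriate (byte- or
    /// element-reversing) load for its current lane order.
    fn load_v128_le(
        builder: &mut FunctionBuilder,
        addr: ir::Value,
        offset: i32,
    ) -> ir::Value {
        let mut flags = MemFlags::trusted();
        flags.set_endianness(ir::Endianness::Little);
        builder.ins().load(ir::types::I8X16, flags, addr, offset)
    }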

Tested with both Wasmtime and cg_clif.
Author: Ulrich Weigand
Date:   2022-08-11 21:10:46 +02:00 (committed by GitHub)
Parent: c1c48b4386
Commit: 67870d1518
29 changed files with 6584 additions and 593 deletions


@@ -39,7 +39,7 @@
;;;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty (vconst (u128_from_constant x))))
(vec_imm ty x))
(vec_imm ty (be_vec_const ty x)))
;;;; Rules for `null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -148,9 +148,9 @@
;; Lane-wise integer pairwise addition for 8-, 16-, or 32-bit vector lanes.
(rule (lower (has_type ty @ (multi_lane bits _) (iadd_pairwise x y)))
(let ((size Reg (vec_imm_splat $I8X16 (u32_as_u64 bits))))
(vec_pack (vec_widen_type ty)
(vec_add ty y (vec_lshr_by_byte y size))
(vec_add ty x (vec_lshr_by_byte x size)))))
(vec_pack_lane_order (vec_widen_type ty)
(vec_add ty x (vec_lshr_by_byte x size))
(vec_add ty y (vec_lshr_by_byte y size)))))
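;; (The splat of `bits` doubles as the shift amount: vec_lshr_by_byte
;; shifts the whole vector right by one lane, so each lane lines up with
;; its pair for the add; viewing the result as the widened type, the
;; lane-order-aware pack then picks out the lanes holding the sums.)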
;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -929,9 +929,14 @@
(put_in_reg_zext64 x))
;; 128-bit target types.
(rule (lower (has_type (vr128_ty _ty) (uextend x @ (value_type src_ty))))
(let ((ty Type (ty_vec128_from_lane_ty src_ty)))
(vec_insert_lane ty (vec_imm ty 0) x (be_lane_idx ty 0) (zero_reg))))
(rule (lower (has_type (vr128_ty ty) (uextend x @ (value_type $I8))))
(vec_insert_lane $I8X16 (vec_imm ty 0) x 15 (zero_reg)))
(rule (lower (has_type (vr128_ty ty) (uextend x @ (value_type $I16))))
(vec_insert_lane $I16X8 (vec_imm ty 0) x 7 (zero_reg)))
(rule (lower (has_type (vr128_ty ty) (uextend x @ (value_type $I32))))
(vec_insert_lane $I32X4 (vec_imm ty 0) x 3 (zero_reg)))
(rule (lower (has_type (vr128_ty ty) (uextend x @ (value_type $I64))))
(vec_insert_lane $I64X2 (vec_imm ty 0) x 1 (zero_reg)))
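;; (The indices 15, 7, 3 and 1 are the big-endian element numbers of the
;; least-significant lane; the layout of a 128-bit integer depends only on
;; byte order, not on vector lane order, so they are fixed constants here.)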
;;;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -953,44 +958,44 @@
;;;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (snarrow x @ (value_type (ty_vec128 ty)) y))
(vec_pack_ssat ty y x))
(vec_pack_ssat_lane_order ty x y))
;;;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (uunarrow x @ (value_type (ty_vec128 ty)) y))
(vec_pack_usat ty y x))
(vec_pack_usat_lane_order ty x y))
;;;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (unarrow x @ (value_type (ty_vec128 ty)) y))
(let ((zero Reg (vec_imm ty 0)))
(vec_pack_usat ty (vec_smax ty y zero) (vec_smax ty x zero))))
(vec_pack_usat_lane_order ty (vec_smax ty x zero) (vec_smax ty y zero))))
;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (swiden_low x @ (value_type (ty_vec128 ty))))
(vec_unpacks_low ty x))
(vec_unpacks_low_lane_order ty x))
;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (swiden_high x @ (value_type (ty_vec128 ty))))
(vec_unpacks_high ty x))
(vec_unpacks_high_lane_order ty x))
;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (uwiden_low x @ (value_type (ty_vec128 ty))))
(vec_unpacku_low ty x))
(vec_unpacku_low_lane_order ty x))
;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (uwiden_high x @ (value_type (ty_vec128 ty))))
(vec_unpacku_high ty x))
(vec_unpacku_high_lane_order ty x))
;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1583,7 +1588,7 @@
;; Promote a register.
(rule (lower (has_type $F64X2 (fvpromote_low x @ (value_type $F32X4))))
(fpromote_reg $F64X2 $F32X4 (vec_merge_low $I32X4 x x)))
(fpromote_reg $F64X2 $F32X4 (vec_merge_low_lane_order $I32X4 x x)))
;;;; Rules for `fdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1598,9 +1603,8 @@
;; Demote a register.
(rule (lower (has_type $F32X4 (fvdemote x @ (value_type $F64X2))))
(let ((dst Reg (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.Current) x)))
(vec_permute $F32X4 dst (vec_imm $F32X4 0)
(vec_imm $I8X16 (imm8x16 16 16 16 16 16 16 16 16
0 1 2 3 8 9 10 11)))))
(vec_pack_lane_order $I64X2 (vec_lshr_imm $I64X2 dst 32)
(vec_imm $I64X2 0))))
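;; (After the demote, each 64-bit lane holds its 32-bit result in the
;; high word; shifting right by 32 moves it to the low word, and the
;; lane-order-aware pack then merges those low words with a zero vector.)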
;;;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1690,7 +1694,7 @@
;; Convert the low half of a $I32X4 to a $F64X2.
(rule (lower (has_type $F64X2 (fcvt_low_from_sint x @ (value_type $I32X4))))
(fcvt_from_sint_reg $F64X2 (FpuRoundMode.ToNearestTiesToEven)
(vec_unpacks_low $I32X4 x)))
(vec_unpacks_low_lane_order $I32X4 x)))
;;;; Rules for `fcvt_to_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1845,7 +1849,12 @@
;;;; Rules for `raw_bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Raw bitcast is always a no-op.
;; FIXME: There are two flavors of raw_bitcast, which are currently not
;; distinguished in CLIF IR. Those generated by Wasmtime assume little-endian
;; lane order, and those generated elsewhere assume big-endian lane order.
;; Raw bitcast is a no-op if current lane order matches that assumed lane order.
;; However, due to our choice of lane order depending on the current function
;; ABI, every bitcast we currently see here is indeed a no-op.
(rule (lower (raw_bitcast x)) x)
@@ -2352,9 +2361,20 @@
;;;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; We need to modify the lane mask at runtime in two ways:
;; - convert from little-endian to big-endian lane numbering
;; - handle mask elements outside the range 0..15 by zeroing the lane
;; When using big-endian lane order, the lane mask is mostly correct, but we
;; need to handle mask elements outside the range 0..15 by zeroing the lane.
;;
;; To do so efficiently, we compute:
;; permute-lane-element := umin (16, swizzle-lane-element)
;; and pass a zero vector as second operand to the permute instruction.
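;; For example, an out-of-range mask element 20 becomes umin (16, 20) = 16,
;; which selects byte 0 of the zero operand, so that lane reads as zero;
;; in-range elements 0..15 are unchanged and select the bytes of x.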
(rule (lower (has_type (ty_vec128 ty) (swizzle x y)))
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_permute ty x (vec_imm ty 0)
(vec_umin $I8X16 (vec_imm_splat $I8X16 16) y)))
;; When using little-endian lane order, in addition to zeroing (as above),
;; we need to convert from little-endian to big-endian lane numbering.
;;
;; To do so efficiently, we compute:
;; permute-lane-element := umax (239, ~ swizzle-lane-element)
@@ -2368,6 +2388,7 @@
;; to implement the required swizzle semantics.
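;; For example, mask element 5 becomes ~5 = 250; the permute instruction
;; only uses the low five bits of each mask element, and 250 mod 32 = 26
;; selects byte 26 - 16 = 10 = 15 - 5 of x, i.e. the big-endian position
;; of little-endian lane 5.  Any element above 15 is clamped to 239 by the
;; umax, and 239 mod 32 = 15 selects byte 15 of the zero (first) operand,
;; zeroing the lane.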
(rule (lower (has_type (ty_vec128 ty) (swizzle x y)))
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_permute ty (vec_imm ty 0) x
(vec_umax $I8X16 (vec_imm_splat $I8X16 239)
(vec_not $I8X16 y))))
@@ -2485,18 +2506,36 @@
(rule (lower (has_type $F64 (load flags @ (littleendian) addr offset)))
(vec_load_lane_little_undef $F64X2 (lower_address flags addr offset) 0))
;; Load 128-bit big-endian vector values.
;; Load 128-bit big-endian vector values, BE lane order - direct load.
(rule (lower (has_type (vr128_ty ty) (load flags @ (bigendian) addr offset)))
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_load ty (lower_address flags addr offset)))
;; Load 128-bit little-endian vector values (z15 instruction).
(rule (lower (has_type (and (vxrs_ext2_enabled) (vr128_ty ty))
(load flags @ (littleendian) addr offset)))
;; Load 128-bit little-endian vector values, BE lane order - byte-reversed load.
(rule (lower (has_type (vr128_ty ty) (load flags @ (littleendian) addr offset)))
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_load_byte_rev ty flags addr offset))
;; Load 128-bit big-endian vector values, LE lane order - element-reversed load.
(rule (lower (has_type (vr128_ty ty) (load flags @ (bigendian) addr offset)))
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_load_elt_rev ty flags addr offset))
;; Load 128-bit little-endian vector values, LE lane order - fully-reversed load.
(rule (lower (has_type (vr128_ty ty) (load flags @ (littleendian) addr offset)))
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_load_full_rev ty flags addr offset))
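;; In summary, the four rules above cover the matrix of memory byte
;; order versus lane order:
;;                         BE lane order    LE lane order
;;   big-endian load       direct           element-reversed
;;   little-endian load    byte-reversed    fully-reversed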
;; Helper to perform a 128-bit full-vector byte-reversed load.
(decl vec_load_full_rev (Type MemFlags Value Offset32) Reg)
;; Full-vector byte-reversed load via single instruction on z15.
(rule (vec_load_full_rev (and (vxrs_ext2_enabled) (vr128_ty ty)) flags addr offset)
(vec_loadrev ty (lower_address flags addr offset)))
;; Load 128-bit little-endian vector values (via GPRs on z14).
(rule (lower (has_type (and (vxrs_ext2_disabled) (vr128_ty ty))
(load flags @ (littleendian) addr offset)))
;; Full-vector byte-reversed load via GPRs on z14.
(rule (vec_load_full_rev (and (vxrs_ext2_disabled) (vr128_ty ty)) flags addr offset)
(let ((lo_addr MemArg (lower_address_bias flags addr offset 0))
(hi_addr MemArg (lower_address_bias flags addr offset 8))
(lo_val Reg (loadrev64 lo_addr))
@@ -2504,6 +2543,75 @@
(mov_to_vec128 ty hi_val lo_val)))
;; Helper to perform an element-wise byte-reversed load.
(decl vec_load_byte_rev (Type MemFlags Value Offset32) Reg)
;; Element-wise byte-reversed 1x128-bit load is a full byte-reversed load.
(rule (vec_load_byte_rev $I128 flags addr offset)
(vec_load_full_rev $I128 flags addr offset))
;; Element-wise byte-reversed 16x8-bit load is a direct load.
(rule (vec_load_byte_rev ty @ (multi_lane 8 16) flags addr offset)
(vec_load ty (lower_address flags addr offset)))
;; Element-wise byte-reversed load via single instruction on z15.
(rule (vec_load_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 64 2))
flags addr offset)
(vec_load_byte64rev ty (lower_address flags addr offset)))
(rule (vec_load_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 32 4))
flags addr offset)
(vec_load_byte32rev ty (lower_address flags addr offset)))
(rule (vec_load_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 16 8))
flags addr offset)
(vec_load_byte16rev ty (lower_address flags addr offset)))
;; Element-wise byte-reversed load as element-swapped byte-reversed load on z14.
(rule (vec_load_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 64 2))
flags addr offset)
(vec_elt_rev ty (vec_load_full_rev ty flags addr offset)))
(rule (vec_load_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 32 4))
flags addr offset)
(vec_elt_rev ty (vec_load_full_rev ty flags addr offset)))
(rule (vec_load_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 16 8))
flags addr offset)
(vec_elt_rev ty (vec_load_full_rev ty flags addr offset)))
;; Helper to perform an element-reversed load.
(decl vec_load_elt_rev (Type MemFlags Value Offset32) Reg)
;; Element-reversed 1x128-bit load is a direct load.
(rule (vec_load_elt_rev $I128 flags addr offset)
(vec_load $I128 (lower_address flags addr offset)))
;; Element-reversed 16x8-bit load is a full byte-reversed load.
(rule (vec_load_elt_rev ty @ (multi_lane 8 16) flags addr offset)
(vec_load_full_rev ty flags addr offset))
;; Element-reversed load via single instruction on z15.
(rule (vec_load_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 64 2))
flags addr offset)
(vec_load_elt64rev ty (lower_address flags addr offset)))
(rule (vec_load_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 32 4))
flags addr offset)
(vec_load_elt32rev ty (lower_address flags addr offset)))
(rule (vec_load_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 16 8))
flags addr offset)
(vec_load_elt16rev ty (lower_address flags addr offset)))
;; Element-reversed load as element-swapped direct load on z14.
(rule (vec_load_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 64 2))
flags addr offset)
(vec_elt_rev ty (vec_load ty (lower_address flags addr offset))))
(rule (vec_load_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 32 4))
flags addr offset)
(vec_elt_rev ty (vec_load ty (lower_address flags addr offset))))
(rule (vec_load_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 16 8))
flags addr offset)
(vec_elt_rev ty (vec_load ty (lower_address flags addr offset))))
;;;; Rules for `uload8` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 16- or 32-bit target types.
@@ -2606,65 +2714,77 @@
;;;; Rules for `uloadNxM` and `sloadNxM` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Unsigned 8->16 bit extension, big-endian source value.
(rule (lower (has_type $I16X8 (uload8x8 flags @ (bigendian) addr offset)))
(vec_unpacku_high $I8X16
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))
;; Unsigned 8->16 bit extension.
(rule (lower (has_type $I16X8 (uload8x8 flags addr offset)))
(vec_unpacku_high $I8X16 (load_v64 $I8X16 flags addr offset)))
;; Unsigned 8->16 bit extension, little-endian source value.
(rule (lower (has_type $I16X8 (uload8x8 flags @ (littleendian) addr offset)))
(vec_unpacku_high $I8X16
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))
;; Signed 8->16 bit extension.
(rule (lower (has_type $I16X8 (sload8x8 flags addr offset)))
(vec_unpacks_high $I8X16 (load_v64 $I8X16 flags addr offset)))
;; Signed 8->16 bit extension, big-endian source value.
(rule (lower (has_type $I16X8 (sload8x8 flags @ (bigendian) addr offset)))
(vec_unpacks_high $I8X16
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))
;; Unsigned 16->32 bit extension.
(rule (lower (has_type $I32X4 (uload16x4 flags addr offset)))
(vec_unpacku_high $I16X8 (load_v64 $I16X8 flags addr offset)))
;; Signed 8->16 bit extension, little-endian source value.
(rule (lower (has_type $I16X8 (sload8x8 flags @ (littleendian) addr offset)))
(vec_unpacks_high $I8X16
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))
;; Signed 16->32 bit extension.
(rule (lower (has_type $I32X4 (sload16x4 flags addr offset)))
(vec_unpacks_high $I16X8 (load_v64 $I16X8 flags addr offset)))
;; Unsigned 16->32 bit extension, big-endian source value.
(rule (lower (has_type $I32X4 (uload16x4 flags @ (bigendian) addr offset)))
(vec_unpacku_high $I16X8
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))
;; Unsigned 32->64 bit extension.
(rule (lower (has_type $I64X2 (uload32x2 flags addr offset)))
(vec_unpacku_high $I32X4 (load_v64 $I32X4 flags addr offset)))
;; Unsigned 16->32 bit extension, little-endian source value.
(rule (lower (has_type $I32X4 (uload16x4 flags @ (littleendian) addr offset)))
(vec_unpacku_high $I16X8
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))
;; Signed 32->64 bit extension.
(rule (lower (has_type $I64X2 (sload32x2 flags addr offset)))
(vec_unpacks_high $I32X4 (load_v64 $I32X4 flags addr offset)))
;; Signed 16->32 bit extension, big-endian source value.
(rule (lower (has_type $I32X4 (sload16x4 flags @ (bigendian) addr offset)))
(vec_unpacks_high $I16X8
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))
;; Signed 16->32 bit extension, little-endian source value.
(rule (lower (has_type $I32X4 (sload16x4 flags @ (littleendian) addr offset)))
(vec_unpacks_high $I16X8
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))
;; Helper to load a 64-bit half-size vector from memory.
(decl load_v64 (Type MemFlags Value Offset32) Reg)
;; Unsigned 32->64 bit extension, big-endian source value.
(rule (lower (has_type $I64X2 (uload32x2 flags @ (bigendian) addr offset)))
(vec_unpacku_high $I32X4
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))
;; Any big-endian source value, BE lane order.
(rule (load_v64 _ flags @ (bigendian) addr offset)
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))
;; Unsigned 32->64 bit extension, little-endian source value.
(rule (lower (has_type $I64X2 (uload32x2 flags @ (littleendian) addr offset)))
(vec_unpacku_high $I32X4
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))
;; Any little-endian source value, LE lane order.
(rule (load_v64 _ flags @ (littleendian) addr offset)
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))
;; Signed 32->64 bit extension, big-endian source value.
(rule (lower (has_type $I64X2 (sload32x2 flags @ (bigendian) addr offset)))
(vec_unpacks_high $I32X4
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))
;; Big-endian or little-endian 8x8-bit source value, BE lane order.
(rule (load_v64 (multi_lane 8 16) flags addr offset)
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))
;; Signed 32->64 bit extension, little-endian source value.
(rule (lower (has_type $I64X2 (sload32x2 flags @ (littleendian) addr offset)))
(vec_unpacks_high $I32X4
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))
;; Big-endian or little-endian 8x8-bit source value, LE lane order.
(rule (load_v64 (multi_lane 8 16) flags addr offset)
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))
;; Little-endian 4x16-bit source value, BE lane order.
(rule (load_v64 (multi_lane 16 8) flags @ (littleendian) addr offset)
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_rot_imm $I16X8
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0) 8))
;; Big-endian 4x16-bit source value, LE lane order.
(rule (load_v64 (multi_lane 16 8) flags @ (bigendian) addr offset)
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_rot_imm $I16X8
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0) 8))
;; Little-endian 2x32-bit source value, BE lane order.
(rule (load_v64 (multi_lane 32 4) flags @ (littleendian) addr offset)
(if-let (LaneOrder.BigEndian) (lane_order))
(vec_rot_imm $I64X2
(vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0) 32))
;; Big-endian 2x32-bit source value, LE lane order.
(rule (load_v64 (multi_lane 32 4) flags @ (bigendian) addr offset)
(if-let (LaneOrder.LittleEndian) (lane_order))
(vec_rot_imm $I64X2
(vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0) 32))
;;;; Rules for `store` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2716,25 +2836,114 @@
(side_effect (vec_store_lane_little $F64X2 val
(lower_address flags addr offset) 0)))
;; Store 128-bit big-endian vector type.
;; Store 128-bit big-endian vector type, BE lane order - direct store.
(rule (lower (store flags @ (bigendian)
val @ (value_type (vr128_ty ty)) addr offset))
(if-let (LaneOrder.BigEndian) (lane_order))
(side_effect (vec_store val (lower_address flags addr offset))))
;; Store 128-bit little-endian vector type (z15 instruction).
;; Store 128-bit little-endian vector type, BE lane order - byte-reversed store.
(rule (lower (store flags @ (littleendian)
val @ (value_type (and (vr128_ty ty) (vxrs_ext2_enabled))) addr offset))
(side_effect (vec_storerev val (lower_address flags addr offset))))
val @ (value_type (vr128_ty ty)) addr offset))
(if-let (LaneOrder.BigEndian) (lane_order))
(side_effect (vec_store_byte_rev ty val flags addr offset)))
;; Store 128-bit little-endian vector type (via GPRs on z14).
;; Store 128-bit big-endian vector type, LE lane order - element-reversed store.
(rule (lower (store flags @ (bigendian)
val @ (value_type (vr128_ty ty)) addr offset))
(if-let (LaneOrder.LittleEndian) (lane_order))
(side_effect (vec_store_elt_rev ty val flags addr offset)))
;; Store 128-bit little-endian vector type, LE lane order - fully-reversed store.
(rule (lower (store flags @ (littleendian)
val @ (value_type (and (vr128_ty ty) (vxrs_ext2_disabled))) addr offset))
val @ (value_type (vr128_ty ty)) addr offset))
(if-let (LaneOrder.LittleEndian) (lane_order))
(side_effect (vec_store_full_rev ty val flags addr offset)))
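;; As with loads, the four rules above cover the matrix of memory byte
;; order versus lane order:
;;                         BE lane order    LE lane order
;;   big-endian store      direct           element-reversed
;;   little-endian store   byte-reversed    fully-reversed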
;; Helper to perform a 128-bit full-vector byte-reversed store.
(decl vec_store_full_rev (Type Reg MemFlags Value Offset32) SideEffectNoResult)
;; Full-vector byte-reversed store via single instruction on z15.
(rule (vec_store_full_rev (vxrs_ext2_enabled) val flags addr offset)
(vec_storerev val (lower_address flags addr offset)))
;; Full-vector byte-reversed store via GPRs on z14.
(rule (vec_store_full_rev (vxrs_ext2_disabled) val flags addr offset)
(let ((lo_addr MemArg (lower_address_bias flags addr offset 0))
(hi_addr MemArg (lower_address_bias flags addr offset 8))
(lo_val Reg (vec_extract_lane $I64X2 val 1 (zero_reg)))
(hi_val Reg (vec_extract_lane $I64X2 val 0 (zero_reg))))
(side_effect (side_effect_concat (storerev64 lo_val lo_addr)
(storerev64 hi_val hi_addr)))))
(side_effect_concat (storerev64 lo_val lo_addr)
(storerev64 hi_val hi_addr))))
;; Helper to perform an element-wise byte-reversed store.
(decl vec_store_byte_rev (Type Reg MemFlags Value Offset32) SideEffectNoResult)
;; Element-wise byte-reversed 1x128-bit store is a full byte-reversed store.
(rule (vec_store_byte_rev $I128 val flags addr offset)
(vec_store_full_rev $I128 val flags addr offset))
;; Element-wise byte-reversed 16x8-bit store is a direct store.
(rule (vec_store_byte_rev (multi_lane 8 16) val flags addr offset)
(vec_store val (lower_address flags addr offset)))
;; Element-wise byte-reversed store via single instruction on z15.
(rule (vec_store_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 64 2))
val flags addr offset)
(vec_store_byte64rev val (lower_address flags addr offset)))
(rule (vec_store_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 32 4))
val flags addr offset)
(vec_store_byte32rev val (lower_address flags addr offset)))
(rule (vec_store_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 16 8))
val flags addr offset)
(vec_store_byte16rev val (lower_address flags addr offset)))
;; Element-wise byte-reversed store as element-swapped byte-reversed store on z14.
(rule (vec_store_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 64 2))
val flags addr offset)
(vec_store_full_rev ty (vec_elt_rev ty val) flags addr offset))
(rule (vec_store_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 32 4))
val flags addr offset)
(vec_store_full_rev ty (vec_elt_rev ty val) flags addr offset))
(rule (vec_store_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 16 8))
val flags addr offset)
(vec_store_full_rev ty (vec_elt_rev ty val) flags addr offset))
;; Helper to perform an element-reversed store.
(decl vec_store_elt_rev (Type Reg MemFlags Value Offset32) SideEffectNoResult)
;; Element-reversed 1x128-bit store is a direct store.
(rule (vec_store_elt_rev $I128 val flags addr offset)
(vec_store val (lower_address flags addr offset)))
;; Element-reversed 16x8-bit store is a full byte-reversed store.
(rule (vec_store_elt_rev ty @ (multi_lane 8 16) val flags addr offset)
(vec_store_full_rev ty val flags addr offset))
;; Element-reversed store via single instruction on z15.
(rule (vec_store_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 64 2))
val flags addr offset)
(vec_store_elt64rev val (lower_address flags addr offset)))
(rule (vec_store_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 32 4))
val flags addr offset)
(vec_store_elt32rev val (lower_address flags addr offset)))
(rule (vec_store_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 16 8))
val flags addr offset)
(vec_store_elt16rev val (lower_address flags addr offset)))
;; Element-reversed store as element-swapped direct store on z14.
(rule (vec_store_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 64 2))
val flags addr offset)
(vec_store (vec_elt_rev ty val) (lower_address flags addr offset)))
(rule (vec_store_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 32 4))
val flags addr offset)
(vec_store (vec_elt_rev ty val) (lower_address flags addr offset)))
(rule (vec_store_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 16 8))
val flags addr offset)
(vec_store (vec_elt_rev ty val) (lower_address flags addr offset)))
;;;; Rules for 8-bit integer stores ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -3591,24 +3800,48 @@
;;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
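;; All of these rules use vec_bitpermute (VBPERM): each byte of the mask
;; selects one bit of the source vector, numbered 0..127 from the left,
;; and an index of 128 or above yields a zero bit.  The masks below thus
;; collect each lane's most-significant bit in the order implied by the
;; current lane order, padding the unused high result bits with 128s.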
(rule (lower (vhigh_bits x @ (value_type (multi_lane 8 16))))
(if-let (LaneOrder.LittleEndian) (lane_order))
(let ((mask Reg (vec_imm $I8X16 (imm8x16 0 8 16 24 32 40 48 56
64 72 80 88 96 104 112 120))))
(vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))
(rule (lower (vhigh_bits x @ (value_type (multi_lane 8 16))))
(if-let (LaneOrder.BigEndian) (lane_order))
(let ((mask Reg (vec_imm $I8X16 (imm8x16 120 112 104 96 88 80 72 64
56 48 40 32 24 16 8 0))))
(vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))
(rule (lower (vhigh_bits x @ (value_type (multi_lane 16 8))))
(if-let (LaneOrder.LittleEndian) (lane_order))
(let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128
0 16 32 48 64 80 96 112))))
(vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))
(rule (lower (vhigh_bits x @ (value_type (multi_lane 16 8))))
(if-let (LaneOrder.BigEndian) (lane_order))
(let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128
112 96 80 64 48 32 16 0))))
(vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))
(rule (lower (vhigh_bits x @ (value_type (multi_lane 32 4))))
(if-let (LaneOrder.LittleEndian) (lane_order))
(let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128
128 128 128 128 0 32 64 96))))
(vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))
(rule (lower (vhigh_bits x @ (value_type (multi_lane 32 4))))
(if-let (LaneOrder.BigEndian) (lane_order))
(let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128
128 128 128 128 96 64 32 0))))
(vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))
(rule (lower (vhigh_bits x @ (value_type (multi_lane 64 2))))
(if-let (LaneOrder.LittleEndian) (lane_order))
(let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128
128 128 128 128 128 128 0 64))))
(vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))
(rule (lower (vhigh_bits x @ (value_type (multi_lane 64 2))))
(if-let (LaneOrder.BigEndian) (lane_order))
(let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128
128 128 128 128 128 128 64 0))))
(vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))
;;;; Rules for `is_null` and `is_invalid` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -3870,7 +4103,8 @@
(decl lower_call_args_slots (ABISig Range ValueSlice) InstOutput)
(rule (lower_call_args_slots abi (range_empty) _) (output_none))
(rule (lower_call_args_slots abi (range_unwrap head tail) args)
(let ((_ Unit (copy_to_arg 0 (abi_get_arg abi head)
(let ((_ Unit (copy_to_arg (abi_lane_order abi)
0 (abi_get_arg abi head)
(value_slice_get args head))))
(lower_call_args_slots abi tail args)))
@@ -3886,7 +4120,9 @@
(decl lower_call_rets (ABISig Range InstOutputBuilder) InstOutput)
(rule (lower_call_rets abi (range_empty) builder) (output_builder_finish builder))
(rule (lower_call_rets abi (range_unwrap head tail) builder)
(let ((ret ValueRegs (copy_from_arg (abi_sized_stack_arg_space abi) (abi_get_ret abi head)))
(let ((ret ValueRegs (copy_from_arg (abi_lane_order abi)
(abi_sized_stack_arg_space abi)
(abi_get_ret abi head)))
(_ Unit (output_builder_push builder ret)))
(lower_call_rets abi tail builder)))