AArch64: port misc ops to ISLE. (#4796)

* Add some precise-output compile tests for aarch64.

* AArch64: port misc ops to ISLE.

- get_pinned_reg / set_pinned_reg
- bitcast
- stack_addr
- extractlane
- insertlane
- vhigh_bits
- iadd_ifcout
- fcvt_low_from_sint

Author:    Chris Fallin
Date:      2022-08-29 12:56:39 -07:00 (committed by GitHub)
Parent:    6368c6b188
Commit:    a6eb24bd4f
18 changed files with 1362 additions and 662 deletions


@@ -2030,3 +2030,212 @@
;; N.B.: the Ret itself is generated by the ABI.
(rule (lower (return args))
(lower_return (range 0 (value_slice_len args)) args))
;;; Rules for `{get,set}_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
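;; `get_pinned_reg` reads the register that this backend reserves as the
;; pinned register, and `set_pinned_reg` writes it; the write is modeled
;; purely as a side effect, producing no result value.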
(rule (lower (get_pinned_reg))
(pinned_reg))
(rule (lower (set_pinned_reg val))
(side_effect (write_pinned_reg val)))
;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
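;; A bitcast between an integer type and a scalar float of the same width
;; leaves the bits unchanged; it only moves the value between the integer
;; and vector/FP register files.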
(rule (lower (has_type $I32 (bitcast src @ (value_type $F32))))
(mov_from_vec src 0 (ScalarSize.Size32)))
(rule (lower (has_type $F32 (bitcast src @ (value_type $I32))))
(mov_to_fpu src (ScalarSize.Size32)))
(rule (lower (has_type $I64 (bitcast src @ (value_type $F64))))
(mov_from_vec src 0 (ScalarSize.Size64)))
(rule (lower (has_type $F64 (bitcast src @ (value_type $I64))))
(mov_to_fpu src (ScalarSize.Size64)))
;;; Rules for `raw_bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (raw_bitcast val))
val)
;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; extractlane with lane 0 can pass through the value unchanged; upper
;; bits are undefined when a narrower type is in a wider register.
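;; For all other cases the lane is moved explicitly: integer lanes are
;; moved from the vector lane into a general-purpose register, while
;; float lanes are moved within the vector register file.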
(rule (lower (has_type (ty_scalar_float _) (extractlane val (u8_from_uimm8 0))))
val)
(rule (lower (has_type (ty_int_bool ty)
(extractlane val
(u8_from_uimm8 lane))))
(mov_from_vec val lane (scalar_size ty)))
(rule (lower (has_type (ty_scalar_float ty)
(extractlane val @ (value_type vty)
(u8_from_uimm8 lane))))
(fpu_move_from_vec val lane (vector_size vty)))
;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
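;; An integer lane value is inserted directly from a general-purpose
;; register; a float value is inserted by copying lane 0 of its own
;; (vector) register into the selected lane of the destination.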
(rule (lower (insertlane vec @ (value_type vty)
val @ (value_type (ty_int_bool _))
(u8_from_uimm8 lane)))
(mov_to_vec vec val lane (vector_size vty)))
(rule (lower (insertlane vec @ (value_type vty)
val @ (value_type (ty_scalar_float _))
(u8_from_uimm8 lane)))
(mov_vec_elem vec val lane 0 (vector_size vty)))
;;; Rules for `copy` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (copy x))
x)
;;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
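;; `compute_stack_addr` materializes the address of the given stack slot
;; plus the immediate offset into a register.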
(rule (lower (stack_addr stack_slot offset))
(compute_stack_addr stack_slot offset))
;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; The 8x16, 16x8, and 32x4 sequences each use one integer temporary
;; and two vector temporaries; the 64x2 case is handled with scalar
;; moves instead. The shift is done early so as to give the register
;; allocator the possibility of reusing the source vector's register
;; for the shifted temporary when this is the last use of the source.
;; See https://github.com/WebAssembly/simd/pull/201 for the background
;; and derivation of these sequences. Alternative sequences are
;; discussed in https://github.com/bytecodealliance/wasmtime/issues/2296,
;; although they are not used here.
(rule (lower (vhigh_bits vec @ (value_type $I8X16)))
(let (
;; Replicate the MSB of each of the 16 byte lanes across
;; the whole lane (sshr is an arithmetic right shift).
(shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 7 vec (VectorSize.Size8x16)))
;; Bitwise-and with a mask
;; `0x80402010_08040201_80402010_08040201` to get the bit
;; in the proper location for each group of 8 lanes.
(anded Reg (and_vec shifted (constant_f128 0x80402010_08040201_80402010_08040201) (VectorSize.Size8x16)))
;; Produce a version of `anded` with upper 8 lanes and
;; lower 8 lanes swapped.
(anded_swapped Reg (vec_extract anded anded 8))
;; Zip together the two; with the above this produces the lane permutation:
;; 15 7 14 6 13 5 12 4 11 3 10 2 9 1 8 0
(zipped Reg (zip1 anded anded_swapped (VectorSize.Size8x16)))
;; Add 16-bit lanes together ("add across vector"), so we
;; get, in the low 16 bits, 15+14+...+8 in the high byte
;; and 7+6+...+0 in the low byte. This effectively puts
;; the 16 MSBs together, giving our results.
;;
;; N.B.: `Size16x8` is not a typo!
(result Reg (addv zipped (VectorSize.Size16x8))))
(mov_from_vec result 0 (ScalarSize.Size16))))
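;; A worked example of the sequence above (input chosen purely for
;; illustration): suppose only byte lanes 0, 2, and 9 of `vec` have their
;; MSB set, so the expected result is (1 << 0) | (1 << 2) | (1 << 9) =
;; 0x0205.
;;  - after the sshr: byte lanes 0, 2, and 9 hold 0xFF, all others 0x00;
;;  - after the and:  lane 0 = 0x01, lane 2 = 0x04, lane 9 = 0x02;
;;  - after the ext/zip1: 16-bit lane 0 = 0x0001, lane 1 = 0x0200,
;;    lane 2 = 0x0004, and the rest are 0;
;;  - the addv sums the eight 16-bit lanes to 0x0205, which the final
;;    `mov_from_vec` moves into the integer result register.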
(rule (lower (vhigh_bits vec @ (value_type $I16X8)))
(let (
;; Replicate the MSB of each of the 8 16-bit lanes across
;; the whole lane (sshr is an arithmetic right shift).
(shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 15 vec (VectorSize.Size16x8)))
;; Bitwise-and with a mask
;; `0x0080_0040_0020_0010_0008_0004_0002_0001` to get the
;; bit in the proper location for each of the 8 lanes.
(anded Reg (and_vec shifted (constant_f128 0x0080_0040_0020_0010_0008_0004_0002_0001) (VectorSize.Size16x8)))
;; Add lanes together to get the 8 MSBs in the low byte.
(result Reg (addv anded (VectorSize.Size16x8))))
(mov_from_vec result 0 (ScalarSize.Size16))))
(rule (lower (vhigh_bits vec @ (value_type $I32X4)))
(let (
;; Replicate the MSB of each of the 4 32-bit lanes across
;; the whole lane (sshr is an arithmetic right shift).
(shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 31 vec (VectorSize.Size32x4)))
;; Bitwise-and with a mask
;; `0x00000008_00000004_00000002_00000001` to get the bit
;; in the proper location for each of the 4 lanes.
(anded Reg (and_vec shifted (constant_f128 0x00000008_00000004_00000002_00000001) (VectorSize.Size32x4)))
;; Add lanes together to get the 4 MSBs in the low byte.
(result Reg (addv anded (VectorSize.Size32x4))))
(mov_from_vec result 0 (ScalarSize.Size32))))
(rule (lower (vhigh_bits vec @ (value_type $I64X2)))
(let (
;; Grab the MSB out of each of the lanes, right-shift to
;; LSB, and add with a left-shift of the upper lane's MSB
;; back to bit 1.
(upper_msb Reg (mov_from_vec vec 1 (ScalarSize.Size64)))
(lower_msb Reg (mov_from_vec vec 0 (ScalarSize.Size64)))
(upper_msb Reg (lsr_imm $I64 upper_msb (imm_shift_from_u8 63)))
(lower_msb Reg (lsr_imm $I64 lower_msb (imm_shift_from_u8 63))))
(add_shift $I64 lower_msb upper_msb (lshl_from_u64 $I64 1))))
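;; For example, if lane 1's MSB is set and lane 0's MSB is clear, then
;; after the shifts `upper_msb` = 1 and `lower_msb` = 0, and the final
;; add-with-shift produces 0 + (1 << 1) = 0b10.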
;;; Rules for `iadd_ifcout` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This is a two-output instruction that is needed for the
;; legalizer's explicit heap-check sequence, among other possible
;; uses. Its second output is a flags value that is only ever meant
;; to be checked for overflow using the
;; `backend.unsigned_add_overflow_condition()` condition.
;;
;; Note that the CLIF validation will ensure that no flag-setting
;; operation comes between this IaddIfcout and its use (e.g., a
;; Trapif). Thus, we can rely on implicit communication through the
;; processor flags rather than explicitly generating flags into a
;; register. We simply use the variant of the add instruction that
;; sets flags (`adds`) here.
;;
;; Note that the second output (the flags) need not be generated,
;; because flags are never materialized into a register; the only
;; instructions that can use a value of type `iflags` or `fflags`
;; will look directly for the flags-producing instruction (which can
;; always be found, by construction) and merge it.
;;
;; Now handle the iadd as above, except use an AddS opcode that sets
;; flags.
(rule (lower (has_type (ty_int ty)
(iadd_ifcout a b)))
(output_pair
(add_with_flags ty a b)
(invalid_reg)))
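;; For example (a sketch of the intended use, not a rule in this file):
;; a heap bounds check that performs an `iadd_ifcout` followed by a
;; `trapif` on the unsigned-overflow condition lowers to an `adds` whose
;; NZCV flags are still live when the `trapif` is lowered, so the flags
;; never need to be materialized into a register.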
;;; Rules for `tls_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO.
;;; Rules for `fcvt_low_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
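;; Sign-extend the low two 32-bit lanes to 64 bits (sxtl), then convert
;; each 64-bit lane to a double (scvtf).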
(rule (lower (has_type $F64X2 (fcvt_low_from_sint val)))
(let ((extended Reg (vec_extend (VecExtendOp.Sxtl) val $false (ScalarSize.Size64)))
(converted Reg (vec_misc (VecMisc2.Scvtf) extended (VectorSize.Size64x2))))
converted))
;;; Rules for `fvpromote_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
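;; Convert the low two f32 lanes of the source to f64 (fcvtl); the
;; `$false` argument selects the low half of the input vector.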
(rule (lower (fvpromote_low val))
(vec_rr_long (VecRRLongOp.Fcvtl32) val $false))
;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO: requires icmp/fcmp first.
;;; Rules for `selectif` / `selectif_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO: requires icmp/fcmp first.
;;; Rules for `trueif` / `trueff` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO: requires icmp/fcmp first.
;;; Rules for `brz`/`brnz`/`brif`/`brff`/`bricmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO: requires icmp/fcmp first.
;;; Rules for `jump` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO.
;;; Rules for `br_table` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TODO.