;; s390x instruction selection and CLIF-to-MachInst lowering.

;; The main lowering constructor term: takes a clif `Inst` and returns the
;; register(s) within which the lowered instruction's result values live.
(decl lower (Inst) InstOutput)

;; A variant of the main lowering constructor term, used for branches.
;; The only difference is that it gets an extra argument holding a vector
;; of branch targets to be used.
(decl lower_branch (Inst VecMachLabel) InstOutput)


;;;; Rules for `iconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type ty (iconst (u64_from_imm64 n))))
      (imm ty n))


;;;; Rules for `bconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type ty (bconst $false)))
      (imm ty 0))

(rule (lower (has_type ty (bconst $true)))
      (imm ty 1))


;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (f32const (u64_from_ieee32 x)))
      (imm $F32 x))


;;;; Rules for `f64const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (f64const (u64_from_ieee64 x)))
      (imm $F64 x))


;;;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type ty (vconst (u128_from_constant x))))
      (vec_imm ty x))


;;;; Rules for `null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type ty (null)))
      (imm ty 0))


;;;; Rules for `nop` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (nop))
      (invalid_reg))


;;;; Rules for `copy` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (copy x))
      x)


;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Add two registers.
(rule (lower (has_type (fits_in_64 ty) (iadd x y)))
      (add_reg ty x y))

;; Add a register and a sign-extended register.
(rule (lower (has_type (fits_in_64 ty) (iadd x (sext32_value y))))
      (add_reg_sext32 ty x y))
(rule (lower (has_type (fits_in_64 ty) (iadd (sext32_value x) y)))
      (add_reg_sext32 ty y x))

;; Add a register and an immediate.
(rule (lower (has_type (fits_in_64 ty) (iadd x (i16_from_value y))))
      (add_simm16 ty x y))
(rule (lower (has_type (fits_in_64 ty) (iadd (i16_from_value x) y)))
      (add_simm16 ty y x))
(rule (lower (has_type (fits_in_64 ty) (iadd x (i32_from_value y))))
      (add_simm32 ty x y))
(rule (lower (has_type (fits_in_64 ty) (iadd (i32_from_value x) y)))
      (add_simm32 ty y x))

;; Add a register and memory (32/64-bit types).
(rule (lower (has_type (fits_in_64 ty) (iadd x (sinkable_load_32_64 y))))
      (add_mem ty x (sink_load y)))
(rule (lower (has_type (fits_in_64 ty) (iadd (sinkable_load_32_64 x) y)))
      (add_mem ty y (sink_load x)))

;; Add a register and memory (16-bit types).
(rule (lower (has_type (fits_in_64 ty) (iadd x (sinkable_load_16 y))))
      (add_mem_sext16 ty x (sink_load y)))
(rule (lower (has_type (fits_in_64 ty) (iadd (sinkable_load_16 x) y)))
      (add_mem_sext16 ty y (sink_load x)))

;; Add a register and sign-extended memory.
(rule (lower (has_type (fits_in_64 ty) (iadd x (sinkable_sload16 y))))
      (add_mem_sext16 ty x (sink_sload16 y)))
(rule (lower (has_type (fits_in_64 ty) (iadd (sinkable_sload16 x) y)))
      (add_mem_sext16 ty y (sink_sload16 x)))
(rule (lower (has_type (fits_in_64 ty) (iadd x (sinkable_sload32 y))))
      (add_mem_sext32 ty x (sink_sload32 y)))
(rule (lower (has_type (fits_in_64 ty) (iadd (sinkable_sload32 x) y)))
      (add_mem_sext32 ty y (sink_sload32 x)))

;; Add two vector registers.
(rule (lower (has_type (ty_vec128 ty) (iadd x y)))
      (vec_add ty x y))


;;;; Rules for `uadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Add (saturate unsigned) two vector registers.
(rule (lower (has_type (ty_vec128 ty) (uadd_sat x y)))
      (let ((sum Reg (vec_add ty x y)))
        (vec_or ty sum (vec_cmphl ty x sum))))
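
;; A lane overflows precisely when the wrapping sum is unsigned-less-than
;; either addend, which is what the compare detects: `vec_cmphl` yields an
;; all-ones mask where x is unsigned-greater-than the sum, and the OR then
;; saturates exactly those lanes to the all-ones (maximum) value.  For
;; example, with $I8X16 lanes x = 200 and y = 100, the wrapping sum is 44,
;; the mask is 0xff, and the result is 44 | 0xff = 255.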


;;;; Rules for `sadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Add (saturate signed) two vector registers.  $I64X2 not supported.
(rule (lower (has_type (ty_vec128 ty) (sadd_sat x y)))
      (vec_pack_ssat (vec_widen_type ty)
                     (vec_add (vec_widen_type ty) (vec_unpacks_high ty x)
                              (vec_unpacks_high ty y))
                     (vec_add (vec_widen_type ty) (vec_unpacks_low ty x)
                              (vec_unpacks_low ty y))))
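
;; The widened type always holds the exact sum, so only the final pack needs
;; to saturate.  For example, adding the $I8X16 lanes 100 + 100 in the
;; widened $I16X8 type yields 200, which `vec_pack_ssat` clamps to the
;; signed 8-bit maximum 127.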


;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Lane-wise integer pairwise addition for 8-, 16-, or 32-bit vector registers.
(rule (lower (has_type ty @ (multi_lane bits _) (iadd_pairwise x y)))
      (let ((size Reg (vec_imm_splat $I8X16 (u32_as_u64 bits))))
        (vec_pack (vec_widen_type ty)
                  (vec_add ty y (vec_lshr_by_byte y size))
                  (vec_add ty x (vec_lshr_by_byte x size)))))


;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Sub two registers.
(rule (lower (has_type (fits_in_64 ty) (isub x y)))
      (sub_reg ty x y))

;; Sub a register and a sign-extended register.
(rule (lower (has_type (fits_in_64 ty) (isub x (sext32_value y))))
      (sub_reg_sext32 ty x y))

;; Sub a register and an immediate (using add of the negated value).
(rule (lower (has_type (fits_in_64 ty) (isub x (i16_from_negated_value y))))
      (add_simm16 ty x y))
(rule (lower (has_type (fits_in_64 ty) (isub x (i32_from_negated_value y))))
      (add_simm32 ty x y))

;; Sub a register and memory (32/64-bit types).
(rule (lower (has_type (fits_in_64 ty) (isub x (sinkable_load_32_64 y))))
      (sub_mem ty x (sink_load y)))

;; Sub a register and memory (16-bit types).
(rule (lower (has_type (fits_in_64 ty) (isub x (sinkable_load_16 y))))
      (sub_mem_sext16 ty x (sink_load y)))

;; Sub a register and sign-extended memory.
(rule (lower (has_type (fits_in_64 ty) (isub x (sinkable_sload16 y))))
      (sub_mem_sext16 ty x (sink_sload16 y)))
(rule (lower (has_type (fits_in_64 ty) (isub x (sinkable_sload32 y))))
      (sub_mem_sext32 ty x (sink_sload32 y)))

;; Sub two vector registers.
(rule (lower (has_type (ty_vec128 ty) (isub x y)))
      (vec_sub ty x y))


;;;; Rules for `usub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Subtract (saturate unsigned) two vector registers.
(rule (lower (has_type (ty_vec128 ty) (usub_sat x y)))
      (vec_and ty (vec_sub ty x y) (vec_cmphl ty x y)))
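
;; The `vec_cmphl` mask is all-ones only where x is unsigned-greater-than y,
;; i.e. exactly where the wrapping difference is already correct; the AND
;; clamps all other lanes to zero.  For example, with $I8X16 lanes x = 5 and
;; y = 9, x - y wraps to 252, the mask is 0, and the result is the saturated
;; value 0.  (A lane with x == y is zero either way.)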


;;;; Rules for `ssub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Subtract (saturate signed) two vector registers.  $I64X2 not supported.
(rule (lower (has_type (ty_vec128 ty) (ssub_sat x y)))
      (vec_pack_ssat (vec_widen_type ty)
                     (vec_sub (vec_widen_type ty) (vec_unpacks_high ty x)
                              (vec_unpacks_high ty y))
                     (vec_sub (vec_widen_type ty) (vec_unpacks_low ty x)
                              (vec_unpacks_low ty y))))


;;;; Rules for `iabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Absolute value of a register.
;; For types smaller than 32-bit, the input value must be sign-extended.
(rule (lower (has_type (fits_in_64 ty) (iabs x)))
      (abs_reg (ty_ext32 ty) (put_in_reg_sext32 x)))

;; Absolute value of a sign-extended register.
(rule (lower (has_type (fits_in_64 ty) (iabs (sext32_value x))))
      (abs_reg_sext32 ty x))

;; Absolute value of a vector register.
(rule (lower (has_type (ty_vec128 ty) (iabs x)))
      (vec_abs ty x))


;;;; Rules for `iadd_ifcout` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; N.B.: the second output of `iadd_ifcout` is meant to be the `iflags` value
;; containing the carry result, but we do not support the `iflags` mechanism.
;; However, the only actual use case is where `iadd_ifcout` feeds into `trapif`,
;; which is implemented by explicitly matching on the flags producer.  So we can
;; get away with just using an invalid second output, and the reg-renaming code
;; does the right thing, for now.
(decl output_ifcout (Reg) InstOutput)
(rule (output_ifcout reg)
      (output_pair reg (value_regs_invalid)))

;; Add two registers.
(rule (lower (has_type (fits_in_64 ty) (iadd_ifcout x y)))
      (output_ifcout (add_logical_reg ty x y)))

;; Add a register and a zero-extended register.
(rule (lower (has_type (fits_in_64 ty) (iadd_ifcout x (zext32_value y))))
      (output_ifcout (add_logical_reg_zext32 ty x y)))
(rule (lower (has_type (fits_in_64 ty) (iadd_ifcout (zext32_value x) y)))
      (output_ifcout (add_logical_reg_zext32 ty y x)))

;; Add a register and an immediate.
(rule (lower (has_type (fits_in_64 ty) (iadd_ifcout x (u32_from_value y))))
      (output_ifcout (add_logical_zimm32 ty x y)))
(rule (lower (has_type (fits_in_64 ty) (iadd_ifcout (u32_from_value x) y)))
      (output_ifcout (add_logical_zimm32 ty y x)))

;; Add a register and memory (32/64-bit types).
(rule (lower (has_type (fits_in_64 ty) (iadd_ifcout x (sinkable_load_32_64 y))))
      (output_ifcout (add_logical_mem ty x (sink_load y))))
(rule (lower (has_type (fits_in_64 ty) (iadd_ifcout (sinkable_load_32_64 x) y)))
      (output_ifcout (add_logical_mem ty y (sink_load x))))

;; Add a register and zero-extended memory.
(rule (lower (has_type (fits_in_64 ty) (iadd_ifcout x (sinkable_uload32 y))))
      (output_ifcout (add_logical_mem_zext32 ty x (sink_uload32 y))))
(rule (lower (has_type (fits_in_64 ty) (iadd_ifcout (sinkable_uload32 x) y)))
      (output_ifcout (add_logical_mem_zext32 ty y (sink_uload32 x))))


;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Negate a register.
(rule (lower (has_type (fits_in_64 ty) (ineg x)))
      (neg_reg ty x))

;; Negate a sign-extended register.
(rule (lower (has_type (fits_in_64 ty) (ineg (sext32_value x))))
      (neg_reg_sext32 ty x))

;; Negate a vector register.
(rule (lower (has_type (ty_vec128 ty) (ineg x)))
      (vec_neg ty x))


;;;; Rules for `umax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Unsigned maximum of two vector registers.
(rule (lower (has_type (ty_vec128 ty) (umax x y)))
      (vec_umax ty x y))


;;;; Rules for `umin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Unsigned minimum of two vector registers.
(rule (lower (has_type (ty_vec128 ty) (umin x y)))
      (vec_umin ty x y))


;;;; Rules for `imax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Signed maximum of two vector registers.
(rule (lower (has_type (ty_vec128 ty) (imax x y)))
      (vec_smax ty x y))


;;;; Rules for `imin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Signed minimum of two vector registers.
(rule (lower (has_type (ty_vec128 ty) (imin x y)))
      (vec_smin ty x y))


;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Unsigned average of two vector registers.
(rule (lower (has_type (ty_vec128 ty) (avg_round x y)))
      (vec_uavg ty x y))


;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Multiply two registers.
(rule (lower (has_type (fits_in_64 ty) (imul x y)))
      (mul_reg ty x y))

;; Multiply a register and a sign-extended register.
(rule (lower (has_type (fits_in_64 ty) (imul x (sext32_value y))))
      (mul_reg_sext32 ty x y))
(rule (lower (has_type (fits_in_64 ty) (imul (sext32_value x) y)))
      (mul_reg_sext32 ty y x))

;; Multiply a register and an immediate.
(rule (lower (has_type (fits_in_64 ty) (imul x (i16_from_value y))))
      (mul_simm16 ty x y))
(rule (lower (has_type (fits_in_64 ty) (imul (i16_from_value x) y)))
      (mul_simm16 ty y x))
(rule (lower (has_type (fits_in_64 ty) (imul x (i32_from_value y))))
      (mul_simm32 ty x y))
(rule (lower (has_type (fits_in_64 ty) (imul (i32_from_value x) y)))
      (mul_simm32 ty y x))

;; Multiply a register and memory (32/64-bit types).
(rule (lower (has_type (fits_in_64 ty) (imul x (sinkable_load_32_64 y))))
      (mul_mem ty x (sink_load y)))
(rule (lower (has_type (fits_in_64 ty) (imul (sinkable_load_32_64 x) y)))
      (mul_mem ty y (sink_load x)))

;; Multiply a register and memory (16-bit types).
(rule (lower (has_type (fits_in_64 ty) (imul x (sinkable_load_16 y))))
      (mul_mem_sext16 ty x (sink_load y)))
(rule (lower (has_type (fits_in_64 ty) (imul (sinkable_load_16 x) y)))
      (mul_mem_sext16 ty y (sink_load x)))

;; Multiply a register and sign-extended memory.
(rule (lower (has_type (fits_in_64 ty) (imul x (sinkable_sload16 y))))
      (mul_mem_sext16 ty x (sink_sload16 y)))
(rule (lower (has_type (fits_in_64 ty) (imul (sinkable_sload16 x) y)))
      (mul_mem_sext16 ty y (sink_sload16 x)))
(rule (lower (has_type (fits_in_64 ty) (imul x (sinkable_sload32 y))))
      (mul_mem_sext32 ty x (sink_sload32 y)))
(rule (lower (has_type (fits_in_64 ty) (imul (sinkable_sload32 x) y)))
      (mul_mem_sext32 ty y (sink_sload32 x)))

;; Multiply two vector registers, using a helper.
(decl vec_mul_impl (Type Reg Reg) Reg)
(rule (lower (has_type (ty_vec128 ty) (imul x y)))
      (vec_mul_impl ty x y))

;; Multiply two vector registers - byte, halfword, and word.
(rule (vec_mul_impl $I8X16 x y) (vec_mul $I8X16 x y))
(rule (vec_mul_impl $I16X8 x y) (vec_mul $I16X8 x y))
(rule (vec_mul_impl $I32X4 x y) (vec_mul $I32X4 x y))

;; Multiply two vector registers - doubleword.  Has to be scalarized.
(rule (vec_mul_impl $I64X2 x y)
      (mov_to_vec128 $I64X2
                     (mul_reg $I64 (vec_extract_lane $I64X2 x 0 (zero_reg))
                              (vec_extract_lane $I64X2 y 0 (zero_reg)))
                     (mul_reg $I64 (vec_extract_lane $I64X2 x 1 (zero_reg))
                              (vec_extract_lane $I64X2 y 1 (zero_reg)))))


;;;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Multiply high part unsigned, 8-bit or 16-bit types.  (Uses 32-bit multiply.)
(rule (lower (has_type (ty_8_or_16 ty) (umulhi x y)))
      (let ((ext_reg_x Reg (put_in_reg_zext32 x))
            (ext_reg_y Reg (put_in_reg_zext32 y))
            (ext_mul Reg (mul_reg $I32 ext_reg_x ext_reg_y)))
        (lshr_imm $I32 ext_mul (ty_bits ty))))
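
;; For example, for $I8 inputs x = y = 200, the zero-extended 32-bit product
;; is 40000 (0x9c40); shifting right by ty_bits = 8 leaves 0x9c = 156, the
;; high byte of the 16-bit product.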

;; Multiply high part unsigned, 32-bit types.  (Uses 64-bit multiply.)
(rule (lower (has_type $I32 (umulhi x y)))
      (let ((ext_reg_x Reg (put_in_reg_zext64 x))
            (ext_reg_y Reg (put_in_reg_zext64 y))
            (ext_mul Reg (mul_reg $I64 ext_reg_x ext_reg_y)))
        (lshr_imm $I64 ext_mul 32)))

;; Multiply high part unsigned, 64-bit types.  (Uses umul_wide.)
(rule (lower (has_type $I64 (umulhi x y)))
      (let ((pair RegPair (umul_wide x y)))
        (copy_reg $I64 (regpair_hi pair))))

;; Multiply high part unsigned, vector types with 8-, 16-, or 32-bit elements.
(rule (lower (has_type $I8X16 (umulhi x y))) (vec_umulhi $I8X16 x y))
(rule (lower (has_type $I16X8 (umulhi x y))) (vec_umulhi $I16X8 x y))
(rule (lower (has_type $I32X4 (umulhi x y))) (vec_umulhi $I32X4 x y))

;; Multiply high part unsigned, vector types with 64-bit elements.
;; Has to be scalarized.
(rule (lower (has_type $I64X2 (umulhi x y)))
      (let ((pair_0 RegPair (umul_wide (vec_extract_lane $I64X2 x 0 (zero_reg))
                                       (vec_extract_lane $I64X2 y 0 (zero_reg))))
            (res_0 Reg (copy_reg $I64 (regpair_hi pair_0)))
            (pair_1 RegPair (umul_wide (vec_extract_lane $I64X2 x 1 (zero_reg))
                                       (vec_extract_lane $I64X2 y 1 (zero_reg))))
            (res_1 Reg (copy_reg $I64 (regpair_hi pair_1))))
        (mov_to_vec128 $I64X2 res_0 res_1)))


;;;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Multiply high part signed, 8-bit or 16-bit types.  (Uses 32-bit multiply.)
(rule (lower (has_type (ty_8_or_16 ty) (smulhi x y)))
      (let ((ext_reg_x Reg (put_in_reg_sext32 x))
            (ext_reg_y Reg (put_in_reg_sext32 y))
            (ext_mul Reg (mul_reg $I32 ext_reg_x ext_reg_y)))
        (ashr_imm $I32 ext_mul (ty_bits ty))))

;; Multiply high part signed, 32-bit types.  (Uses 64-bit multiply.)
(rule (lower (has_type $I32 (smulhi x y)))
      (let ((ext_reg_x Reg (put_in_reg_sext64 x))
            (ext_reg_y Reg (put_in_reg_sext64 y))
            (ext_mul Reg (mul_reg $I64 ext_reg_x ext_reg_y)))
        (ashr_imm $I64 ext_mul 32)))

;; Multiply high part signed, 64-bit types.  (Uses smul_wide.)
(rule (lower (has_type $I64 (smulhi x y)))
      (let ((pair RegPair (smul_wide x y)))
        (copy_reg $I64 (regpair_hi pair))))

;; Multiply high part signed, vector types with 8-, 16-, or 32-bit elements.
(rule (lower (has_type $I8X16 (smulhi x y))) (vec_smulhi $I8X16 x y))
(rule (lower (has_type $I16X8 (smulhi x y))) (vec_smulhi $I16X8 x y))
(rule (lower (has_type $I32X4 (smulhi x y))) (vec_smulhi $I32X4 x y))

;; Multiply high part signed, vector types with 64-bit elements.
;; Has to be scalarized.
(rule (lower (has_type $I64X2 (smulhi x y)))
      (let ((pair_0 RegPair (smul_wide (vec_extract_lane $I64X2 x 0 (zero_reg))
                                       (vec_extract_lane $I64X2 y 0 (zero_reg))))
            (res_0 Reg (copy_reg $I64 (regpair_hi pair_0)))
            (pair_1 RegPair (smul_wide (vec_extract_lane $I64X2 x 1 (zero_reg))
                                       (vec_extract_lane $I64X2 y 1 (zero_reg))))
            (res_1 Reg (copy_reg $I64 (regpair_hi pair_1))))
        (mov_to_vec128 $I64X2 res_0 res_1)))


;;;; Rules for `widening_pairwise_dot_product_s` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Widening pairwise dot product of two vector registers.
(rule (lower (has_type dst_ty (widening_pairwise_dot_product_s
                               x @ (value_type src_ty) y)))
      (vec_add dst_ty (vec_smul_even src_ty x y)
               (vec_smul_odd src_ty x y)))


;;;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Fixed-point multiplication of two vector registers.
(rule (lower (has_type (ty_vec128 ty) (sqmul_round_sat x y)))
      (vec_pack_ssat (vec_widen_type ty)
                     (sqmul_impl (vec_widen_type ty)
                                 (vec_unpacks_high ty x)
                                 (vec_unpacks_high ty y))
                     (sqmul_impl (vec_widen_type ty)
                                 (vec_unpacks_low ty x)
                                 (vec_unpacks_low ty y))))

;; Helper to perform the rounded multiply in the wider type.
(decl sqmul_impl (Type Reg Reg) Reg)
(rule (sqmul_impl $I32X4 x y)
      (vec_ashr_imm $I32X4 (vec_add $I32X4 (vec_mul_impl $I32X4 x y)
                                           (vec_imm_bit_mask $I32X4 17 17))
                    15))
(rule (sqmul_impl $I64X2 x y)
      (vec_ashr_imm $I64X2 (vec_add $I64X2 (vec_mul_impl $I64X2 x y)
                                           (vec_imm_bit_mask $I64X2 33 33))
                    31))
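
;; Note that `vec_imm_bit_mask` numbers bit positions from the MSB, so bits
;; 17..17 of a 32-bit lane form the constant 1 << 14.  Each $I32X4 product is
;; thus rounded as (x * y + (1 << 14)) >> 15, i.e. Q15 fixed-point
;; multiplication with round-to-nearest; the $I64X2 variant likewise computes
;; (x * y + (1 << 30)) >> 31 for Q31.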


;;;; Rules for `udiv` and `urem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Divide two registers.  The architecture provides combined udiv / urem
;; instructions with the following combinations of data types:
;;
;; - 64-bit dividend (split across a 2x32-bit register pair),
;;   32-bit divisor (in a single input register),
;;   32-bit quotient & remainder (in a 2x32-bit register pair)
;;
;; - 128-bit dividend (split across a 2x64-bit register pair),
;;   64-bit divisor (in a single input register),
;;   64-bit quotient & remainder (in a 2x64-bit register pair)
;;
;; We use the first variant for 32-bit and smaller input types,
;; and the second variant for 64-bit input types.

;; Implement `udiv`.
(rule (lower (has_type (fits_in_64 ty) (udiv x y)))
      (let (;; Look at the divisor to determine whether we need to generate
            ;; an explicit division-by-zero check.
            (DZcheck bool (zero_divisor_check_needed y))
            ;; Load up the dividend, by loading the (possibly zero-extended)
            ;; input into the low half of the register pair,
            ;; and setting the high half to zero.
            (ext_x RegPair (put_in_regpair_lo_zext32 x
                             (imm_regpair_hi (ty_ext32 ty) 0 (uninitialized_regpair))))
            ;; Load up the divisor, zero-extended if necessary.
            (ext_y Reg (put_in_reg_zext32 y))
            (ext_ty Type (ty_ext32 ty))
            ;; Now actually perform the division-by-zero check if necessary.
            ;; This cannot be done earlier than here, because the check
            ;; requires an already extended divisor value.
            (_ Reg (maybe_trap_if_zero_divisor DZcheck ext_ty ext_y))
            ;; Emit the actual divide instruction.
            (pair RegPair (udivmod ext_ty ext_x ext_y)))
        ;; The quotient can be found in the low half of the result.
        (copy_reg ty (regpair_lo pair))))

;; Implement `urem`.  Same as `udiv`, but finds the remainder in
;; the high half of the result register pair instead.
(rule (lower (has_type (fits_in_64 ty) (urem x y)))
      (let ((DZcheck bool (zero_divisor_check_needed y))
            (ext_x RegPair (put_in_regpair_lo_zext32 x
                             (imm_regpair_hi ty 0 (uninitialized_regpair))))
            (ext_y Reg (put_in_reg_zext32 y))
            (ext_ty Type (ty_ext32 ty))
            (_ Reg (maybe_trap_if_zero_divisor DZcheck ext_ty ext_y))
            (pair RegPair (udivmod ext_ty ext_x ext_y)))
        (copy_reg ty (regpair_hi pair))))

;; Determine whether we need to perform a divide-by-zero check.
;;
;; If the `avoid_div_traps` flag is false, we never need to perform
;; that check; we can rely on the divide instruction itself to trap.
;;
;; If the `avoid_div_traps` flag is true, we perform the check explicitly.
;; The check can still be omitted if the divisor is a non-zero immediate.
(decl zero_divisor_check_needed (Value) bool)
(rule (zero_divisor_check_needed (i64_from_value x))
      (if (i64_nonzero x))
      $false)
(rule (zero_divisor_check_needed (value_type (allow_div_traps))) $false)
(rule (zero_divisor_check_needed _) $true)

;; Perform the divide-by-zero check if required.
;; This is simply a compare-and-trap of the (extended) divisor against 0.
(decl maybe_trap_if_zero_divisor (bool Type Reg) Reg)
(rule (maybe_trap_if_zero_divisor $false _ _) (invalid_reg))
(rule (maybe_trap_if_zero_divisor $true ext_ty reg)
      (icmps_simm16_and_trap ext_ty reg 0
                             (intcc_as_cond (IntCC.Equal))
                             (trap_code_division_by_zero)))


;;;; Rules for `sdiv` and `srem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Divide two registers.  The architecture provides combined sdiv / srem
;; instructions with the following combinations of data types:
;;
;; - 64-bit dividend (in the low half of a 2x64-bit register pair),
;;   32-bit divisor (in a single input register),
;;   64-bit quotient & remainder (in a 2x64-bit register pair)
;;
;; - 64-bit dividend (in the low half of a 2x64-bit register pair),
;;   64-bit divisor (in a single input register),
;;   64-bit quotient & remainder (in a 2x64-bit register pair)
;;
;; We use the first variant for 32-bit and smaller input types,
;; and the second variant for 64-bit input types.

;; Implement `sdiv`.
(rule (lower (has_type (fits_in_64 ty) (sdiv x y)))
      (let (;; Look at the divisor to determine whether we need to generate
            ;; explicit division-by-zero and/or integer-overflow checks.
            (DZcheck bool (zero_divisor_check_needed y))
            (OFcheck bool (div_overflow_check_needed y))
            ;; Load up the dividend (sign-extended to 64-bit) into the low
            ;; half of a register pair (the high half remains uninitialized).
            (ext_x RegPair (put_in_regpair_lo_sext64 x (uninitialized_regpair)))
            ;; Load up the divisor (sign-extended if necessary).
            (ext_y Reg (put_in_reg_sext32 y))
            (ext_ty Type (ty_ext32 ty))
            ;; Perform division-by-zero check (same as for `udiv`).
            (_ Reg (maybe_trap_if_zero_divisor DZcheck ext_ty ext_y))
            ;; Perform integer-overflow check if necessary.
            (_ Reg (maybe_trap_if_sdiv_overflow OFcheck ext_ty ty ext_x ext_y))
            ;; Emit the actual divide instruction.
            (pair RegPair (sdivmod ext_ty ext_x ext_y)))
        ;; The quotient can be found in the low half of the result.
        (copy_reg ty (regpair_lo pair))))

;; Implement `srem`.  Same as `sdiv`, but finds the remainder in
;; the high half of the result register pair instead.  Also, handle
;; the integer overflow case differently, see below.
(rule (lower (has_type (fits_in_64 ty) (srem x y)))
      (let ((DZcheck bool (zero_divisor_check_needed y))
            (OFcheck bool (div_overflow_check_needed y))
            (ext_x RegPair (put_in_regpair_lo_sext64 x (uninitialized_regpair)))
            (ext_y Reg (put_in_reg_sext32 y))
            (ext_ty Type (ty_ext32 ty))
            (_ Reg (maybe_trap_if_zero_divisor DZcheck ext_ty ext_y))
            (checked_x RegPair (maybe_avoid_srem_overflow OFcheck ext_ty ext_x ext_y))
            (pair RegPair (sdivmod ext_ty checked_x ext_y)))
        (copy_reg ty (regpair_hi pair))))

;; Determine whether we need to perform an integer-overflow check.
;;
;; We never rely on the divide instruction itself to trap; while that trap
;; would indeed happen, we have no way of signalling two different trap
;; conditions from the same instruction.  By explicitly checking for the
;; integer-overflow case ahead of time, any hardware trap in the divide
;; instruction is guaranteed to indicate division-by-zero.
;;
;; In addition, for types smaller than 64 bits we would have to perform
;; the check explicitly anyway, since the instruction provides a 64-bit
;; quotient and only traps if *that* overflows.
;;
;; However, the only case where integer overflow can occur is if the
;; minimum (signed) integer value is divided by -1, so if the divisor
;; is any immediate different from -1, the check can be omitted.
(decl div_overflow_check_needed (Value) bool)
(rule (div_overflow_check_needed (i64_from_value x))
      (if (i64_not_neg1 x))
      $false)
(rule (div_overflow_check_needed _) $true)

;; Perform the integer-overflow check if necessary.  This implements:
;;
;;   if dividend == INT_MIN && divisor == -1 { trap }
;;
;; but to avoid introducing control flow, it is actually done as:
;;
;;   if ((dividend ^ INT_MAX) & divisor) == -1 { trap }
;;
;; instead, using a single conditional trap instruction.
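;;
;; To see the equivalence: dividend ^ INT_MAX is all-ones exactly when
;; dividend == INT_MIN (INT_MIN and INT_MAX are bitwise complements), and in
;; that case AND-ing with the divisor yields -1 exactly when the divisor is
;; -1.  If dividend != INT_MIN, the XOR result has at least one zero bit,
;; so the AND can never equal -1.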
(decl maybe_trap_if_sdiv_overflow (bool Type Type RegPair Reg) Reg)
(rule (maybe_trap_if_sdiv_overflow $false ext_ty _ _ _) (invalid_reg))
(rule (maybe_trap_if_sdiv_overflow $true ext_ty ty x y)
      (let ((int_max Reg (imm ext_ty (int_max ty)))
            (reg Reg (and_reg ext_ty (xor_reg ext_ty int_max
                                              (regpair_lo x)) y)))
        (icmps_simm16_and_trap ext_ty reg -1
                               (intcc_as_cond (IntCC.Equal))
                               (trap_code_integer_overflow))))

(decl int_max (Type) u64)
(rule (int_max $I8) 0x7f)
(rule (int_max $I16) 0x7fff)
(rule (int_max $I32) 0x7fffffff)
(rule (int_max $I64) 0x7fffffffffffffff)

;; When performing `srem`, we do not want to trap in the
;; integer-overflow scenario, because it is only the quotient
;; that overflows, not the remainder.
;;
;; For types smaller than 64 bits, we can simply let the
;; instruction execute, since (as above) it will never trap.
;;
;; For 64-bit inputs, we check whether the divisor is -1, and
;; if so simply replace the dividend by zero, which will give
;; the correct result, since any value modulo -1 is zero.
;;
;; (We could in fact avoid executing the divide instruction
;; at all in this case, but that would require introducing
;; control flow.)
(decl maybe_avoid_srem_overflow (bool Type RegPair Reg) RegPair)
(rule (maybe_avoid_srem_overflow $false _ x _) x)
(rule (maybe_avoid_srem_overflow $true $I32 x _) x)
(rule (maybe_avoid_srem_overflow $true $I64 x y)
      (cmov_imm_regpair_lo $I64 (icmps_simm16 $I64 y -1)
                           (intcc_as_cond (IntCC.Equal)) 0 x))


;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Shift left, shift amount in register.
(rule (lower (has_type (fits_in_64 ty) (ishl x y)))
      (let ((masked_amt Reg (mask_amt_reg ty y)))
        (lshl_reg ty x masked_amt)))

;; Shift left, immediate shift amount.
(rule (lower (has_type (fits_in_64 ty) (ishl x (i64_from_value y))))
      (let ((masked_amt u8 (mask_amt_imm ty y)))
        (lshl_imm ty x masked_amt)))

;; Vector shift left, shift amount in register.
(rule (lower (has_type (ty_vec128 ty) (ishl x y)))
      (vec_lshl_reg ty x y))

;; Vector shift left, immediate shift amount.
(rule (lower (has_type (ty_vec128 ty) (ishl x (i64_from_value y))))
      (let ((masked_amt u8 (mask_amt_imm ty y)))
        (vec_lshl_imm ty x masked_amt)))


;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Shift right logical, shift amount in register.
;; For types smaller than 32-bit, the input value must be zero-extended.
(rule (lower (has_type (fits_in_64 ty) (ushr x y)))
      (let ((ext_reg Reg (put_in_reg_zext32 x))
            (masked_amt Reg (mask_amt_reg ty y)))
        (lshr_reg (ty_ext32 ty) ext_reg masked_amt)))

;; Shift right logical, immediate shift amount.
;; For types smaller than 32-bit, the input value must be zero-extended.
(rule (lower (has_type (fits_in_64 ty) (ushr x (i64_from_value y))))
      (let ((ext_reg Reg (put_in_reg_zext32 x))
            (masked_amt u8 (mask_amt_imm ty y)))
        (lshr_imm (ty_ext32 ty) ext_reg masked_amt)))

;; Vector shift right logical, shift amount in register.
(rule (lower (has_type (ty_vec128 ty) (ushr x y)))
      (vec_lshr_reg ty x y))

;; Vector shift right logical, immediate shift amount.
(rule (lower (has_type (ty_vec128 ty) (ushr x (i64_from_value y))))
      (let ((masked_amt u8 (mask_amt_imm ty y)))
        (vec_lshr_imm ty x masked_amt)))


;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Shift right arithmetic, shift amount in register.
;; For types smaller than 32-bit, the input value must be sign-extended.
(rule (lower (has_type (fits_in_64 ty) (sshr x y)))
      (let ((ext_reg Reg (put_in_reg_sext32 x))
            (masked_amt Reg (mask_amt_reg ty y)))
        (ashr_reg (ty_ext32 ty) ext_reg masked_amt)))

;; Shift right arithmetic, immediate shift amount.
;; For types smaller than 32-bit, the input value must be sign-extended.
(rule (lower (has_type (fits_in_64 ty) (sshr x (i64_from_value y))))
      (let ((ext_reg Reg (put_in_reg_sext32 x))
            (masked_amt u8 (mask_amt_imm ty y)))
        (ashr_imm (ty_ext32 ty) ext_reg masked_amt)))

;; Vector shift right arithmetic, shift amount in register.
(rule (lower (has_type (ty_vec128 ty) (sshr x y)))
      (vec_ashr_reg ty x y))

;; Vector shift right arithmetic, immediate shift amount.
(rule (lower (has_type (ty_vec128 ty) (sshr x (i64_from_value y))))
      (let ((masked_amt u8 (mask_amt_imm ty y)))
        (vec_ashr_imm ty x masked_amt)))


;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Rotate left, shift amount in register.  32-bit or 64-bit types.
(rule (lower (has_type (ty_32_or_64 ty) (rotl x y)))
      (rot_reg ty x y))

;; Rotate left, immediate shift amount.  32-bit or 64-bit types.
(rule (lower (has_type (ty_32_or_64 ty) (rotl x (i64_from_value y))))
      (let ((masked_amt u8 (mask_amt_imm ty y)))
        (rot_imm ty x masked_amt)))

;; Rotate left, shift amount in register.  8-bit or 16-bit types.
;; Implemented via a pair of 32-bit shifts on the zero-extended input.
(rule (lower (has_type (ty_8_or_16 ty) (rotl x y)))
      (let ((ext_reg Reg (put_in_reg_zext32 x))
            (ext_ty Type (ty_ext32 ty))
            (pos_amt Reg y)
            (neg_amt Reg (neg_reg ty pos_amt))
            (masked_pos_amt Reg (mask_amt_reg ty pos_amt))
            (masked_neg_amt Reg (mask_amt_reg ty neg_amt)))
        (or_reg ty (lshl_reg ext_ty ext_reg masked_pos_amt)
                (lshr_reg ext_ty ext_reg masked_neg_amt))))
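
;; This computes rotl(x, n) == (x << (n & mask)) | (x >> (-n & mask)) with
;; mask == ty_bits - 1, on the zero-extended value.  For example, rotating
;; the $I8 value 0b1000_0001 left by 1 yields
;; (0b1000_0001 << 1) | (0b1000_0001 >> 7) == 0b0000_0011 in the low byte.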

;; Rotate left, immediate shift amount.  8-bit or 16-bit types.
;; Implemented via a pair of 32-bit shifts on the zero-extended input.
(rule (lower (has_type (ty_8_or_16 ty) (rotl x (and (i64_from_value pos_amt)
                                                    (i64_from_negated_value neg_amt)))))
      (let ((ext_reg Reg (put_in_reg_zext32 x))
            (ext_ty Type (ty_ext32 ty))
            (masked_pos_amt u8 (mask_amt_imm ty pos_amt))
            (masked_neg_amt u8 (mask_amt_imm ty neg_amt)))
        (or_reg ty (lshl_imm ext_ty ext_reg masked_pos_amt)
                (lshr_imm ext_ty ext_reg masked_neg_amt))))

;; Vector rotate left, shift amount in register.
(rule (lower (has_type (ty_vec128 ty) (rotl x y)))
      (vec_rot_reg ty x y))

;; Vector rotate left, immediate shift amount.
(rule (lower (has_type (ty_vec128 ty) (rotl x (i64_from_value y))))
      (let ((masked_amt u8 (mask_amt_imm ty y)))
        (vec_rot_imm ty x masked_amt)))


;;;; Rules for `rotr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Rotate right, shift amount in register.  32-bit or 64-bit types.
;; Implemented as rotate left with negated rotate amount.
(rule (lower (has_type (ty_32_or_64 ty) (rotr x y)))
      (let ((negated_amt Reg (neg_reg ty y)))
        (rot_reg ty x negated_amt)))

;; Rotate right, immediate shift amount.  32-bit or 64-bit types.
;; Implemented as rotate left with negated rotate amount.
(rule (lower (has_type (ty_32_or_64 ty) (rotr x (i64_from_negated_value y))))
      (let ((negated_amt u8 (mask_amt_imm ty y)))
        (rot_imm ty x negated_amt)))

;; Rotate right, shift amount in register.  8-bit or 16-bit types.
;; Implemented as rotate left with negated rotate amount.
(rule (lower (has_type (ty_8_or_16 ty) (rotr x y)))
      (let ((ext_reg Reg (put_in_reg_zext32 x))
            (ext_ty Type (ty_ext32 ty))
            (pos_amt Reg y)
            (neg_amt Reg (neg_reg ty pos_amt))
            (masked_pos_amt Reg (mask_amt_reg ty pos_amt))
            (masked_neg_amt Reg (mask_amt_reg ty neg_amt)))
        (or_reg ty (lshl_reg ext_ty ext_reg masked_neg_amt)
                (lshr_reg ext_ty ext_reg masked_pos_amt))))

;; Rotate right, immediate shift amount.  8-bit or 16-bit types.
;; Implemented as rotate left with negated rotate amount.
(rule (lower (has_type (ty_8_or_16 ty) (rotr x (and (i64_from_value pos_amt)
                                                    (i64_from_negated_value neg_amt)))))
      (let ((ext_reg Reg (put_in_reg_zext32 x))
            (ext_ty Type (ty_ext32 ty))
            (masked_pos_amt u8 (mask_amt_imm ty pos_amt))
            (masked_neg_amt u8 (mask_amt_imm ty neg_amt)))
        (or_reg ty (lshl_imm ext_ty ext_reg masked_neg_amt)
                (lshr_imm ext_ty ext_reg masked_pos_amt))))

;; Vector rotate right, shift amount in register.
;; Implemented as rotate left with negated rotate amount.
(rule (lower (has_type (ty_vec128 ty) (rotr x y)))
      (let ((negated_amt Reg (neg_reg $I32 y)))
        (vec_rot_reg ty x negated_amt)))

;; Vector rotate right, immediate shift amount.
;; Implemented as rotate left with negated rotate amount.
(rule (lower (has_type (ty_vec128 ty) (rotr x (i64_from_negated_value y))))
      (let ((negated_amt u8 (mask_amt_imm ty y)))
        (vec_rot_imm ty x negated_amt)))


;;;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Always a no-op.
(rule (lower (ireduce x))
      x)


;;;; Rules for `uextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 16- or 32-bit target types.
(rule (lower (has_type (gpr32_ty _ty) (uextend x)))
      (put_in_reg_zext32 x))

;; 64-bit target types.
(rule (lower (has_type (gpr64_ty _ty) (uextend x)))
      (put_in_reg_zext64 x))


;;;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 16- or 32-bit target types.
(rule (lower (has_type (gpr32_ty _ty) (sextend x)))
      (put_in_reg_sext32 x))

;; 64-bit target types.
(rule (lower (has_type (gpr64_ty _ty) (sextend x)))
      (put_in_reg_sext64 x))


;;;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (snarrow x @ (value_type (ty_vec128 ty)) y))
      (vec_pack_ssat ty y x))


;;;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (uunarrow x @ (value_type (ty_vec128 ty)) y))
      (vec_pack_usat ty y x))


;;;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (unarrow x @ (value_type (ty_vec128 ty)) y))
      (let ((zero Reg (vec_imm ty 0)))
        (vec_pack_usat ty (vec_smax ty y zero) (vec_smax ty x zero))))


;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (swiden_low x @ (value_type (ty_vec128 ty))))
      (vec_unpacks_low ty x))


;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (swiden_high x @ (value_type (ty_vec128 ty))))
      (vec_unpacks_high ty x))


;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (uwiden_low x @ (value_type (ty_vec128 ty))))
      (vec_unpacku_low ty x))


;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (uwiden_high x @ (value_type (ty_vec128 ty))))
      (vec_unpacku_high ty x))


;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; z15 version using a single instruction (NOR).
(rule (lower (has_type (and (mie2_enabled) (fits_in_64 ty)) (bnot x)))
      (let ((rx Reg x))
        (not_or_reg ty rx rx)))

;; z14 version using XOR with -1.
(rule (lower (has_type (and (mie2_disabled) (fits_in_64 ty)) (bnot x)))
      (not_reg ty x))

;; Vector version using vector NOR.
(rule (lower (has_type (ty_vec128 ty) (bnot x)))
      (vec_not ty x))


;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; And two registers.
(rule (lower (has_type (fits_in_64 ty) (band x y)))
      (and_reg ty x y))

;; And a register and an immediate.
(rule (lower (has_type (fits_in_64 ty) (band x (uimm16shifted_from_inverted_value y))))
      (and_uimm16shifted ty x y))
(rule (lower (has_type (fits_in_64 ty) (band (uimm16shifted_from_inverted_value x) y)))
      (and_uimm16shifted ty y x))
(rule (lower (has_type (fits_in_64 ty) (band x (uimm32shifted_from_inverted_value y))))
      (and_uimm32shifted ty x y))
(rule (lower (has_type (fits_in_64 ty) (band (uimm32shifted_from_inverted_value x) y)))
      (and_uimm32shifted ty y x))

;; And a register and memory (32/64-bit types).
(rule (lower (has_type (fits_in_64 ty) (band x (sinkable_load_32_64 y))))
      (and_mem ty x (sink_load y)))
(rule (lower (has_type (fits_in_64 ty) (band (sinkable_load_32_64 x) y)))
      (and_mem ty y (sink_load x)))

;; And two vector registers.
(rule (lower (has_type (ty_vec128 ty) (band x y)))
      (vec_and ty x y))


;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Or two registers.
(rule (lower (has_type (fits_in_64 ty) (bor x y)))
      (or_reg ty x y))

;; Or a register and an immediate.
(rule (lower (has_type (fits_in_64 ty) (bor x (uimm16shifted_from_value y))))
      (or_uimm16shifted ty x y))
(rule (lower (has_type (fits_in_64 ty) (bor (uimm16shifted_from_value x) y)))
      (or_uimm16shifted ty y x))
(rule (lower (has_type (fits_in_64 ty) (bor x (uimm32shifted_from_value y))))
      (or_uimm32shifted ty x y))
(rule (lower (has_type (fits_in_64 ty) (bor (uimm32shifted_from_value x) y)))
      (or_uimm32shifted ty y x))

;; Or a register and memory (32/64-bit types).
(rule (lower (has_type (fits_in_64 ty) (bor x (sinkable_load_32_64 y))))
      (or_mem ty x (sink_load y)))
(rule (lower (has_type (fits_in_64 ty) (bor (sinkable_load_32_64 x) y)))
      (or_mem ty y (sink_load x)))

;; Or two vector registers.
(rule (lower (has_type (ty_vec128 ty) (bor x y)))
      (vec_or ty x y))


;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Xor two registers.
(rule (lower (has_type (fits_in_64 ty) (bxor x y)))
      (xor_reg ty x y))

;; Xor a register and an immediate.
(rule (lower (has_type (fits_in_64 ty) (bxor x (uimm32shifted_from_value y))))
      (xor_uimm32shifted ty x y))
(rule (lower (has_type (fits_in_64 ty) (bxor (uimm32shifted_from_value x) y)))
      (xor_uimm32shifted ty y x))

;; Xor a register and memory (32/64-bit types).
(rule (lower (has_type (fits_in_64 ty) (bxor x (sinkable_load_32_64 y))))
      (xor_mem ty x (sink_load y)))
(rule (lower (has_type (fits_in_64 ty) (bxor (sinkable_load_32_64 x) y)))
      (xor_mem ty y (sink_load x)))

;; Xor two vector registers.
(rule (lower (has_type (ty_vec128 ty) (bxor x y)))
      (vec_xor ty x y))


;;;; Rules for `band_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; z15 version using a single instruction.
(rule (lower (has_type (and (mie2_enabled) (fits_in_64 ty)) (band_not x y)))
      (and_not_reg ty x y))

;; z14 version using XOR with -1.
(rule (lower (has_type (and (mie2_disabled) (fits_in_64 ty)) (band_not x y)))
      (and_reg ty x (not_reg ty y)))

;; And-not two vector registers.
(rule (lower (has_type (ty_vec128 ty) (band_not x y)))
      (vec_and_not ty x y))


;;;; Rules for `bor_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; z15 version using a single instruction.
(rule (lower (has_type (and (mie2_enabled) (fits_in_64 ty)) (bor_not x y)))
      (or_not_reg ty x y))

;; z14 version using XOR with -1.
(rule (lower (has_type (and (mie2_disabled) (fits_in_64 ty)) (bor_not x y)))
      (or_reg ty x (not_reg ty y)))

;; Or-not two vector registers.
(rule (lower (has_type (ty_vec128 ty) (bor_not x y)))
      (vec_or_not ty x y))


;;;; Rules for `bxor_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; z15 version using a single instruction.
(rule (lower (has_type (and (mie2_enabled) (fits_in_64 ty)) (bxor_not x y)))
      (not_xor_reg ty x y))

;; z14 version using XOR with -1.
(rule (lower (has_type (and (mie2_disabled) (fits_in_64 ty)) (bxor_not x y)))
      (not_reg ty (xor_reg ty x y)))

;; Xor-not two vector registers.
(rule (lower (has_type (ty_vec128 ty) (bxor_not x y)))
      (vec_not_xor ty x y))


;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; z15 version using an AND NOT instruction.
(rule (lower (has_type (and (mie2_enabled) (fits_in_64 ty)) (bitselect x y z)))
      (let ((rx Reg x)
            (if_true Reg (and_reg ty y rx))
            (if_false Reg (and_not_reg ty z rx)))
        (or_reg ty if_false if_true)))
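
;; The scalar lowering computes (y & x) | (z & ~x): bits of y where the
;; condition mask x is set, bits of z where it is clear; the z14 variant
;; below differs only in how z & ~x is formed.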

;; z14 version using XOR with -1.
(rule (lower (has_type (and (mie2_disabled) (fits_in_64 ty)) (bitselect x y z)))
      (let ((rx Reg x)
            (if_true Reg (and_reg ty y rx))
            (if_false Reg (and_reg ty z (not_reg ty rx))))
        (or_reg ty if_false if_true)))

;; Bitselect vector registers.
(rule (lower (has_type (ty_vec128 ty) (bitselect x y z)))
      (vec_select ty y z x))


;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vector select.
(rule (lower (has_type (ty_vec128 ty) (vselect x y z)))
      (vec_select ty y z x))


;;;; Rules for `breduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Always a no-op.
(rule (lower (breduce x))
      x)


;;;; Rules for `bextend` and `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Use a common helper to type-cast bools to either bool or integer types.
(decl cast_bool (Type Value) Reg)
(rule (lower (has_type ty (bextend x)))
      (cast_bool ty x))
(rule (lower (has_type ty (bmask x)))
      (cast_bool ty x))

;; If the target has the same or a smaller size than the source, it's a no-op.
(rule (cast_bool $B1 x @ (value_type $B1)) x)
(rule (cast_bool $B1 x @ (value_type $B8)) x)
(rule (cast_bool $B8 x @ (value_type $B8)) x)
(rule (cast_bool $I8 x @ (value_type $B8)) x)
(rule (cast_bool (fits_in_16 _ty) x @ (value_type $B16)) x)
(rule (cast_bool (fits_in_32 _ty) x @ (value_type $B32)) x)
(rule (cast_bool (fits_in_64 _ty) x @ (value_type $B64)) x)

;; Single-bit values are sign-extended via a pair of shifts.
(rule (cast_bool (gpr32_ty ty) x @ (value_type $B1))
      (ashr_imm $I32 (lshl_imm $I32 x 31) 31))
(rule (cast_bool (gpr64_ty ty) x @ (value_type $B1))
      (ashr_imm $I64 (lshl_imm $I64 x 63) 63))

;; Other values are just sign-extended normally.
(rule (cast_bool (gpr32_ty _ty) x @ (value_type $B8))
      (sext32_reg $I8 x))
(rule (cast_bool (gpr32_ty _ty) x @ (value_type $B16))
      (sext32_reg $I16 x))
(rule (cast_bool (gpr64_ty _ty) x @ (value_type $B8))
      (sext64_reg $I8 x))
(rule (cast_bool (gpr64_ty _ty) x @ (value_type $B16))
      (sext64_reg $I16 x))
(rule (cast_bool (gpr64_ty _ty) x @ (value_type $B32))
      (sext64_reg $I32 x))


;;;; Rules for `bint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Mask with 1 to get a 0/1 result (8- or 16-bit types).
(rule (lower (has_type (fits_in_16 ty) (bint x)))
      (and_uimm16shifted ty x (uimm16shifted 1 0)))

;; Mask with 1 to get a 0/1 result (32-bit types).
(rule (lower (has_type (fits_in_32 ty) (bint x)))
      (and_uimm32shifted ty x (uimm32shifted 1 0)))

;; Mask with 1 to get a 0/1 result (64-bit types).
(rule (lower (has_type (fits_in_64 ty) (bint x)))
      (and_reg ty x (imm ty 1)))


;;;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The FLOGR hardware instruction always operates on the full 64-bit register.
;; We can zero-extend smaller types, but then we have to compensate for the
;; additional leading zero bits the instruction will actually see.
(decl clz_offset (Type Reg) Reg)
(rule (clz_offset $I8 x) (add_simm16 $I8 x -56))
(rule (clz_offset $I16 x) (add_simm16 $I16 x -48))
(rule (clz_offset $I32 x) (add_simm16 $I32 x -32))
(rule (clz_offset $I64 x) (copy_reg $I64 x))

;; Count leading zeros, via FLOGR on an input zero-extended to 64 bits,
;; with the result compensated for the extra bits.
(rule (lower (has_type (fits_in_64 ty) (clz x)))
      (let ((ext_reg Reg (put_in_reg_zext64 x))
            ;; Ask for a value of 64 in the all-zero 64-bit input case.
            ;; After compensation this will match the expected semantics.
            (clz RegPair (clz_reg 64 ext_reg)))
        (clz_offset ty (regpair_hi clz))))


;;;; Rules for `cls` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The result of cls is not supposed to count the sign bit itself, just
;; additional copies of it.  Therefore, when computing cls in terms of clz,
;; we need to subtract one.  Fold this into the offset computation.
(decl cls_offset (Type Reg) Reg)
(rule (cls_offset $I8 x) (add_simm16 $I8 x -57))
(rule (cls_offset $I16 x) (add_simm16 $I16 x -49))
(rule (cls_offset $I32 x) (add_simm16 $I32 x -33))
(rule (cls_offset $I64 x) (add_simm16 $I64 x -1))

;; Count leading sign-bit copies.  We don't have any instruction for that,
;; so we instead count the leading zeros after inverting the input if negative,
;; i.e. computing
;;   cls(x) == clz(x ^ (x >> 63)) - 1
;; where x is the sign-extended input.
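;;
;; For example, for the $I64 input x = -16 (60 one bits followed by 0000):
;; x >> 63 is -1, x ^ -1 == 15, and clz(15) == 60, so the result is
;; 60 - 1 == 59, the number of sign-bit copies after the sign bit itself.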
(rule (lower (has_type (fits_in_64 ty) (cls x)))
      (let ((ext_reg Reg (put_in_reg_sext64 x))
            (signbit_copies Reg (ashr_imm $I64 ext_reg 63))
            (inv_reg Reg (xor_reg $I64 ext_reg signbit_copies))
            (clz RegPair (clz_reg 64 inv_reg)))
        (cls_offset ty (regpair_hi clz))))


;;;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; To count trailing zeros, we find the last bit set in the input via (x & -x),
;; count the leading zeros of that value, and subtract from 63:
;;
;;   ctz(x) == 63 - clz(x & -x)
;;
;; This works for all cases except a zero input, where the above formula would
;; return -1, but we are expected to return the type size.  The compensation
;; for this case is handled differently for 64-bit types vs. smaller types.
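;;
;; For example, for x = 0b10100, x & -x isolates the lowest set bit 0b100,
;; whose 64-bit clz is 61, giving ctz(x) == 63 - 61 == 2.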

;; For smaller types, we simply ensure that the extended 64-bit input is
;; never zero by setting a "guard bit" in the position corresponding to
;; the input type size.  This way the 64-bit algorithm above will handle
;; that case correctly automatically.
(rule (lower (has_type (gpr32_ty ty) (ctz x)))
      (let ((rx Reg (or_uimm16shifted $I64 x (ctz_guardbit ty)))
            (lastbit Reg (and_reg $I64 rx (neg_reg $I64 rx)))
            (clz RegPair (clz_reg 64 lastbit)))
        (sub_reg ty (imm ty 63) (regpair_hi clz))))

(decl ctz_guardbit (Type) UImm16Shifted)
(rule (ctz_guardbit $I8) (uimm16shifted 256 0))
(rule (ctz_guardbit $I16) (uimm16shifted 1 16))
(rule (ctz_guardbit $I32) (uimm16shifted 1 32))

;; For 64-bit types, the FLOGR instruction will indicate the zero input case
;; via its condition code.  We check for that and replace the instruction
;; result with the value -1 via a conditional move, which will then lead to
;; the correct result after the final subtraction from 63.
(rule (lower (has_type (gpr64_ty _ty) (ctz x)))
      (let ((rx Reg x)
            (lastbit Reg (and_reg $I64 rx (neg_reg $I64 rx)))
            (clz RegPair (clz_reg -1 lastbit)))
        (sub_reg $I64 (imm $I64 63) (regpair_hi clz))))


;;;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Population count for 8-bit types is supported by the POPCNT instruction.
(rule (lower (has_type $I8 (popcnt x)))
      (popcnt_byte x))

;; On z15, the POPCNT instruction has a variant to compute a full 64-bit
;; population count, which we also use for 16- and 32-bit types.
(rule (lower (has_type (and (mie2_enabled) (fits_in_64 ty)) (popcnt x)))
      (popcnt_reg (put_in_reg_zext64 x)))

;; On z14, we use the regular POPCNT, which computes the population count
;; of each input byte separately, so we need to accumulate those partial
;; results via a series of log2(type size in bytes) additions.  We
;; accumulate in the high byte, so that a final right shift will zero out
;; any unrelated bits to give a clean result.  (This does not work with
;; $I16, where we instead accumulate in the low byte and clear high bits
;; via an explicit and operation.)

(rule (lower (has_type (and (mie2_disabled) $I16) (popcnt x)))
      (let ((cnt2 Reg (popcnt_byte x))
            (cnt1 Reg (add_reg $I32 cnt2 (lshr_imm $I32 cnt2 8))))
        (and_uimm16shifted $I32 cnt1 (uimm16shifted 255 0))))

(rule (lower (has_type (and (mie2_disabled) $I32) (popcnt x)))
      (let ((cnt4 Reg (popcnt_byte x))
            (cnt2 Reg (add_reg $I32 cnt4 (lshl_imm $I32 cnt4 16)))
            (cnt1 Reg (add_reg $I32 cnt2 (lshl_imm $I32 cnt2 8))))
        (lshr_imm $I32 cnt1 24)))
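
;; Writing the per-byte counts of cnt4 above as [a, b, c, d] (most
;; significant byte first): cnt2 == [a+c, b+d, c, d] and the high byte of
;; cnt1 is a+b+c+d, so the final shift right by 24 extracts the total.
;; The byte sums cannot carry, since each partial count is at most 8.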

(rule (lower (has_type (and (mie2_disabled) $I64) (popcnt x)))
      (let ((cnt8 Reg (popcnt_byte x))
            (cnt4 Reg (add_reg $I64 cnt8 (lshl_imm $I64 cnt8 32)))
            (cnt2 Reg (add_reg $I64 cnt4 (lshl_imm $I64 cnt4 16)))
            (cnt1 Reg (add_reg $I64 cnt2 (lshl_imm $I64 cnt2 8))))
        (lshr_imm $I64 cnt1 56)))

;; Population count for vector types.
(rule (lower (has_type (ty_vec128 ty) (popcnt x)))
      (vec_popcnt ty x))


;;;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Add two registers.
(rule (lower (has_type ty (fadd x y)))
      (fadd_reg ty x y))


;;;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Subtract two registers.
(rule (lower (has_type ty (fsub x y)))
      (fsub_reg ty x y))


;;;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Multiply two registers.
(rule (lower (has_type ty (fmul x y)))
      (fmul_reg ty x y))


;;;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Divide two registers.
(rule (lower (has_type ty (fdiv x y)))
      (fdiv_reg ty x y))


;;;; Rules for `fmin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Minimum of two registers.
(rule (lower (has_type ty (fmin x y)))
      (fmin_reg ty x y))


;;;; Rules for `fmax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Maximum of two registers.
(rule (lower (has_type ty (fmax x y)))
      (fmax_reg ty x y))


;;;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Minimum of two registers.
(rule (lower (has_type ty (fmin_pseudo x y)))
      (fmin_pseudo_reg ty x y))


;;;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Maximum of two registers.
(rule (lower (has_type ty (fmax_pseudo x y)))
      (fmax_pseudo_reg ty x y))


;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Copysign of two registers.
(rule (lower (has_type $F32 (fcopysign x y)))
      (vec_select $F32 x y (imm $F32 2147483647)))
(rule (lower (has_type $F64 (fcopysign x y)))
      (vec_select $F64 x y (imm $F64 9223372036854775807)))
(rule (lower (has_type $F32X4 (fcopysign x y)))
      (vec_select $F32X4 x y (vec_imm_bit_mask $F32X4 1 31)))
(rule (lower (has_type $F64X2 (fcopysign x y)))
      (vec_select $F64X2 x y (vec_imm_bit_mask $F64X2 1 63)))
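
;; The scalar immediates are 0x7fffffff and 0x7fffffffffffffff, masks
;; covering every bit except the sign bit, so `vec_select` takes the
;; magnitude bits from x and the sign bit from y; the vector variants
;; build the same masks via `vec_imm_bit_mask`.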


;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Multiply-and-add of three registers.
(rule (lower (has_type ty (fma x y z)))
      (fma_reg ty x y z))


;;;; Rules for `sqrt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Square root of a register.
(rule (lower (has_type ty (sqrt x)))
      (sqrt_reg ty x))


;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Negated value of a register.
(rule (lower (has_type ty (fneg x)))
      (fneg_reg ty x))


;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Absolute value of a register.
(rule (lower (has_type ty (fabs x)))
      (fabs_reg ty x))


;;;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Round value in a register towards positive infinity.
(rule (lower (has_type ty (ceil x)))
      (ceil_reg ty x))


;;;; Rules for `floor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Round value in a register towards negative infinity.
(rule (lower (has_type ty (floor x)))
      (floor_reg ty x))


;;;; Rules for `trunc` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Round value in a register towards zero.
(rule (lower (has_type ty (trunc x)))
      (trunc_reg ty x))


;;;; Rules for `nearest` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Round value in a register towards the nearest integer (ties to even).
(rule (lower (has_type ty (nearest x)))
      (nearest_reg ty x))


;;;; Rules for `fpromote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Promote a register.
(rule (lower (has_type (fits_in_64 dst_ty) (fpromote x @ (value_type src_ty))))
      (fpromote_reg dst_ty src_ty x))


;;;; Rules for `fvpromote_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Promote the low half of a vector register.
(rule (lower (has_type $F64X2 (fvpromote_low x @ (value_type $F32X4))))
      (fpromote_reg $F64X2 $F32X4 (vec_merge_low $I32X4 x x)))


;;;; Rules for `fdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Demote a register.
(rule (lower (has_type (fits_in_64 dst_ty) (fdemote x @ (value_type src_ty))))
      (fdemote_reg dst_ty src_ty (FpuRoundMode.Current) x))


;;;; Rules for `fvdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Demote a vector register.
(rule (lower (has_type $F32X4 (fvdemote x @ (value_type $F64X2))))
      (let ((dst Reg (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.Current) x)))
        (vec_permute $F32X4 dst (vec_imm $F32X4 0)
                     (vec_imm $I8X16 (imm8x16 16 16 16 16 16 16 16 16
                                              0 1 2 3 8 9 10 11)))))
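
;; Note on the permute mask above: the demoted $F32 results end up in
;; bytes 0..3 and 8..11 of `dst` (the leftmost word of each doubleword),
;; while mask entries of 16 and up select from the second operand, the
;; zero vector.  The permute therefore moves the two results into the
;; rightmost words (little-endian lanes 1 and 0) and zeroes the remaining
;; lanes, as `fvdemote` requires.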


;;;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Convert a 32-bit or smaller unsigned integer to $F32 (z15 instruction).
(rule (lower (has_type $F32
        (fcvt_from_uint x @ (value_type (and (vxrs_ext2_enabled) (fits_in_32 ty))))))
      (fcvt_from_uint_reg $F32 (FpuRoundMode.ToNearestTiesToEven)
                          (put_in_reg_zext32 x)))

;; Convert a 64-bit or smaller unsigned integer to $F32, via an intermediate $F64.
(rule (lower (has_type $F32 (fcvt_from_uint x @ (value_type (fits_in_64 ty)))))
      (fdemote_reg $F32 $F64 (FpuRoundMode.ToNearestTiesToEven)
                   (fcvt_from_uint_reg $F64 (FpuRoundMode.ShorterPrecision)
                                       (put_in_reg_zext64 x))))
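
;; Note: the intermediate integer-to-$F64 conversion above uses the
;; `ShorterPrecision` rounding mode (presumably z's "round to prepare for
;; shorter precision") so that the subsequent demotion to $F32 cannot
;; double-round; a plain ties-to-even intermediate rounding could be off
;; by one ULP for some 64-bit inputs.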

;; Convert a 64-bit or smaller unsigned integer to $F64.
(rule (lower (has_type $F64 (fcvt_from_uint x @ (value_type (fits_in_64 ty)))))
      (fcvt_from_uint_reg $F64 (FpuRoundMode.ToNearestTiesToEven)
                          (put_in_reg_zext64 x)))

;; Convert $I32X4 to $F32X4 (z15 instruction).
(rule (lower (has_type (and (vxrs_ext2_enabled) $F32X4)
                       (fcvt_from_uint x @ (value_type $I32X4))))
      (fcvt_from_uint_reg $F32X4 (FpuRoundMode.ToNearestTiesToEven) x))

;; Convert $I32X4 to $F32X4 (via two $F64X2 on z14).
(rule (lower (has_type (and (vxrs_ext2_disabled) $F32X4)
                       (fcvt_from_uint x @ (value_type $I32X4))))
      (vec_permute $F32X4
                   (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.ToNearestTiesToEven)
                                (fcvt_from_uint_reg $F64X2 (FpuRoundMode.ShorterPrecision)
                                                    (vec_unpacku_high $I32X4 x)))
                   (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.ToNearestTiesToEven)
                                (fcvt_from_uint_reg $F64X2 (FpuRoundMode.ShorterPrecision)
                                                    (vec_unpacku_low $I32X4 x)))
                   (vec_imm $I8X16 (imm8x16 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27))))

;; Convert $I64X2 to $F64X2.
(rule (lower (has_type $F64X2 (fcvt_from_uint x @ (value_type $I64X2))))
      (fcvt_from_uint_reg $F64X2 (FpuRoundMode.ToNearestTiesToEven) x))


;;;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Convert a 32-bit or smaller signed integer to $F32 (z15 instruction).
(rule (lower (has_type $F32
        (fcvt_from_sint x @ (value_type (and (vxrs_ext2_enabled) (fits_in_32 ty))))))
      (fcvt_from_sint_reg $F32 (FpuRoundMode.ToNearestTiesToEven)
                          (put_in_reg_sext32 x)))

;; Convert a 64-bit or smaller signed integer to $F32, via an intermediate $F64.
(rule (lower (has_type $F32 (fcvt_from_sint x @ (value_type (fits_in_64 ty)))))
      (fdemote_reg $F32 $F64 (FpuRoundMode.ToNearestTiesToEven)
                   (fcvt_from_sint_reg $F64 (FpuRoundMode.ShorterPrecision)
                                       (put_in_reg_sext64 x))))

;; Convert a 64-bit or smaller signed integer to $F64.
(rule (lower (has_type $F64 (fcvt_from_sint x @ (value_type (fits_in_64 ty)))))
      (fcvt_from_sint_reg $F64 (FpuRoundMode.ToNearestTiesToEven)
                          (put_in_reg_sext64 x)))

;; Convert $I32X4 to $F32X4 (z15 instruction).
(rule (lower (has_type (and (vxrs_ext2_enabled) $F32X4)
                       (fcvt_from_sint x @ (value_type $I32X4))))
      (fcvt_from_sint_reg $F32X4 (FpuRoundMode.ToNearestTiesToEven) x))

;; Convert $I32X4 to $F32X4 (via two $F64X2 on z14).
(rule (lower (has_type (and (vxrs_ext2_disabled) $F32X4)
                       (fcvt_from_sint x @ (value_type $I32X4))))
      (vec_permute $F32X4
                   (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.ToNearestTiesToEven)
                                (fcvt_from_sint_reg $F64X2 (FpuRoundMode.ShorterPrecision)
                                                    (vec_unpacks_high $I32X4 x)))
                   (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.ToNearestTiesToEven)
                                (fcvt_from_sint_reg $F64X2 (FpuRoundMode.ShorterPrecision)
                                                    (vec_unpacks_low $I32X4 x)))
                   (vec_imm $I8X16 (imm8x16 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27))))

;; Convert $I64X2 to $F64X2.
(rule (lower (has_type $F64X2 (fcvt_from_sint x @ (value_type $I64X2))))
      (fcvt_from_sint_reg $F64X2 (FpuRoundMode.ToNearestTiesToEven) x))


;;;; Rules for `fcvt_low_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Convert the low half of a $I32X4 to a $F64X2.
(rule (lower (has_type $F64X2 (fcvt_low_from_sint x @ (value_type $I32X4))))
      (fcvt_from_sint_reg $F64X2 (FpuRoundMode.ToNearestTiesToEven)
                          (vec_unpacks_low $I32X4 x)))


;;;; Rules for `fcvt_to_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Convert a scalar floating-point value in a register to an unsigned integer.
;; Traps if the input cannot be represented in the output type.
(rule (lower (has_type (fits_in_64 dst_ty)
                       (fcvt_to_uint x @ (value_type src_ty))))
      (let ((src Reg (put_in_reg x))
            ;; First, check whether the input is a NaN, and trap if so.
            (_ Reg (trap_if (fcmp_reg src_ty src src)
                            (floatcc_as_cond (FloatCC.Unordered))
                            (trap_code_bad_conversion_to_integer)))
            ;; Now check whether the input is out of range for the target type.
            (_ Reg (trap_if (fcmp_reg src_ty src (fcvt_to_uint_ub src_ty dst_ty))
                            (floatcc_as_cond (FloatCC.GreaterThanOrEqual))
                            (trap_code_integer_overflow)))
            (_ Reg (trap_if (fcmp_reg src_ty src (fcvt_to_uint_lb src_ty))
                            (floatcc_as_cond (FloatCC.LessThanOrEqual))
                            (trap_code_integer_overflow)))
            ;; Perform the conversion using the larger type size.
            (flt_ty Type (fcvt_flt_ty dst_ty src_ty))
            (src_ext Reg (fpromote_reg flt_ty src_ty src)))
        (fcvt_to_uint_reg flt_ty (FpuRoundMode.ToZero) src_ext)))


;;;; Rules for `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Convert a scalar floating-point value in a register to a signed integer.
;; Traps if the input cannot be represented in the output type.
(rule (lower (has_type (fits_in_64 dst_ty)
                       (fcvt_to_sint x @ (value_type src_ty))))
      (let ((src Reg (put_in_reg x))
            ;; First, check whether the input is a NaN, and trap if so.
            (_ Reg (trap_if (fcmp_reg src_ty src src)
                            (floatcc_as_cond (FloatCC.Unordered))
                            (trap_code_bad_conversion_to_integer)))
            ;; Now check whether the input is out of range for the target type.
            (_ Reg (trap_if (fcmp_reg src_ty src (fcvt_to_sint_ub src_ty dst_ty))
                            (floatcc_as_cond (FloatCC.GreaterThanOrEqual))
                            (trap_code_integer_overflow)))
            (_ Reg (trap_if (fcmp_reg src_ty src (fcvt_to_sint_lb src_ty dst_ty))
                            (floatcc_as_cond (FloatCC.LessThanOrEqual))
                            (trap_code_integer_overflow)))
            ;; Perform the conversion using the larger type size.
            (flt_ty Type (fcvt_flt_ty dst_ty src_ty))
            (src_ext Reg (fpromote_reg flt_ty src_ty src)))
        ;; Perform the conversion.
        (fcvt_to_sint_reg flt_ty (FpuRoundMode.ToZero) src_ext)))


;;;; Rules for `fcvt_to_uint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Convert a scalar floating-point value in a register to an unsigned integer.
(rule (lower (has_type (fits_in_64 dst_ty)
                       (fcvt_to_uint_sat x @ (value_type src_ty))))
      (let ((src Reg (put_in_reg x))
            ;; Perform the conversion using the larger type size.
            (flt_ty Type (fcvt_flt_ty dst_ty src_ty))
            (int_ty Type (fcvt_int_ty dst_ty src_ty))
            (src_ext Reg (fpromote_reg flt_ty src_ty src))
            (dst Reg (fcvt_to_uint_reg flt_ty (FpuRoundMode.ToZero) src_ext)))
        ;; Clamp the output to the destination type bounds.
        (uint_sat_reg dst_ty int_ty dst)))

;; Convert $F32X4 to $I32X4 (z15 instruction).
(rule (lower (has_type (and (vxrs_ext2_enabled) $I32X4)
                       (fcvt_to_uint_sat x @ (value_type $F32X4))))
      (fcvt_to_uint_reg $F32X4 (FpuRoundMode.ToZero) x))

;; Convert $F32X4 to $I32X4 (via two $F64X2 on z14).
(rule (lower (has_type (and (vxrs_ext2_disabled) $I32X4)
                       (fcvt_to_uint_sat x @ (value_type $F32X4))))
      (vec_pack_usat $I64X2
                     (fcvt_to_uint_reg $F64X2 (FpuRoundMode.ToZero)
                                       (fpromote_reg $F64X2 $F32X4 (vec_merge_high $I32X4 x x)))
                     (fcvt_to_uint_reg $F64X2 (FpuRoundMode.ToZero)
                                       (fpromote_reg $F64X2 $F32X4 (vec_merge_low $I32X4 x x)))))

;; Convert $F64X2 to $I64X2.
(rule (lower (has_type $I64X2 (fcvt_to_uint_sat x @ (value_type $F64X2))))
      (fcvt_to_uint_reg $F64X2 (FpuRoundMode.ToZero) x))


;;;; Rules for `fcvt_to_sint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Convert a scalar floating-point value in a register to a signed integer.
(rule (lower (has_type (fits_in_64 dst_ty)
                       (fcvt_to_sint_sat x @ (value_type src_ty))))
      (let ((src Reg (put_in_reg x))
            ;; Perform the conversion using the larger type size.
            (flt_ty Type (fcvt_flt_ty dst_ty src_ty))
            (int_ty Type (fcvt_int_ty dst_ty src_ty))
            (src_ext Reg (fpromote_reg flt_ty src_ty src))
            (dst Reg (fcvt_to_sint_reg flt_ty (FpuRoundMode.ToZero) src_ext))
            ;; In most special cases, the Z instruction already yields the
            ;; result expected by Cranelift semantics.  The only exception
            ;; is the case where the input was a NaN.  We explicitly check
            ;; for that and force the output to 0 in that case.
            (sat Reg (with_flags_reg (fcmp_reg src_ty src src)
                       (cmov_imm int_ty
                                 (floatcc_as_cond (FloatCC.Unordered)) 0 dst))))
        ;; Clamp the output to the destination type bounds.
        (sint_sat_reg dst_ty int_ty sat)))

;; Convert $F32X4 to $I32X4 (z15 instruction).
(rule (lower (has_type (and (vxrs_ext2_enabled) $I32X4)
                       (fcvt_to_sint_sat src @ (value_type $F32X4))))
      ;; See above for why we need to handle NaNs specially.
      (vec_select $I32X4
                  (fcvt_to_sint_reg $F32X4 (FpuRoundMode.ToZero) src)
                  (vec_imm $I32X4 0) (vec_fcmpeq $F32X4 src src)))

;; Convert $F32X4 to $I32X4 (via two $F64X2 on z14).
(rule (lower (has_type (and (vxrs_ext2_disabled) $I32X4)
                       (fcvt_to_sint_sat src @ (value_type $F32X4))))
      ;; See above for why we need to handle NaNs specially.
      (vec_select $I32X4
                  (vec_pack_ssat $I64X2
                                 (fcvt_to_sint_reg $F64X2 (FpuRoundMode.ToZero)
                                                   (fpromote_reg $F64X2 $F32X4 (vec_merge_high $I32X4 src src)))
                                 (fcvt_to_sint_reg $F64X2 (FpuRoundMode.ToZero)
                                                   (fpromote_reg $F64X2 $F32X4 (vec_merge_low $I32X4 src src))))
                  (vec_imm $I32X4 0) (vec_fcmpeq $F32X4 src src)))

;; Convert $F64X2 to $I64X2.
(rule (lower (has_type $I64X2 (fcvt_to_sint_sat src @ (value_type $F64X2))))
      ;; See above for why we need to handle NaNs specially.
      (vec_select $I64X2
                  (fcvt_to_sint_reg $F64X2 (FpuRoundMode.ToZero) src)
                  (vec_imm $I64X2 0) (vec_fcmpeq $F64X2 src src)))


;;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Reinterpret a 64-bit integer value as floating-point.
(rule (lower (has_type $F64 (bitcast x @ (value_type $I64))))
      (vec_insert_lane_undef $F64X2 x 0 (zero_reg)))

;; Reinterpret a 64-bit floating-point value as integer.
(rule (lower (has_type $I64 (bitcast x @ (value_type $F64))))
      (vec_extract_lane $F64X2 x 0 (zero_reg)))

;; Reinterpret a 32-bit integer value as floating-point.
(rule (lower (has_type $F32 (bitcast x @ (value_type $I32))))
      (vec_insert_lane_undef $F32X4 x 0 (zero_reg)))

;; Reinterpret a 32-bit floating-point value as integer.
(rule (lower (has_type $I32 (bitcast x @ (value_type $F32))))
      (vec_extract_lane $F32X4 x 0 (zero_reg)))
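
;; Note: scalar floating-point values live in vector registers on this
;; target, so these bitcasts are lane moves through lane 0 rather than
;; dedicated GPR<->FPR transfers.  The trailing `zero_reg` operand appears
;; to be the dynamic lane-index register of the underlying lane-access
;; instruction, unused here since the lane index is the immediate 0.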


;;;; Rules for `raw_bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Raw bitcast is always a no-op.
(rule (lower (raw_bitcast x)) x)


;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Insert vector lane from general-purpose register.
(rule (lower (insertlane x @ (value_type ty)
                         y @ (value_type in_ty)
                         (u8_from_uimm8 idx)))
      (if (ty_int_bool_ref_scalar_64 in_ty))
      (vec_insert_lane ty x y (be_lane_idx ty idx) (zero_reg)))
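
;; Note: CLIF numbers vector lanes in little-endian order, while the
;; hardware numbers them big-endian (lane 0 is leftmost).  `be_lane_idx`,
;; defined elsewhere in this file, performs that conversion; for an N-lane
;; vector it presumably maps lane `i` to lane `N - 1 - i`.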

;; Insert vector lane from floating-point register.
(rule (lower (insertlane x @ (value_type ty)
                         y @ (value_type (ty_scalar_float _))
                         (u8_from_uimm8 idx)))
      (vec_move_lane_and_insert ty x (be_lane_idx ty idx) y 0))

;; Insert vector lane from another vector lane.
(rule (lower (insertlane x @ (value_type ty)
                         (extractlane y (u8_from_uimm8 src_idx))
                         (u8_from_uimm8 dst_idx)))
      (vec_move_lane_and_insert ty x (be_lane_idx ty dst_idx)
                                y (be_lane_idx ty src_idx)))

;; Insert vector lane from signed 16-bit immediate.
(rule (lower (insertlane x @ (value_type ty) (i16_from_value y)
                         (u8_from_uimm8 idx)))
      (vec_insert_lane_imm ty x y (be_lane_idx ty idx)))

;; Insert vector lane from big-endian memory.
(rule (lower (insertlane x @ (value_type ty) (sinkable_load y)
                         (u8_from_uimm8 idx)))
      (vec_load_lane ty x (sink_load y) (be_lane_idx ty idx)))

;; Insert vector lane from little-endian memory.
(rule (lower (insertlane x @ (value_type ty) (sinkable_load_little y)
                         (u8_from_uimm8 idx)))
      (vec_load_lane_little ty x (sink_load y) (be_lane_idx ty idx)))


;; Helper to extract one lane from a vector and insert it into another.
(decl vec_move_lane_and_insert (Type Reg u8 Reg u8) Reg)

;; For 64-bit elements we always use VPDI.
(rule (vec_move_lane_and_insert ty @ (multi_lane 64 _) dst 0 src src_idx)
      (vec_permute_dw_imm ty src src_idx dst 1))
(rule (vec_move_lane_and_insert ty @ (multi_lane 64 _) dst 1 src src_idx)
      (vec_permute_dw_imm ty dst 0 src src_idx))

;; If source and destination index are the same, use vec_select.
(rule (vec_move_lane_and_insert ty dst idx src idx)
      (vec_select ty src
                  dst (vec_imm_byte_mask ty (lane_byte_mask ty idx))))

;; Otherwise replicate source first and then use vec_select.
(rule (vec_move_lane_and_insert ty dst dst_idx src src_idx)
      (vec_select ty (vec_replicate_lane ty src src_idx)
                  dst (vec_imm_byte_mask ty (lane_byte_mask ty dst_idx))))


;; Helper to implement a generic little-endian variant of vec_load_lane.
(decl vec_load_lane_little (Type Reg MemArg u8) Reg)

;; 8-bit little-endian lane loads can be performed via a normal load.
(rule (vec_load_lane_little ty @ (multi_lane 8 _) dst addr lane_imm)
      (vec_load_lane ty dst addr lane_imm))

;; On z15, we have instructions to perform little-endian loads.
(rule (vec_load_lane_little (and (vxrs_ext2_enabled)
                                 ty @ (multi_lane 16 _)) dst addr lane_imm)
      (vec_load_lane_rev ty dst addr lane_imm))
(rule (vec_load_lane_little (and (vxrs_ext2_enabled)
                                 ty @ (multi_lane 32 _)) dst addr lane_imm)
      (vec_load_lane_rev ty dst addr lane_imm))
(rule (vec_load_lane_little (and (vxrs_ext2_enabled)
                                 ty @ (multi_lane 64 _)) dst addr lane_imm)
      (vec_load_lane_rev ty dst addr lane_imm))

;; On z14, use a little-endian load to GPR followed by vec_insert_lane.
(rule (vec_load_lane_little (and (vxrs_ext2_disabled)
                                 ty @ (multi_lane 16 _)) dst addr lane_imm)
      (vec_insert_lane ty dst (loadrev16 addr) lane_imm (zero_reg)))
(rule (vec_load_lane_little (and (vxrs_ext2_disabled)
                                 ty @ (multi_lane 32 _)) dst addr lane_imm)
      (vec_insert_lane ty dst (loadrev32 addr) lane_imm (zero_reg)))
(rule (vec_load_lane_little (and (vxrs_ext2_disabled)
                                 ty @ (multi_lane 64 _)) dst addr lane_imm)
      (vec_insert_lane ty dst (loadrev64 addr) lane_imm (zero_reg)))


;; Helper to implement a generic little-endian variant of vec_load_lane_undef.
(decl vec_load_lane_little_undef (Type MemArg u8) Reg)

;; 8-bit little-endian lane loads can be performed via a normal load.
(rule (vec_load_lane_little_undef ty @ (multi_lane 8 _) addr lane_imm)
      (vec_load_lane_undef ty addr lane_imm))

;; On z15, we have instructions to perform little-endian loads.
(rule (vec_load_lane_little_undef (and (vxrs_ext2_enabled)
                                       ty @ (multi_lane 16 _)) addr lane_imm)
      (vec_load_lane_rev_undef ty addr lane_imm))
(rule (vec_load_lane_little_undef (and (vxrs_ext2_enabled)
                                       ty @ (multi_lane 32 _)) addr lane_imm)
      (vec_load_lane_rev_undef ty addr lane_imm))
(rule (vec_load_lane_little_undef (and (vxrs_ext2_enabled)
                                       ty @ (multi_lane 64 _)) addr lane_imm)
      (vec_load_lane_rev_undef ty addr lane_imm))

;; On z14, use a little-endian load to GPR followed by vec_insert_lane_undef.
(rule (vec_load_lane_little_undef (and (vxrs_ext2_disabled)
                                       ty @ (multi_lane 16 _)) addr lane_imm)
      (vec_insert_lane_undef ty (loadrev16 addr) lane_imm (zero_reg)))
(rule (vec_load_lane_little_undef (and (vxrs_ext2_disabled)
                                       ty @ (multi_lane 32 _)) addr lane_imm)
      (vec_insert_lane_undef ty (loadrev32 addr) lane_imm (zero_reg)))
(rule (vec_load_lane_little_undef (and (vxrs_ext2_disabled)
                                       ty @ (multi_lane 64 _)) addr lane_imm)
      (vec_insert_lane_undef ty (loadrev64 addr) lane_imm (zero_reg)))


;;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Extract vector lane to general-purpose register.
(rule (lower (has_type out_ty
                       (extractlane x @ (value_type ty) (u8_from_uimm8 idx))))
      (if (ty_int_bool_ref_scalar_64 out_ty))
      (vec_extract_lane ty x (be_lane_idx ty idx) (zero_reg)))

;; Extract vector lane to floating-point register.
(rule (lower (has_type (ty_scalar_float _)
                       (extractlane x @ (value_type ty) (u8_from_uimm8 idx))))
      (vec_replicate_lane ty x (be_lane_idx ty idx)))

;; Extract vector lane and store to big-endian memory.
(rule (lower (store flags @ (bigendian)
                    (extractlane x @ (value_type ty) (u8_from_uimm8 idx))
                    addr offset))
      (side_effect (vec_store_lane ty x
                     (lower_address flags addr offset) (be_lane_idx ty idx))))

;; Extract vector lane and store to little-endian memory.
(rule (lower (store flags @ (littleendian)
                    (extractlane x @ (value_type ty) (u8_from_uimm8 idx))
                    addr offset))
      (side_effect (vec_store_lane_little ty x
                     (lower_address flags addr offset) (be_lane_idx ty idx))))


;; Helper to implement a generic little-endian variant of vec_store_lane.
(decl vec_store_lane_little (Type Reg MemArg u8) SideEffectNoResult)

;; 8-bit little-endian lane stores can be performed via a normal store.
(rule (vec_store_lane_little ty @ (multi_lane 8 _) src addr lane_imm)
      (vec_store_lane ty src addr lane_imm))

;; On z15, we have instructions to perform little-endian stores.
(rule (vec_store_lane_little (and (vxrs_ext2_enabled)
                                  ty @ (multi_lane 16 _)) src addr lane_imm)
      (vec_store_lane_rev ty src addr lane_imm))
(rule (vec_store_lane_little (and (vxrs_ext2_enabled)
                                  ty @ (multi_lane 32 _)) src addr lane_imm)
      (vec_store_lane_rev ty src addr lane_imm))
(rule (vec_store_lane_little (and (vxrs_ext2_enabled)
                                  ty @ (multi_lane 64 _)) src addr lane_imm)
      (vec_store_lane_rev ty src addr lane_imm))

;; On z14, use vec_extract_lane followed by a little-endian store from GPR.
(rule (vec_store_lane_little (and (vxrs_ext2_disabled)
                                  ty @ (multi_lane 16 _)) src addr lane_imm)
      (storerev16 (vec_extract_lane ty src lane_imm (zero_reg)) addr))
(rule (vec_store_lane_little (and (vxrs_ext2_disabled)
                                  ty @ (multi_lane 32 _)) src addr lane_imm)
      (storerev32 (vec_extract_lane ty src lane_imm (zero_reg)) addr))
(rule (vec_store_lane_little (and (vxrs_ext2_disabled)
                                  ty @ (multi_lane 64 _)) src addr lane_imm)
      (storerev64 (vec_extract_lane ty src lane_imm (zero_reg)) addr))


;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Load replicated value from general-purpose register.
(rule (lower (has_type ty (splat x @ (value_type in_ty))))
      (if (ty_int_bool_ref_scalar_64 in_ty))
      (vec_replicate_lane ty (vec_insert_lane_undef ty x 0 (zero_reg)) 0))

;; Load replicated value from floating-point register.
(rule (lower (has_type ty (splat
                            x @ (value_type (ty_scalar_float _)))))
      (vec_replicate_lane ty x 0))

;; Load replicated value from vector lane.
(rule (lower (has_type ty (splat (extractlane x (u8_from_uimm8 idx)))))
      (vec_replicate_lane ty x (be_lane_idx ty idx)))

;; Load replicated 16-bit immediate value.
(rule (lower (has_type ty (splat (i16_from_value x))))
      (vec_imm_replicate ty x))

;; Load replicated value from big-endian memory.
(rule (lower (has_type ty (splat (sinkable_load x))))
      (vec_load_replicate ty (sink_load x)))

;; Load replicated value from little-endian memory.
(rule (lower (has_type ty (splat (sinkable_load_little x))))
      (vec_load_replicate_little ty (sink_load x)))


;; Helper to implement a generic little-endian variant of vec_load_replicate.
(decl vec_load_replicate_little (Type MemArg) Reg)

;; 8-bit little-endian loads can be performed via a normal load.
(rule (vec_load_replicate_little ty @ (multi_lane 8 _) addr)
      (vec_load_replicate ty addr))

;; On z15, we have instructions to perform little-endian loads.
(rule (vec_load_replicate_little (and (vxrs_ext2_enabled)
                                      ty @ (multi_lane 16 _)) addr)
      (vec_load_replicate_rev ty addr))
(rule (vec_load_replicate_little (and (vxrs_ext2_enabled)
                                      ty @ (multi_lane 32 _)) addr)
      (vec_load_replicate_rev ty addr))
(rule (vec_load_replicate_little (and (vxrs_ext2_enabled)
                                      ty @ (multi_lane 64 _)) addr)
      (vec_load_replicate_rev ty addr))

;; On z14, use a little-endian load (via GPR) and replicate.
(rule (vec_load_replicate_little (and (vxrs_ext2_disabled)
                                      ty @ (multi_lane 16 _)) addr)
      (vec_replicate_lane ty (vec_load_lane_little_undef ty addr 0) 0))
(rule (vec_load_replicate_little (and (vxrs_ext2_disabled)
                                      ty @ (multi_lane 32 _)) addr)
      (vec_replicate_lane ty (vec_load_lane_little_undef ty addr 0) 0))
(rule (vec_load_replicate_little (and (vxrs_ext2_disabled)
                                      ty @ (multi_lane 64 _)) addr)
      (vec_replicate_lane ty (vec_load_lane_little_undef ty addr 0) 0))


;;;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Load scalar value from general-purpose register.
(rule (lower (has_type ty (scalar_to_vector
                            x @ (value_type in_ty))))
      (if (ty_int_bool_ref_scalar_64 in_ty))
      (vec_insert_lane ty (vec_imm ty 0) x (be_lane_idx ty 0) (zero_reg)))

;; Load scalar value from floating-point register.
(rule (lower (has_type ty (scalar_to_vector
                            x @ (value_type (ty_scalar_float _)))))
      (vec_move_lane_and_zero ty (be_lane_idx ty 0) x 0))

;; Load scalar value from vector lane.
(rule (lower (has_type ty (scalar_to_vector
                            (extractlane x (u8_from_uimm8 idx)))))
      (vec_move_lane_and_zero ty (be_lane_idx ty 0) x (be_lane_idx ty idx)))

;; Load scalar 16-bit immediate value.
(rule (lower (has_type ty (scalar_to_vector (i16_from_value x))))
      (vec_insert_lane_imm ty (vec_imm ty 0) x (be_lane_idx ty 0)))

;; Load scalar value from big-endian memory.
(rule (lower (has_type ty (scalar_to_vector (sinkable_load x))))
      (vec_load_lane ty (vec_imm ty 0) (sink_load x) (be_lane_idx ty 0)))

;; Load scalar value from little-endian memory.
(rule (lower (has_type ty (scalar_to_vector (sinkable_load_little x))))
      (vec_load_lane_little ty (vec_imm ty 0) (sink_load x) (be_lane_idx ty 0)))


;; Helper to extract one lane from a vector and insert it into a zero vector.
(decl vec_move_lane_and_zero (Type u8 Reg u8) Reg)

;; For 64-bit elements we always use VPDI.
(rule (vec_move_lane_and_zero ty @ (multi_lane 64 _) 0 src src_idx)
      (vec_permute_dw_imm ty src src_idx (vec_imm ty 0) 0))
(rule (vec_move_lane_and_zero ty @ (multi_lane 64 _) 1 src src_idx)
      (vec_permute_dw_imm ty (vec_imm ty 0) 0 src src_idx))

;; If source and destination index are the same, simply mask to this lane.
(rule (vec_move_lane_and_zero ty idx src idx)
      (vec_and ty src
               (vec_imm_byte_mask ty (lane_byte_mask ty idx))))

;; Otherwise replicate source first and then mask to the lane.
(rule (vec_move_lane_and_zero ty dst_idx src src_idx)
      (vec_and ty (vec_replicate_lane ty src src_idx)
               (vec_imm_byte_mask ty (lane_byte_mask ty dst_idx))))


;;;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; General case: use vec_permute and then mask off zero lanes.
(rule (lower (shuffle x y (shuffle_mask permute_mask and_mask)))
      (vec_and $I8X16 (vec_imm_byte_mask $I8X16 and_mask)
               (vec_permute $I8X16 x y (vec_imm $I8X16 permute_mask))))
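
;; Note: the `shuffle_mask` extractor (defined elsewhere in this file)
;; splits the CLIF shuffle immediate into a byte permute mask plus a
;; 16-bit `and_mask` with one bit per result byte.  Shuffle indices
;; outside the 0..31 range must produce zero bytes; clearing the
;; corresponding `and_mask` bits achieves that after the permute, and
;; an `and_mask` of 65535 means no lane needs zeroing.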

;; If the pattern has no zero lanes, just a vec_permute suffices.
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (vec_permute $I8X16 x y (vec_imm $I8X16 permute_mask)))

;; Special patterns that can be implemented via MERGE HIGH.
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) permute_mask)
      (vec_merge_high $I64X2 x y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23) permute_mask)
      (vec_merge_high $I32X4 x y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23) permute_mask)
      (vec_merge_high $I16X8 x y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23) permute_mask)
      (vec_merge_high $I8X16 x y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 16 17 18 19 20 21 22 23 0 1 2 3 4 5 6 7) permute_mask)
      (vec_merge_high $I64X2 y x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 16 17 18 19 0 1 2 3 20 21 22 23 4 5 6 7) permute_mask)
      (vec_merge_high $I32X4 y x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 16 17 0 1 18 19 2 3 20 21 4 5 22 23 6 7) permute_mask)
      (vec_merge_high $I16X8 y x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 16 0 17 1 18 2 19 3 20 4 21 5 22 6 23 7) permute_mask)
      (vec_merge_high $I8X16 y x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7) permute_mask)
      (vec_merge_high $I64X2 x x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 0 1 2 3 0 1 2 3 4 5 6 7 4 5 6 7) permute_mask)
      (vec_merge_high $I32X4 x x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 0 1 0 1 2 3 2 3 4 5 4 5 6 7 6 7) permute_mask)
      (vec_merge_high $I16X8 x x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7) permute_mask)
      (vec_merge_high $I8X16 x x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 16 17 18 19 20 21 22 23 16 17 18 19 20 21 22 23) permute_mask)
      (vec_merge_high $I64X2 y y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 16 17 18 19 16 17 18 19 20 21 22 23 20 21 22 23) permute_mask)
      (vec_merge_high $I32X4 y y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 16 17 16 17 18 19 18 19 20 21 20 21 22 23 22 23) permute_mask)
      (vec_merge_high $I16X8 y y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 16 16 17 17 18 18 19 19 20 20 21 21 22 22 23 23) permute_mask)
      (vec_merge_high $I8X16 y y))

;; Special patterns that can be implemented via MERGE LOW.
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) permute_mask)
      (vec_merge_low $I64X2 x y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31) permute_mask)
      (vec_merge_low $I32X4 x y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31) permute_mask)
      (vec_merge_low $I16X8 x y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 8 24 9 25 10 26 11 27 12 28 13 29 14 30 15 31) permute_mask)
      (vec_merge_low $I8X16 x y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 24 25 26 27 28 29 30 31 8 9 10 11 12 13 14 15) permute_mask)
      (vec_merge_low $I64X2 y x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 24 25 26 27 8 9 10 11 28 29 30 31 12 13 14 15) permute_mask)
      (vec_merge_low $I32X4 y x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 24 25 8 9 26 27 10 11 28 29 12 13 30 31 14 15) permute_mask)
      (vec_merge_low $I16X8 y x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 24 8 25 9 26 10 27 11 28 12 29 13 30 14 31 15) permute_mask)
      (vec_merge_low $I8X16 y x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15) permute_mask)
      (vec_merge_low $I64X2 x x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 8 9 10 11 8 9 10 11 12 13 14 15 12 13 14 15) permute_mask)
      (vec_merge_low $I32X4 x x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 8 9 8 9 10 11 10 11 12 13 12 13 14 15 14 15) permute_mask)
      (vec_merge_low $I16X8 x x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15) permute_mask)
      (vec_merge_low $I8X16 x x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 24 25 26 27 28 29 30 31 24 25 26 27 28 29 30 31) permute_mask)
      (vec_merge_low $I64X2 y y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 24 25 26 27 24 25 26 27 28 29 30 31 28 29 30 31) permute_mask)
      (vec_merge_low $I32X4 y y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 24 25 24 25 26 27 26 27 28 29 28 29 30 31 30 31) permute_mask)
      (vec_merge_low $I16X8 y y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 24 24 25 25 26 26 27 27 28 28 29 29 30 30 31 31) permute_mask)
      (vec_merge_low $I8X16 y y))

;; Special patterns that can be implemented via PACK.
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31) permute_mask)
      (vec_pack $I64X2 x y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 2 3 6 7 10 11 14 15 18 19 22 23 26 27 30 31) permute_mask)
      (vec_pack $I32X4 x y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31) permute_mask)
      (vec_pack $I16X8 x y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 20 21 22 23 28 29 30 31 4 5 6 7 12 13 14 15) permute_mask)
      (vec_pack $I64X2 y x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 18 19 22 23 26 27 30 31 2 3 6 7 10 11 14 15) permute_mask)
      (vec_pack $I32X4 y x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 17 19 21 23 25 27 29 31 1 3 5 7 9 11 13 15) permute_mask)
      (vec_pack $I16X8 y x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 4 5 6 7 12 13 14 15 4 5 6 7 12 13 14 15) permute_mask)
      (vec_pack $I64X2 x x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 2 3 6 7 10 11 14 15 2 3 6 7 10 11 14 15) permute_mask)
      (vec_pack $I32X4 x x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 1 3 5 7 9 11 13 15 1 3 5 7 9 11 13 15) permute_mask)
      (vec_pack $I16X8 x x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 20 21 22 23 28 29 30 31 20 21 22 23 28 29 30 31) permute_mask)
      (vec_pack $I64X2 y y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 18 19 22 23 26 27 30 31 18 19 22 23 26 27 30 31) permute_mask)
      (vec_pack $I32X4 y y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 17 19 21 23 25 27 29 31 17 19 21 23 25 27 29 31) permute_mask)
      (vec_pack $I16X8 y y))

;; Special patterns that can be implemented via UNPACK HIGH.
(rule (lower (shuffle x y (shuffle_mask permute_mask 3855)))
      (if-let (imm8x16 _ _ _ _ 0 1 2 3 _ _ _ _ 4 5 6 7) permute_mask)
      (vec_unpacku_high $I32X4 x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 13107)))
      (if-let (imm8x16 _ _ 0 1 _ _ 2 3 _ _ 4 5 _ _ 6 7) permute_mask)
      (vec_unpacku_high $I16X8 x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 21845)))
      (if-let (imm8x16 _ 0 _ 1 _ 2 _ 3 _ 4 _ 5 _ 6 _ 7) permute_mask)
      (vec_unpacku_high $I8X16 x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 3855)))
      (if-let (imm8x16 _ _ _ _ 16 17 18 19 _ _ _ _ 20 21 22 23) permute_mask)
      (vec_unpacku_high $I32X4 y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 13107)))
      (if-let (imm8x16 _ _ 16 17 _ _ 18 19 _ _ 20 21 _ _ 22 23) permute_mask)
      (vec_unpacku_high $I16X8 y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 21845)))
      (if-let (imm8x16 _ 16 _ 17 _ 18 _ 19 _ 20 _ 21 _ 22 _ 23) permute_mask)
      (vec_unpacku_high $I8X16 y))

;; Special patterns that can be implemented via UNPACK LOW.
(rule (lower (shuffle x y (shuffle_mask permute_mask 3855)))
      (if-let (imm8x16 _ _ _ _ 8 9 10 11 _ _ _ _ 12 13 14 15) permute_mask)
      (vec_unpacku_low $I32X4 x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 13107)))
      (if-let (imm8x16 _ _ 8 9 _ _ 10 11 _ _ 12 13 _ _ 14 15) permute_mask)
      (vec_unpacku_low $I16X8 x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 21845)))
      (if-let (imm8x16 _ 8 _ 9 _ 10 _ 11 _ 12 _ 13 _ 14 _ 15) permute_mask)
      (vec_unpacku_low $I8X16 x))
(rule (lower (shuffle x y (shuffle_mask permute_mask 3855)))
      (if-let (imm8x16 _ _ _ _ 24 25 26 27 _ _ _ _ 28 29 30 31) permute_mask)
      (vec_unpacku_low $I32X4 y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 13107)))
      (if-let (imm8x16 _ _ 24 25 _ _ 26 27 _ _ 28 29 _ _ 30 31) permute_mask)
      (vec_unpacku_low $I16X8 y))
(rule (lower (shuffle x y (shuffle_mask permute_mask 21845)))
      (if-let (imm8x16 _ 24 _ 25 _ 26 _ 27 _ 28 _ 29 _ 30 _ 31) permute_mask)
      (vec_unpacku_low $I8X16 y))

;; Special patterns that can be implemented via PERMUTE DOUBLEWORD IMMEDIATE.
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 0 1 2 3 4 5 6 7 24 25 26 27 28 29 30 31) permute_mask)
      (vec_permute_dw_imm $I8X16 x 0 y 1))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23) permute_mask)
      (vec_permute_dw_imm $I8X16 x 1 y 0))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 16 17 18 19 20 21 22 23 8 9 10 11 12 13 14 15) permute_mask)
      (vec_permute_dw_imm $I8X16 y 0 x 1))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 24 25 26 27 28 29 30 31 0 1 2 3 4 5 6 7) permute_mask)
      (vec_permute_dw_imm $I8X16 y 1 x 0))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) permute_mask)
      (vec_permute_dw_imm $I8X16 x 0 x 1))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7) permute_mask)
      (vec_permute_dw_imm $I8X16 x 1 x 0))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) permute_mask)
      (vec_permute_dw_imm $I8X16 y 0 y 1))
(rule (lower (shuffle x y (shuffle_mask permute_mask 65535)))
      (if-let (imm8x16 24 25 26 27 28 29 30 31 16 17 18 19 20 21 22 23) permute_mask)
      (vec_permute_dw_imm $I8X16 y 1 y 0))


;;;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; We need to modify the lane mask at runtime in two ways:
;; - convert from little-endian to big-endian lane numbering
;; - handle mask elements outside the range 0..15 by zeroing the lane
;;
;; To do so efficiently, we compute:
;;    permute-lane-element := umax(239, ~swizzle-lane-element)
;; which has the following effect:
;;    elements 0 .. 15  -->  255 .. 240 (i.e. 31 .. 16 mod 32)
;;    everything else   -->  239 (i.e. 15 mod 32)
;;
;; Then, we can use a single permute instruction with
;; - a zero vector as first operand (covering lane 15)
;; - the input vector as second operand (covering lanes 16 .. 31)
;; to implement the required swizzle semantics.

(rule (lower (has_type (ty_vec128 ty) (swizzle x y)))
      (vec_permute ty (vec_imm ty 0) x
                   (vec_umax $I8X16 (vec_imm_splat $I8X16 239)
                             (vec_not $I8X16 y))))
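
;; Worked example: a swizzle index of 0 (take little-endian byte 0) gives
;; ~0 == 255 and umax(239, 255) == 255, i.e. permute element 31 (mod 32):
;; the rightmost byte of the second operand, which is little-endian byte 0
;; of the input.  An out-of-range index such as 200 gives ~200 == 55 and
;; umax(239, 55) == 239, i.e. element 15: a byte of the zero vector, so
;; the lane is zeroed as required.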


;;;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Load the address of a stack slot.
(rule (lower (has_type ty (stack_addr stack_slot offset)))
      (stack_addr_impl ty stack_slot offset))


;;;; Rules for `func_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Load the address of a function, target reachable via PC-relative instruction.
(rule (lower (func_addr (func_ref_data _ name (reloc_distance_near))))
      (load_addr (memarg_symbol name 0 (memflags_trusted))))

;; Load the address of a function, general case.
(rule (lower (func_addr (func_ref_data _ name _)))
      (load_ext_name_far name 0))


;;;; Rules for `symbol_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Load the address of a symbol, target reachable via PC-relative instruction.
(rule (lower (symbol_value (symbol_value_data name (reloc_distance_near)
                                              off)))
      (if-let offset (memarg_symbol_offset off))
      (load_addr (memarg_symbol name offset (memflags_trusted))))

;; Load the address of a symbol, general case.
(rule (lower (symbol_value (symbol_value_data name _ offset)))
      (load_ext_name_far name offset))


;;;; Rules for `load` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Load 8-bit integers.
(rule (lower (has_type $I8 (load flags addr offset)))
      (zext32_mem $I8 (lower_address flags addr offset)))

;; Load 16-bit big-endian integers.
(rule (lower (has_type $I16 (load flags @ (bigendian) addr offset)))
      (zext32_mem $I16 (lower_address flags addr offset)))

;; Load 16-bit little-endian integers.
(rule (lower (has_type $I16 (load flags @ (littleendian) addr offset)))
      (loadrev16 (lower_address flags addr offset)))

;; Load 32-bit big-endian integers.
(rule (lower (has_type $I32 (load flags @ (bigendian) addr offset)))
      (load32 (lower_address flags addr offset)))

;; Load 32-bit little-endian integers.
(rule (lower (has_type $I32 (load flags @ (littleendian) addr offset)))
      (loadrev32 (lower_address flags addr offset)))

;; Load 64-bit big-endian integers.
(rule (lower (has_type $I64 (load flags @ (bigendian) addr offset)))
      (load64 (lower_address flags addr offset)))

;; Load 64-bit little-endian integers.
(rule (lower (has_type $I64 (load flags @ (littleendian) addr offset)))
      (loadrev64 (lower_address flags addr offset)))

;; Load 64-bit big-endian references.
(rule (lower (has_type $R64 (load flags @ (bigendian) addr offset)))
      (load64 (lower_address flags addr offset)))

;; Load 64-bit little-endian references.
(rule (lower (has_type $R64 (load flags @ (littleendian) addr offset)))
      (loadrev64 (lower_address flags addr offset)))

;; Load 32-bit big-endian floating-point values (as vector lane).
(rule (lower (has_type $F32 (load flags @ (bigendian) addr offset)))
      (vec_load_lane_undef $F32X4 (lower_address flags addr offset) 0))

;; Load 32-bit little-endian floating-point values (as vector lane).
(rule (lower (has_type $F32 (load flags @ (littleendian) addr offset)))
      (vec_load_lane_little_undef $F32X4 (lower_address flags addr offset) 0))

;; Load 64-bit big-endian floating-point values (as vector lane).
(rule (lower (has_type $F64 (load flags @ (bigendian) addr offset)))
      (vec_load_lane_undef $F64X2 (lower_address flags addr offset) 0))

;; Load 64-bit little-endian floating-point values (as vector lane).
(rule (lower (has_type $F64 (load flags @ (littleendian) addr offset)))
      (vec_load_lane_little_undef $F64X2 (lower_address flags addr offset) 0))

;; Load 128-bit big-endian vector values.
(rule (lower (has_type (ty_vec128 ty) (load flags @ (bigendian) addr offset)))
      (vec_load ty (lower_address flags addr offset)))

;; Load 128-bit little-endian vector values (z15 instruction).
(rule (lower (has_type (and (vxrs_ext2_enabled) (ty_vec128 ty))
                       (load flags @ (littleendian) addr offset)))
      (vec_loadrev ty (lower_address flags addr offset)))

;; Load 128-bit little-endian vector values (via GPRs on z14).
(rule (lower (has_type (and (vxrs_ext2_disabled) (ty_vec128 ty))
                       (load flags @ (littleendian) addr offset)))
      (let ((lo_addr MemArg (lower_address_bias flags addr offset 0))
            (hi_addr MemArg (lower_address_bias flags addr offset 8))
            (lo_val Reg (loadrev64 lo_addr))
            (hi_val Reg (loadrev64 hi_addr)))
        (mov_to_vec128 ty hi_val lo_val)))
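
;; Note: byte-reversing each 8-byte half individually and then placing the
;; half loaded from the higher address into the leftmost doubleword amounts
;; to a full 16-byte byte reversal, which is what a little-endian vector
;; load needs on this big-endian target.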


;;;; Rules for `uload8` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 16- or 32-bit target types.
(rule (lower (has_type (gpr32_ty _ty) (uload8 flags addr offset)))
      (zext32_mem $I8 (lower_address flags addr offset)))

;; 64-bit target types.
(rule (lower (has_type (gpr64_ty _ty) (uload8 flags addr offset)))
      (zext64_mem $I8 (lower_address flags addr offset)))


;;;; Rules for `sload8` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 16- or 32-bit target types.
(rule (lower (has_type (gpr32_ty _ty) (sload8 flags addr offset)))
      (sext32_mem $I8 (lower_address flags addr offset)))

;; 64-bit target types.
(rule (lower (has_type (gpr64_ty _ty) (sload8 flags addr offset)))
      (sext64_mem $I8 (lower_address flags addr offset)))


;;;; Rules for `uload16` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 32-bit target type, big-endian source value.
(rule (lower (has_type (gpr32_ty _ty)
                       (uload16 flags @ (bigendian) addr offset)))
      (zext32_mem $I16 (lower_address flags addr offset)))

;; 32-bit target type, little-endian source value (via explicit extension).
(rule (lower (has_type (gpr32_ty _ty)
                       (uload16 flags @ (littleendian) addr offset)))
      (let ((reg16 Reg (loadrev16 (lower_address flags addr offset))))
        (zext32_reg $I16 reg16)))

;; 64-bit target type, big-endian source value.
(rule (lower (has_type (gpr64_ty _ty)
                       (uload16 flags @ (bigendian) addr offset)))
      (zext64_mem $I16 (lower_address flags addr offset)))

;; 64-bit target type, little-endian source value (via explicit extension).
(rule (lower (has_type (gpr64_ty _ty)
                       (uload16 flags @ (littleendian) addr offset)))
      (let ((reg16 Reg (loadrev16 (lower_address flags addr offset))))
        (zext64_reg $I16 reg16)))


;;;; Rules for `sload16` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 32-bit target type, big-endian source value.
(rule (lower (has_type (gpr32_ty _ty)
                       (sload16 flags @ (bigendian) addr offset)))
      (sext32_mem $I16 (lower_address flags addr offset)))

;; 32-bit target type, little-endian source value (via explicit extension).
(rule (lower (has_type (gpr32_ty _ty)
                       (sload16 flags @ (littleendian) addr offset)))
      (let ((reg16 Reg (loadrev16 (lower_address flags addr offset))))
        (sext32_reg $I16 reg16)))

;; 64-bit target type, big-endian source value.
(rule (lower (has_type (gpr64_ty _ty)
                       (sload16 flags @ (bigendian) addr offset)))
      (sext64_mem $I16 (lower_address flags addr offset)))

;; 64-bit target type, little-endian source value (via explicit extension).
(rule (lower (has_type (gpr64_ty _ty)
                       (sload16 flags @ (littleendian) addr offset)))
      (let ((reg16 Reg (loadrev16 (lower_address flags addr offset))))
        (sext64_reg $I16 reg16)))


;;;; Rules for `uload32` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 64-bit target type, big-endian source value.
(rule (lower (has_type (gpr64_ty _ty)
                       (uload32 flags @ (bigendian) addr offset)))
      (zext64_mem $I32 (lower_address flags addr offset)))

;; 64-bit target type, little-endian source value (via explicit extension).
(rule (lower (has_type (gpr64_ty _ty)
                       (uload32 flags @ (littleendian) addr offset)))
      (let ((reg32 Reg (loadrev32 (lower_address flags addr offset))))
        (zext64_reg $I32 reg32)))


;;;; Rules for `sload32` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 64-bit target type, big-endian source value.
(rule (lower (has_type (gpr64_ty _ty)
                       (sload32 flags @ (bigendian) addr offset)))
      (sext64_mem $I32 (lower_address flags addr offset)))

;; 64-bit target type, little-endian source value (via explicit extension).
(rule (lower (has_type (gpr64_ty _ty)
                       (sload32 flags @ (littleendian) addr offset)))
      (let ((reg32 Reg (loadrev32 (lower_address flags addr offset))))
        (sext64_reg $I32 reg32)))


;;;; Rules for `uloadNxM` and `sloadNxM` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Unsigned 8->16 bit extension, big-endian source value.
(rule (lower (has_type $I16X8 (uload8x8 flags @ (bigendian) addr offset)))
      (vec_unpacku_high $I8X16
        (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))

;; Unsigned 8->16 bit extension, little-endian source value.
(rule (lower (has_type $I16X8 (uload8x8 flags @ (littleendian) addr offset)))
      (vec_unpacku_high $I8X16
        (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))

;; Signed 8->16 bit extension, big-endian source value.
(rule (lower (has_type $I16X8 (sload8x8 flags @ (bigendian) addr offset)))
      (vec_unpacks_high $I8X16
        (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))

;; Signed 8->16 bit extension, little-endian source value.
(rule (lower (has_type $I16X8 (sload8x8 flags @ (littleendian) addr offset)))
      (vec_unpacks_high $I8X16
        (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))

;; Unsigned 16->32 bit extension, big-endian source value.
(rule (lower (has_type $I32X4 (uload16x4 flags @ (bigendian) addr offset)))
      (vec_unpacku_high $I16X8
        (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))

;; Unsigned 16->32 bit extension, little-endian source value.
(rule (lower (has_type $I32X4 (uload16x4 flags @ (littleendian) addr offset)))
      (vec_unpacku_high $I16X8
        (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))

;; Signed 16->32 bit extension, big-endian source value.
(rule (lower (has_type $I32X4 (sload16x4 flags @ (bigendian) addr offset)))
      (vec_unpacks_high $I16X8
        (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))

;; Signed 16->32 bit extension, little-endian source value.
(rule (lower (has_type $I32X4 (sload16x4 flags @ (littleendian) addr offset)))
      (vec_unpacks_high $I16X8
        (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))

;; Unsigned 32->64 bit extension, big-endian source value.
(rule (lower (has_type $I64X2 (uload32x2 flags @ (bigendian) addr offset)))
      (vec_unpacku_high $I32X4
        (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))

;; Unsigned 32->64 bit extension, little-endian source value.
(rule (lower (has_type $I64X2 (uload32x2 flags @ (littleendian) addr offset)))
      (vec_unpacku_high $I32X4
        (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))

;; Signed 32->64 bit extension, big-endian source value.
(rule (lower (has_type $I64X2 (sload32x2 flags @ (bigendian) addr offset)))
      (vec_unpacks_high $I32X4
        (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)))

;; Signed 32->64 bit extension, little-endian source value.
(rule (lower (has_type $I64X2 (sload32x2 flags @ (littleendian) addr offset)))
      (vec_unpacks_high $I32X4
        (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)))


;;;; Rules for `store` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The actual store logic for integer types is identical for the `store`,
;; `istoreNN`, and `atomic_store` instructions, so we share common helpers.

;; Store 8-bit integer type, main lowering entry point.
(rule (lower (store flags val @ (value_type $I8) addr offset))
      (side_effect (istore8_impl flags val addr offset)))

;; Store 16-bit integer type, main lowering entry point.
(rule (lower (store flags val @ (value_type $I16) addr offset))
      (side_effect (istore16_impl flags val addr offset)))

;; Store 32-bit integer type, main lowering entry point.
(rule (lower (store flags val @ (value_type $I32) addr offset))
      (side_effect (istore32_impl flags val addr offset)))

;; Store 64-bit integer type, main lowering entry point.
(rule (lower (store flags val @ (value_type $I64) addr offset))
      (side_effect (istore64_impl flags val addr offset)))

;; Store 64-bit reference type, main lowering entry point.
(rule (lower (store flags val @ (value_type $R64) addr offset))
      (side_effect (istore64_impl flags val addr offset)))

;; Store 32-bit big-endian floating-point type (as vector lane).
(rule (lower (store flags @ (bigendian)
                    val @ (value_type $F32) addr offset))
      (side_effect (vec_store_lane $F32X4 val
                     (lower_address flags addr offset) 0)))

;; Store 32-bit little-endian floating-point type (as vector lane).
(rule (lower (store flags @ (littleendian)
                    val @ (value_type $F32) addr offset))
      (side_effect (vec_store_lane_little $F32X4 val
                     (lower_address flags addr offset) 0)))

;; Store 64-bit big-endian floating-point type (as vector lane).
(rule (lower (store flags @ (bigendian)
                    val @ (value_type $F64) addr offset))
      (side_effect (vec_store_lane $F64X2 val
                     (lower_address flags addr offset) 0)))

;; Store 64-bit little-endian floating-point type (as vector lane).
(rule (lower (store flags @ (littleendian)
                    val @ (value_type $F64) addr offset))
      (side_effect (vec_store_lane_little $F64X2 val
                     (lower_address flags addr offset) 0)))

;; Store 128-bit big-endian vector type.
(rule (lower (store flags @ (bigendian)
                    val @ (value_type (ty_vec128 ty)) addr offset))
      (side_effect (vec_store val (lower_address flags addr offset))))

;; Store 128-bit little-endian vector type (z15 instruction).
(rule (lower (store flags @ (littleendian)
                    val @ (value_type (and (ty_vec128 ty) (vxrs_ext2_enabled))) addr offset))
      (side_effect (vec_storerev val (lower_address flags addr offset))))

;; Store 128-bit little-endian vector type (via GPRs on z14).
(rule (lower (store flags @ (littleendian)
                    val @ (value_type (and (ty_vec128 ty) (vxrs_ext2_disabled))) addr offset))
      (let ((lo_addr MemArg (lower_address_bias flags addr offset 0))
            (hi_addr MemArg (lower_address_bias flags addr offset 8))
            (lo_val Reg (vec_extract_lane $I64X2 val 1 (zero_reg)))
            (hi_val Reg (vec_extract_lane $I64X2 val 0 (zero_reg))))
        (side_effect (side_effect_concat (storerev64 lo_val lo_addr)
                                         (storerev64 hi_val hi_addr)))))
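
;; Note: this mirrors the z14 little-endian vector load above; extracting
;; the two doublewords and storing each through a byte-reversing store
;; again implements a full 16-byte byte reversal.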


;;;; Rules for 8-bit integer stores ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Main `istore8` lowering entry point, dispatching to the helper.
(rule (lower (istore8 flags val addr offset))
      (side_effect (istore8_impl flags val addr offset)))

;; Helper to store 8-bit integer types.
(decl istore8_impl (MemFlags Value Value Offset32) SideEffectNoResult)

;; Store 8-bit integer types, register input.
(rule (istore8_impl flags val addr offset)
      (store8 (put_in_reg val) (lower_address flags addr offset)))

;; Store 8-bit integer types, immediate input.
(rule (istore8_impl flags (u8_from_value imm) addr offset)
      (store8_imm imm (lower_address flags addr offset)))


;;;; Rules for 16-bit integer stores ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Main `istore16` lowering entry point, dispatching to the helper.
(rule (lower (istore16 flags val addr offset))
      (side_effect (istore16_impl flags val addr offset)))

;; Helper to store 16-bit integer types.
(decl istore16_impl (MemFlags Value Value Offset32) SideEffectNoResult)

;; Store 16-bit big-endian integer types, register input.
(rule (istore16_impl flags @ (bigendian) val addr offset)
      (store16 (put_in_reg val) (lower_address flags addr offset)))

;; Store 16-bit little-endian integer types, register input.
(rule (istore16_impl flags @ (littleendian) val addr offset)
      (storerev16 (put_in_reg val) (lower_address flags addr offset)))

;; Store 16-bit big-endian integer types, immediate input.
(rule (istore16_impl flags @ (bigendian) (i16_from_value imm) addr offset)
      (store16_imm imm (lower_address flags addr offset)))

;; Store 16-bit little-endian integer types, immediate input.
(rule (istore16_impl flags @ (littleendian) (i16_from_swapped_value imm) addr offset)
      (store16_imm imm (lower_address flags addr offset)))
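
;; Note: for the little-endian immediate case, `i16_from_swapped_value`
;; (defined elsewhere in this file) presumably matches a constant whose
;; byte-swapped representation fits in an i16, so the already-swapped
;; immediate can be stored with a normal big-endian `store16_imm`.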
|
|
|
|
|
|
;;;; Rules for 32-bit integer stores ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; Main `istore32` lowering entry point, dispatching to the helper.
|
|
(rule (lower (istore32 flags val addr offset))
|
|
(side_effect (istore32_impl flags val addr offset)))
|
|
|
|
;; Helper to store 32-bit integer types.
|
|
(decl istore32_impl (MemFlags Value Value Offset32) SideEffectNoResult)
|
|
|
|
;; Store 32-bit big-endian integer types, register input.
|
|
(rule (istore32_impl flags @ (bigendian) val addr offset)
|
|
(store32 (put_in_reg val) (lower_address flags addr offset)))
|
|
|
|
;; Store 32-bit big-endian integer types, immediate input.
|
|
(rule (istore32_impl flags @ (bigendian) (i16_from_value imm) addr offset)
|
|
(store32_simm16 imm (lower_address flags addr offset)))
|
|
|
|
;; Store 32-bit little-endian integer types.
|
|
(rule (istore32_impl flags @ (littleendian) val addr offset)
|
|
(storerev32 (put_in_reg val) (lower_address flags addr offset)))
|
|
|
|
|
|
;;;; Rules for 64-bit integer stores ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; Helper to store 64-bit integer types.
|
|
(decl istore64_impl (MemFlags Value Value Offset32) SideEffectNoResult)
|
|
|
|
;; Store 64-bit big-endian integer types, register input.
|
|
(rule (istore64_impl flags @ (bigendian) val addr offset)
|
|
(store64 (put_in_reg val) (lower_address flags addr offset)))
|
|
|
|
;; Store 64-bit big-endian integer types, immediate input.
|
|
(rule (istore64_impl flags @ (bigendian) (i16_from_value imm) addr offset)
|
|
(store64_simm16 imm (lower_address flags addr offset)))
|
|
|
|
;; Store 64-bit little-endian integer types.
|
|
(rule (istore64_impl flags @ (littleendian) val addr offset)
|
|
(storerev64 (put_in_reg val) (lower_address flags addr offset)))
|
|
|
|
|
|
;;;; Rules for `atomic_rmw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; Atomic operations that do not require a compare-and-swap loop.

;; Atomic AND for 32/64-bit big-endian types, using a single instruction.
(rule (lower (has_type (ty_32_or_64 ty)
                       (atomic_rmw flags @ (bigendian) (AtomicRmwOp.And) addr src)))
      (atomic_rmw_and ty (put_in_reg src)
                      (lower_address flags addr (zero_offset))))

;; Atomic AND for 32/64-bit little-endian types, using byte-swapped input/output.
(rule (lower (has_type (ty_32_or_64 ty)
                       (atomic_rmw flags @ (littleendian) (AtomicRmwOp.And) addr src)))
      (bswap_reg ty (atomic_rmw_and ty (bswap_reg ty (put_in_reg src))
                                    (lower_address flags addr (zero_offset)))))

;; Atomic OR for 32/64-bit big-endian types, using a single instruction.
(rule (lower (has_type (ty_32_or_64 ty)
                       (atomic_rmw flags @ (bigendian) (AtomicRmwOp.Or) addr src)))
      (atomic_rmw_or ty (put_in_reg src)
                     (lower_address flags addr (zero_offset))))

;; Atomic OR for 32/64-bit little-endian types, using byte-swapped input/output.
(rule (lower (has_type (ty_32_or_64 ty)
                       (atomic_rmw flags @ (littleendian) (AtomicRmwOp.Or) addr src)))
      (bswap_reg ty (atomic_rmw_or ty (bswap_reg ty (put_in_reg src))
                                   (lower_address flags addr (zero_offset)))))

;; Atomic XOR for 32/64-bit big-endian types, using a single instruction.
(rule (lower (has_type (ty_32_or_64 ty)
                       (atomic_rmw flags @ (bigendian) (AtomicRmwOp.Xor) addr src)))
      (atomic_rmw_xor ty (put_in_reg src)
                      (lower_address flags addr (zero_offset))))

;; Atomic XOR for 32/64-bit little-endian types, using byte-swapped input/output.
(rule (lower (has_type (ty_32_or_64 ty)
                       (atomic_rmw flags @ (littleendian) (AtomicRmwOp.Xor) addr src)))
      (bswap_reg ty (atomic_rmw_xor ty (bswap_reg ty (put_in_reg src))
                                    (lower_address flags addr (zero_offset)))))

;; Atomic ADD for 32/64-bit big-endian types, using a single instruction.
(rule (lower (has_type (ty_32_or_64 ty)
                       (atomic_rmw flags @ (bigendian) (AtomicRmwOp.Add) addr src)))
      (atomic_rmw_add ty (put_in_reg src)
                      (lower_address flags addr (zero_offset))))

;; Atomic SUB for 32/64-bit big-endian types, using atomic ADD with negated input.
(rule (lower (has_type (ty_32_or_64 ty)
                       (atomic_rmw flags @ (bigendian) (AtomicRmwOp.Sub) addr src)))
      (atomic_rmw_add ty (neg_reg ty (put_in_reg src))
                      (lower_address flags addr (zero_offset))))


;; Atomic operations that require a compare-and-swap loop.

;; Operations for 32/64-bit types can use a fullword compare-and-swap loop.
(rule (lower (has_type (ty_32_or_64 ty) (atomic_rmw flags op addr src)))
      (let ((src_reg Reg (put_in_reg src))
            (addr_reg Reg (put_in_reg addr))
            ;; Create body of compare-and-swap loop.
            (ib VecMInstBuilder (inst_builder_new))
            (val0 Reg (writable_reg_to_reg (casloop_val_reg)))
            (val1 Reg (atomic_rmw_body ib ty flags op
                                       (casloop_tmp_reg) val0 src_reg)))
        ;; Emit compare-and-swap loop and extract final result.
        (casloop ib ty flags addr_reg val1)))
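
;; A rough sketch of the resulting code, assuming a 32-bit type (the exact
;; sequence is up to the `casloop` helper):
;;
;;       l    %val, 0(%addr)        ; load initial memory value
;;   loop:
;;       <body: compute new value %new from %val and %src>
;;       cs   %val, %new, 0(%addr)  ; try to swap; reloads %val on failure
;;       brc  4, loop               ; condition code 1: swap failed, retry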

;; Operations for 8/16-bit types must operate on the surrounding aligned word.
(rule (lower (has_type (ty_8_or_16 ty) (atomic_rmw flags op addr src)))
      (let ((src_reg Reg (put_in_reg src))
            (addr_reg Reg (put_in_reg addr))
            ;; Prepare access to surrounding aligned word.
            (bitshift Reg (casloop_bitshift addr_reg))
            (aligned_addr Reg (casloop_aligned_addr addr_reg))
            ;; Create body of compare-and-swap loop.
            (ib VecMInstBuilder (inst_builder_new))
            (val0 Reg (writable_reg_to_reg (casloop_val_reg)))
            (val1 Reg (casloop_rotate_in ib ty flags bitshift val0))
            (val2 Reg (atomic_rmw_body ib ty flags op
                                       (casloop_tmp_reg) val1 src_reg))
            (val3 Reg (casloop_rotate_out ib ty flags bitshift val2)))
        ;; Emit compare-and-swap loop and extract final result.
        (casloop_subword ib ty flags aligned_addr bitshift val3)))
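
;; In effect (an assumption about the helpers, which are defined elsewhere),
;; this computes:
;;   aligned_addr = addr & ~3        ; address of the containing aligned word
;;   bitshift     = (addr & 3) * 8   ; bit position of the subword within it
;; so that `casloop_rotate_in`/`casloop_rotate_out` can rotate the loaded word
;; to move the target subword into a fixed position for the loop body and back.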

;; Loop bodies for atomic read-modify-write operations.
(decl atomic_rmw_body (VecMInstBuilder Type MemFlags AtomicRmwOp
                       WritableReg Reg Reg) Reg)

;; Loop bodies for 32-/64-bit atomic XCHG operations.
;; Simply use the source (possibly byte-swapped) as new target value.
(rule (atomic_rmw_body ib (ty_32_or_64 ty) (bigendian)
                       (AtomicRmwOp.Xchg) tmp val src)
      src)
(rule (atomic_rmw_body ib (ty_32_or_64 ty) (littleendian)
                       (AtomicRmwOp.Xchg) tmp val src)
      (bswap_reg ty src))

;; Loop bodies for 32-/64-bit atomic NAND operations.
;; On z15 this can use the NN(G)RK instruction.  On z14, perform an And
;; operation and invert the result.  In the little-endian case, we can
;; simply byte-swap the source operand.
(rule (atomic_rmw_body ib (and (mie2_enabled) (ty_32_or_64 ty)) (bigendian)
                       (AtomicRmwOp.Nand) tmp val src)
      (push_alu_reg ib (aluop_not_and ty) tmp val src))
(rule (atomic_rmw_body ib (and (mie2_enabled) (ty_32_or_64 ty)) (littleendian)
                       (AtomicRmwOp.Nand) tmp val src)
      (push_alu_reg ib (aluop_not_and ty) tmp val (bswap_reg ty src)))
(rule (atomic_rmw_body ib (and (mie2_disabled) (ty_32_or_64 ty)) (bigendian)
                       (AtomicRmwOp.Nand) tmp val src)
      (push_not_reg ib ty tmp
                    (push_alu_reg ib (aluop_and ty) tmp val src)))
(rule (atomic_rmw_body ib (and (mie2_disabled) (ty_32_or_64 ty)) (littleendian)
                       (AtomicRmwOp.Nand) tmp val src)
      (push_not_reg ib ty tmp
                    (push_alu_reg ib (aluop_and ty) tmp val (bswap_reg ty src))))

;; Loop bodies for 8-/16-bit atomic bit operations.
;; These use the "rotate-then-<op>-selected bits" family of instructions.
;; For the Nand operation, we again perform And and invert the result.
(rule (atomic_rmw_body ib (ty_8_or_16 ty) flags (AtomicRmwOp.Xchg) tmp val src)
      (atomic_rmw_body_rxsbg ib ty flags (RxSBGOp.Insert) tmp val src))
(rule (atomic_rmw_body ib (ty_8_or_16 ty) flags (AtomicRmwOp.And) tmp val src)
      (atomic_rmw_body_rxsbg ib ty flags (RxSBGOp.And) tmp val src))
(rule (atomic_rmw_body ib (ty_8_or_16 ty) flags (AtomicRmwOp.Or) tmp val src)
      (atomic_rmw_body_rxsbg ib ty flags (RxSBGOp.Or) tmp val src))
(rule (atomic_rmw_body ib (ty_8_or_16 ty) flags (AtomicRmwOp.Xor) tmp val src)
      (atomic_rmw_body_rxsbg ib ty flags (RxSBGOp.Xor) tmp val src))
(rule (atomic_rmw_body ib (ty_8_or_16 ty) flags (AtomicRmwOp.Nand) tmp val src)
      (atomic_rmw_body_invert ib ty flags tmp
          (atomic_rmw_body_rxsbg ib ty flags (RxSBGOp.And) tmp val src)))

;; RxSBG subword operation.
(decl atomic_rmw_body_rxsbg (VecMInstBuilder Type MemFlags RxSBGOp
                             WritableReg Reg Reg) Reg)
;; 8-bit case: use the low byte of "src" and the high byte of "val".
(rule (atomic_rmw_body_rxsbg ib $I8 _ op tmp val src)
      (push_rxsbg ib op tmp val src 32 40 24))
;; 16-bit big-endian case: use the low two bytes of "src" and the
;; high two bytes of "val".
(rule (atomic_rmw_body_rxsbg ib $I16 (bigendian) op tmp val src)
      (push_rxsbg ib op tmp val src 32 48 16))
;; 16-bit little-endian case: use the low two bytes of "src", byte-swapped
;; so they end up in the high two bytes, and the low two bytes of "val".
(rule (atomic_rmw_body_rxsbg ib $I16 (littleendian) op tmp val src)
      (push_rxsbg ib op tmp val (bswap_reg $I32 src) 48 64 -16))
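
;; (In these `push_rxsbg` calls, bit positions count 0..63 from the MSB, so
;; the 32-bit memory word occupies bits 32..63.  E.g. in the 8-bit case,
;; rotating "src" left by 24 moves its low byte, bits 56..63, into bits
;; 32..39, the high byte of the word where the target byte lives.)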

;; Invert a subword.
(decl atomic_rmw_body_invert (VecMInstBuilder Type MemFlags WritableReg Reg) Reg)
;; 8-bit case: invert the high byte.
(rule (atomic_rmw_body_invert ib $I8 _ tmp val)
      (push_xor_uimm32shifted ib $I32 tmp val (uimm32shifted 0xff000000 0)))
;; 16-bit big-endian case: invert the two high bytes.
(rule (atomic_rmw_body_invert ib $I16 (bigendian) tmp val)
      (push_xor_uimm32shifted ib $I32 tmp val (uimm32shifted 0xffff0000 0)))
;; 16-bit little-endian case: invert the two low bytes.
(rule (atomic_rmw_body_invert ib $I16 (littleendian) tmp val)
      (push_xor_uimm32shifted ib $I32 tmp val (uimm32shifted 0xffff 0)))

;; Loop bodies for atomic ADD/SUB operations.
(rule (atomic_rmw_body ib ty flags (AtomicRmwOp.Add) tmp val src)
      (atomic_rmw_body_addsub ib ty flags (aluop_add (ty_ext32 ty)) tmp val src))
(rule (atomic_rmw_body ib ty flags (AtomicRmwOp.Sub) tmp val src)
      (atomic_rmw_body_addsub ib ty flags (aluop_sub (ty_ext32 ty)) tmp val src))

;; Addition or subtraction operation.
(decl atomic_rmw_body_addsub (VecMInstBuilder Type MemFlags ALUOp
                              WritableReg Reg Reg) Reg)
;; 32/64-bit big-endian case: just a regular add/sub operation.
(rule (atomic_rmw_body_addsub ib (ty_32_or_64 ty) (bigendian) op tmp val src)
      (push_alu_reg ib op tmp val src))
;; 32/64-bit little-endian case: byte-swap the value loaded from memory before
;; and after performing the operation in native endianness.
(rule (atomic_rmw_body_addsub ib (ty_32_or_64 ty) (littleendian) op tmp val src)
      (let ((val_swapped Reg (push_bswap_reg ib ty tmp val))
            (res_swapped Reg (push_alu_reg ib op tmp val_swapped src)))
        (push_bswap_reg ib ty tmp res_swapped)))
;; 8-bit case: perform a 32-bit addition of the source value shifted by 24 bits
;; to the memory value, which contains the target in its high byte.
(rule (atomic_rmw_body_addsub ib $I8 _ op tmp val src)
      (let ((src_shifted Reg (lshl_imm $I32 src 24)))
        (push_alu_reg ib op tmp val src_shifted)))
;; 16-bit big-endian case: similar, just shift the source by 16 bits.
(rule (atomic_rmw_body_addsub ib $I16 (bigendian) op tmp val src)
      (let ((src_shifted Reg (lshl_imm $I32 src 16)))
        (push_alu_reg ib op tmp val src_shifted)))
;; 16-bit little-endian case: the same, but in addition we need to byte-swap
;; the memory value before and after the operation.  Since the value was placed
;; in the low two bytes by our standard rotation, we can use a 32-bit byte-swap
;; and the native-endian value will end up in the high bytes where we need it
;; to perform the operation.
(rule (atomic_rmw_body_addsub ib $I16 (littleendian) op tmp val src)
      (let ((src_shifted Reg (lshl_imm $I32 src 16))
            (val_swapped Reg (push_bswap_reg ib $I32 tmp val))
            (res_swapped Reg (push_alu_reg ib op tmp val_swapped src_shifted)))
        (push_bswap_reg ib $I32 tmp res_swapped)))

;; Loop bodies for atomic MIN/MAX operations.
(rule (atomic_rmw_body ib ty flags (AtomicRmwOp.Smin) tmp val src)
      (atomic_rmw_body_minmax ib ty flags (cmpop_cmps (ty_ext32 ty))
                              (intcc_as_cond (IntCC.SignedLessThan)) tmp val src))
(rule (atomic_rmw_body ib ty flags (AtomicRmwOp.Smax) tmp val src)
      (atomic_rmw_body_minmax ib ty flags (cmpop_cmps (ty_ext32 ty))
                              (intcc_as_cond (IntCC.SignedGreaterThan)) tmp val src))
(rule (atomic_rmw_body ib ty flags (AtomicRmwOp.Umin) tmp val src)
      (atomic_rmw_body_minmax ib ty flags (cmpop_cmpu (ty_ext32 ty))
                              (intcc_as_cond (IntCC.UnsignedLessThan)) tmp val src))
(rule (atomic_rmw_body ib ty flags (AtomicRmwOp.Umax) tmp val src)
      (atomic_rmw_body_minmax ib ty flags (cmpop_cmpu (ty_ext32 ty))
                              (intcc_as_cond (IntCC.UnsignedGreaterThan)) tmp val src))

;; Minimum or maximum operation.
(decl atomic_rmw_body_minmax (VecMInstBuilder Type MemFlags CmpOp Cond
                              WritableReg Reg Reg) Reg)
;; 32/64-bit big-endian case: just a comparison followed by a conditional
;; break out of the loop if the memory value does not need to change.
;; If it does need to change, the new value is simply the source operand.
(rule (atomic_rmw_body_minmax ib (ty_32_or_64 ty) (bigendian)
                              op cond tmp val src)
      (let ((_ Reg (push_break_if ib (cmp_rr op src val) (invert_cond cond))))
        src))
;; 32/64-bit little-endian case: similar, but we need to byte-swap the
;; memory value before the comparison.  If we need to store the new value,
;; it also needs to be byte-swapped.
(rule (atomic_rmw_body_minmax ib (ty_32_or_64 ty) (littleendian)
                              op cond tmp val src)
      (let ((val_swapped Reg (push_bswap_reg ib ty tmp val))
            (_ Reg (push_break_if ib (cmp_rr op src val_swapped)
                                  (invert_cond cond))))
        (push_bswap_reg ib ty tmp src)))
;; 8-bit case: compare the memory value (which contains the target in the
;; high byte) with the source operand shifted by 24 bits.  Note that in
;; the case where the high bytes are equal, the comparison may succeed
;; or fail depending on the unrelated low bits of the memory value, and
;; so we either may or may not perform the update.  But it would be an
;; update with the same value in any case, so this does not matter.
(rule (atomic_rmw_body_minmax ib $I8 _ op cond tmp val src)
      (let ((src_shifted Reg (lshl_imm $I32 src 24))
            (_ Reg (push_break_if ib (cmp_rr op src_shifted val)
                                  (invert_cond cond))))
        (push_rxsbg ib (RxSBGOp.Insert) tmp val src_shifted 32 40 0)))
;; 16-bit big-endian case: similar, just shift the source by 16 bits.
(rule (atomic_rmw_body_minmax ib $I16 (bigendian) op cond tmp val src)
      (let ((src_shifted Reg (lshl_imm $I32 src 16))
            (_ Reg (push_break_if ib (cmp_rr op src_shifted val)
                                  (invert_cond cond))))
        (push_rxsbg ib (RxSBGOp.Insert) tmp val src_shifted 32 48 0)))
;; 16-bit little-endian case: similar, but in addition byte-swap the
;; memory value before and after the operation, like for _addsub_.
(rule (atomic_rmw_body_minmax ib $I16 (littleendian) op cond tmp val src)
      (let ((src_shifted Reg (lshl_imm $I32 src 16))
            (val_swapped Reg (push_bswap_reg ib $I32 tmp val))
            (_ Reg (push_break_if ib (cmp_rr op src_shifted val_swapped)
                                  (invert_cond cond)))
            (res_swapped Reg (push_rxsbg ib (RxSBGOp.Insert)
                                         tmp val_swapped src_shifted 32 48 0)))
        (push_bswap_reg ib $I32 tmp res_swapped)))


;;;; Rules for `atomic_cas` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 32/64-bit big-endian atomic compare-and-swap instruction.
(rule (lower (has_type (ty_32_or_64 ty)
                       (atomic_cas flags @ (bigendian) addr src1 src2)))
      (atomic_cas_impl ty (put_in_reg src1) (put_in_reg src2)
                       (lower_address flags addr (zero_offset))))

;; 32/64-bit little-endian atomic compare-and-swap instruction.
;; Implemented by byte-swapping old/new inputs and the output.
(rule (lower (has_type (ty_32_or_64 ty)
                       (atomic_cas flags @ (littleendian) addr src1 src2)))
      (bswap_reg ty (atomic_cas_impl ty (bswap_reg ty (put_in_reg src1))
                                     (bswap_reg ty (put_in_reg src2))
                                     (lower_address flags addr (zero_offset)))))

;; 8/16-bit atomic compare-and-swap implemented via loop.
(rule (lower (has_type (ty_8_or_16 ty) (atomic_cas flags addr src1 src2)))
      (let ((src1_reg Reg (put_in_reg src1))
            (src2_reg Reg (put_in_reg src2))
            (addr_reg Reg (put_in_reg addr))
            ;; Prepare access to the surrounding aligned word.
            (bitshift Reg (casloop_bitshift addr_reg))
            (aligned_addr Reg (casloop_aligned_addr addr_reg))
            ;; Create body of compare-and-swap loop.
            (ib VecMInstBuilder (inst_builder_new))
            (val0 Reg (writable_reg_to_reg (casloop_val_reg)))
            (val1 Reg (casloop_rotate_in ib ty flags bitshift val0))
            (val2 Reg (atomic_cas_body ib ty flags
                                       (casloop_tmp_reg) val1 src1_reg src2_reg))
            (val3 Reg (casloop_rotate_out ib ty flags bitshift val2)))
        ;; Emit compare-and-swap loop and extract final result.
        (casloop_subword ib ty flags aligned_addr bitshift val3)))

;; Emit loop body instructions to perform a subword compare-and-swap.
(decl atomic_cas_body (VecMInstBuilder Type MemFlags
                       WritableReg Reg Reg Reg) Reg)

;; 8-bit case: "val" contains the value loaded from memory in the high byte.
;; Compare with the comparison value in the low byte of "src1".  If unequal,
;; break out of the loop, otherwise replace the target byte in "val" with
;; the low byte of "src2".
(rule (atomic_cas_body ib $I8 _ tmp val src1 src2)
      (let ((_ Reg (push_break_if ib (rxsbg_test (RxSBGOp.Xor) val src1 32 40 24)
                                  (intcc_as_cond (IntCC.NotEqual)))))
        (push_rxsbg ib (RxSBGOp.Insert) tmp val src2 32 40 24)))

;; 16-bit big-endian case: Same as above, except with values in the high
;; two bytes of "val" and low two bytes of "src1" and "src2".
(rule (atomic_cas_body ib $I16 (bigendian) tmp val src1 src2)
      (let ((_ Reg (push_break_if ib (rxsbg_test (RxSBGOp.Xor) val src1 32 48 16)
                                  (intcc_as_cond (IntCC.NotEqual)))))
        (push_rxsbg ib (RxSBGOp.Insert) tmp val src2 32 48 16)))

;; 16-bit little-endian case: "val" here contains a little-endian value in the
;; *low* two bytes.  "src1" and "src2" contain native (i.e. big-endian) values
;; in their low two bytes.  Perform the operation in little-endian mode by
;; byte-swapping "src1" and "src2" ahead of the loop.  Note that this is a
;; 32-bit operation, so the little-endian 16-bit values end up in the *high*
;; two bytes of the swapped values.
(rule (atomic_cas_body ib $I16 (littleendian) tmp val src1 src2)
      (let ((src1_swapped Reg (bswap_reg $I32 src1))
            (src2_swapped Reg (bswap_reg $I32 src2))
            (_ Reg (push_break_if ib
                       (rxsbg_test (RxSBGOp.Xor) val src1_swapped 48 64 -16)
                       (intcc_as_cond (IntCC.NotEqual)))))
        (push_rxsbg ib (RxSBGOp.Insert) tmp val src2_swapped 48 64 -16)))
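
;; (The -16 rotate amount rotates the swapped sources right by 16 bits,
;; moving their high two bytes, bits 32..47, down into bits 48..63, where
;; the little-endian memory value sits.)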


;;;; Rules for `atomic_load` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Atomic loads can be implemented via regular loads on this platform.

;; 8-bit atomic load.
(rule (lower (has_type $I8 (atomic_load flags addr)))
      (zext32_mem $I8 (lower_address flags addr (zero_offset))))

;; 16-bit big-endian atomic load.
(rule (lower (has_type $I16 (atomic_load flags @ (bigendian) addr)))
      (zext32_mem $I16 (lower_address flags addr (zero_offset))))

;; 16-bit little-endian atomic load.
(rule (lower (has_type $I16 (atomic_load flags @ (littleendian) addr)))
      (loadrev16 (lower_address flags addr (zero_offset))))

;; 32-bit big-endian atomic load.
(rule (lower (has_type $I32 (atomic_load flags @ (bigendian) addr)))
      (load32 (lower_address flags addr (zero_offset))))

;; 32-bit little-endian atomic load.
(rule (lower (has_type $I32 (atomic_load flags @ (littleendian) addr)))
      (loadrev32 (lower_address flags addr (zero_offset))))

;; 64-bit big-endian atomic load.
(rule (lower (has_type $I64 (atomic_load flags @ (bigendian) addr)))
      (load64 (lower_address flags addr (zero_offset))))

;; 64-bit little-endian atomic load.
(rule (lower (has_type $I64 (atomic_load flags @ (littleendian) addr)))
      (loadrev64 (lower_address flags addr (zero_offset))))


;;;; Rules for `atomic_store` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Atomic stores can be implemented via regular stores followed by a fence.
(decl atomic_store_impl (SideEffectNoResult) InstOutput)
(rule (atomic_store_impl store)
      (let ((_ InstOutput (side_effect store)))
        (side_effect (fence_impl))))
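
;; (The trailing fence is what provides sequential consistency; on
;; z/Architecture this is presumably a serializing `bcr 14, 0` or `bcr 15, 0`,
;; depending on the availability of the fast-BCR-serialization facility.)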

;; 8-bit atomic store.
(rule (lower (atomic_store flags val @ (value_type $I8) addr))
      (atomic_store_impl (istore8_impl flags val addr (zero_offset))))

;; 16-bit atomic store.
(rule (lower (atomic_store flags val @ (value_type $I16) addr))
      (atomic_store_impl (istore16_impl flags val addr (zero_offset))))

;; 32-bit atomic store.
(rule (lower (atomic_store flags val @ (value_type $I32) addr))
      (atomic_store_impl (istore32_impl flags val addr (zero_offset))))

;; 64-bit atomic store.
(rule (lower (atomic_store flags val @ (value_type $I64) addr))
      (atomic_store_impl (istore64_impl flags val addr (zero_offset))))


;;;; Rules for `fence` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Fence to ensure sequential consistency.
(rule (lower (fence))
      (side_effect (fence_impl)))


;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; We want to optimize the typical use of `icmp` (generating an integer 0/1
;; result) followed by some user, like a `select` or a conditional branch.
;; Instead of first generating the integer result and later testing it again,
;; we want to sink the comparison to be performed at the site of use.
;;
;; To enable this, we provide generic helpers that return a `ProducesBool`
;; encapsulating the comparison in question, which can be used in all the
;; above scenarios.
;;
;; N.B. There are specific considerations when sinking a memory load into a
;; comparison.  When emitting an `icmp` directly, this can of course be done
;; as usual.  However, when we use the `ProducesBool` elsewhere, we need to
;; consider *three* instructions: the load, the `icmp`, and the final user
;; (e.g. a conditional branch).  The only way to safely sink the load would
;; be to sink it directly into the final user, which is only possible if there
;; is no *other* user of the `icmp` result.  This is not currently being
;; verified by the `SinkableInst` logic, so to be safe we do not perform this
;; optimization at all.
;;
;; The generic `icmp_val` helper therefore has a flag indicating whether
;; it is being invoked in a context where it is safe to sink memory loads
;; (e.g. when directly emitting an `icmp`), or whether it is not (e.g. when
;; sinking the `icmp` result into a conditional branch or select).

;; Main `icmp` entry point.  Generate a `ProducesBool` capturing the
;; integer comparison and immediately lower it to a 0/1 integer result.
;; In this case, it is safe to sink memory loads.
(rule (lower (has_type (fits_in_64 ty) (icmp int_cc x y)))
      (lower_bool ty (icmp_val $true int_cc x y)))


;; Return a `ProducesBool` to implement any integer comparison.
;; The first argument is a flag to indicate whether it is safe to sink
;; memory loads as discussed above.
(decl icmp_val (bool IntCC Value Value) ProducesBool)

;; Dispatch for signed comparisons.
(rule (icmp_val allow_mem int_cc @ (signed) x y)
      (bool (icmps_val allow_mem x y) (intcc_as_cond int_cc)))
;; Dispatch for unsigned comparisons.
(rule (icmp_val allow_mem int_cc @ (unsigned) x y)
      (bool (icmpu_val allow_mem x y) (intcc_as_cond int_cc)))


;; Return a `ProducesFlags` to implement signed integer comparisons.
(decl icmps_val (bool Value Value) ProducesFlags)

;; Compare (signed) two registers.
(rule (icmps_val _ x @ (value_type (fits_in_64 ty)) y)
      (icmps_reg (ty_ext32 ty) (put_in_reg_sext32 x) (put_in_reg_sext32 y)))

;; Compare (signed) a register and a sign-extended register.
(rule (icmps_val _ x @ (value_type (fits_in_64 ty)) (sext32_value y))
      (icmps_reg_sext32 ty x y))

;; Compare (signed) a register and an immediate.
(rule (icmps_val _ x @ (value_type (fits_in_64 ty)) (i16_from_value y))
      (icmps_simm16 (ty_ext32 ty) (put_in_reg_sext32 x) y))
(rule (icmps_val _ x @ (value_type (fits_in_64 ty)) (i32_from_value y))
      (icmps_simm32 (ty_ext32 ty) (put_in_reg_sext32 x) y))

;; Compare (signed) a register and memory (32/64-bit types).
(rule (icmps_val $true x @ (value_type (fits_in_64 ty)) (sinkable_load_32_64 y))
      (icmps_mem ty x (sink_load y)))

;; Compare (signed) a register and memory (16-bit types).
(rule (icmps_val $true x @ (value_type (fits_in_64 ty)) (sinkable_load_16 y))
      (icmps_mem_sext16 (ty_ext32 ty) (put_in_reg_sext32 x) (sink_load y)))

;; Compare (signed) a register and sign-extended memory.
(rule (icmps_val $true x @ (value_type (fits_in_64 ty)) (sinkable_sload16 y))
      (icmps_mem_sext16 ty x (sink_sload16 y)))
(rule (icmps_val $true x @ (value_type (fits_in_64 ty)) (sinkable_sload32 y))
      (icmps_mem_sext32 ty x (sink_sload32 y)))


;; Return a `ProducesFlags` to implement unsigned integer comparisons.
(decl icmpu_val (bool Value Value) ProducesFlags)

;; Compare (unsigned) two registers.
(rule (icmpu_val _ x @ (value_type (fits_in_64 ty)) y)
      (icmpu_reg (ty_ext32 ty) (put_in_reg_zext32 x) (put_in_reg_zext32 y)))

;; Compare (unsigned) a register and a zero-extended register.
(rule (icmpu_val _ x @ (value_type (fits_in_64 ty)) (zext32_value y))
      (icmpu_reg_zext32 ty x y))

;; Compare (unsigned) a register and an immediate.
(rule (icmpu_val _ x @ (value_type (fits_in_64 ty)) (u32_from_value y))
      (icmpu_uimm32 (ty_ext32 ty) (put_in_reg_zext32 x) y))

;; Compare (unsigned) a register and memory (32/64-bit types).
(rule (icmpu_val $true x @ (value_type (fits_in_64 ty)) (sinkable_load_32_64 y))
      (icmpu_mem ty x (sink_load y)))

;; Compare (unsigned) a register and memory (16-bit types).
;; Note that the ISA only provides instructions with a PC-relative memory
;; address here, so we need to check whether the sinkable load matches this.
(rule (icmpu_val $true x @ (value_type (fits_in_64 ty))
                 (sinkable_load_16 ld))
      (if-let y (load_sym ld))
      (icmpu_mem_zext16 (ty_ext32 ty) (put_in_reg_zext32 x) (sink_load y)))

;; Compare (unsigned) a register and zero-extended memory.
;; Note that the ISA only provides instructions with a PC-relative memory
;; address here, so we need to check whether the sinkable load matches this.
(rule (icmpu_val $true x @ (value_type (fits_in_64 ty))
                 (sinkable_uload16 ld))
      (if-let y (uload16_sym ld))
      (icmpu_mem_zext16 ty x (sink_uload16 y)))
(rule (icmpu_val $true x @ (value_type (fits_in_64 ty)) (sinkable_uload32 y))
      (icmpu_mem_zext32 ty x (sink_uload32 y)))

;; Vector `icmp` produces a boolean vector.
;; We need to handle the various IntCC flags separately here.

(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.Equal) x y)))
      (vec_cmpeq ty x y))
(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.NotEqual) x y)))
      (vec_not ty (vec_cmpeq ty x y)))
(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.SignedGreaterThan) x y)))
      (vec_cmph ty x y))
(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.SignedLessThanOrEqual) x y)))
      (vec_not ty (vec_cmph ty x y)))
(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.SignedLessThan) x y)))
      (vec_cmph ty y x))
(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.SignedGreaterThanOrEqual) x y)))
      (vec_not ty (vec_cmph ty y x)))
(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.UnsignedGreaterThan) x y)))
      (vec_cmphl ty x y))
(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.UnsignedLessThanOrEqual) x y)))
      (vec_not ty (vec_cmphl ty x y)))
(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.UnsignedLessThan) x y)))
      (vec_cmphl ty y x))
(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.UnsignedGreaterThanOrEqual) x y)))
      (vec_not ty (vec_cmphl ty y x)))


;;;; Rules for `fcmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Main `fcmp` entry point.  Generate a `ProducesBool` capturing the
;; floating-point comparison and immediately lower it to a 0/1 integer result.
(rule (lower (has_type (fits_in_64 ty) (fcmp float_cc x y)))
      (lower_bool ty (fcmp_val float_cc x y)))

;; Return a `ProducesBool` to implement any floating-point comparison.
(decl fcmp_val (FloatCC Value Value) ProducesBool)
(rule (fcmp_val float_cc x @ (value_type ty) y)
      (bool (fcmp_reg ty x y)
            (floatcc_as_cond float_cc)))

;; Vector `fcmp` produces a boolean vector.
;; We need to handle the various FloatCC flags separately here.

(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.Equal) x y)))
      (vec_fcmpeq ty x y))
(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.NotEqual) x y)))
      (vec_not ty (vec_fcmpeq ty x y)))
(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.GreaterThan) x y)))
      (vec_fcmph ty x y))
(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.UnorderedOrLessThanOrEqual) x y)))
      (vec_not ty (vec_fcmph ty x y)))
(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.GreaterThanOrEqual) x y)))
      (vec_fcmphe ty x y))
(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.UnorderedOrLessThan) x y)))
      (vec_not ty (vec_fcmphe ty x y)))
(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.LessThan) x y)))
      (vec_fcmph ty y x))
(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) x y)))
      (vec_not ty (vec_fcmph ty y x)))
(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.LessThanOrEqual) x y)))
      (vec_fcmphe ty y x))
(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.UnorderedOrGreaterThan) x y)))
      (vec_not ty (vec_fcmphe ty y x)))
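
;; The ordered/unordered and (not-)equal combinations have no single vector
;; compare, but follow from two compares: x and y are ordered iff at least one
;; of x >= y or y >= x holds (both are false if either input is a NaN), and
;; ordered-not-equal iff x > y or y > x.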
(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.Ordered) x y)))
      (vec_or ty (vec_fcmphe ty x y) (vec_fcmphe ty y x)))
(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.Unordered) x y)))
      (vec_not_or ty (vec_fcmphe ty x y) (vec_fcmphe ty y x)))
(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.OrderedNotEqual) x y)))
      (vec_or ty (vec_fcmph ty x y) (vec_fcmph ty y x)))
(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.UnorderedOrEqual) x y)))
      (vec_not_or ty (vec_fcmph ty x y) (vec_fcmph ty y x)))


;;;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Main `vall_true` entry point.  Generate a `ProducesBool` capturing the
;; comparison and immediately lower it to a 0/1 integer result.
(rule (lower (has_type (fits_in_64 ty) (vall_true x)))
      (lower_bool ty (vall_true_val x)))

;; Return a `ProducesBool` to implement `vall_true`.
(decl vall_true_val (Value) ProducesBool)
(rule (vall_true_val x @ (value_type ty))
      (bool (vec_cmpeqs ty x (vec_imm ty 0))
            (floatcc_as_cond (FloatCC.Unordered))))
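
;; (The `vec_...s` compare variants also set the condition code: 0 if the
;; comparison holds in all lanes, 3 if it holds in none, 1 for a mix.  "All
;; lanes true" means *no* lane equals zero, i.e. CC 3, which is the mask that
;; `FloatCC.Unordered` selects here.  The short-circuit rules below pick
;; `FloatCC.Equal` (CC 0) or `FloatCC.Unordered` (CC 3) on the same principle.)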

;; Short-circuit `vall_true` on the result of an `icmp`.
(rule (vall_true_val (has_type ty (icmp (IntCC.Equal) x y)))
      (bool (vec_cmpeqs ty x y)
            (floatcc_as_cond (FloatCC.Equal))))
(rule (vall_true_val (has_type ty (icmp (IntCC.NotEqual) x y)))
      (bool (vec_cmpeqs ty x y)
            (floatcc_as_cond (FloatCC.Unordered))))
(rule (vall_true_val (has_type ty (icmp (IntCC.SignedGreaterThan) x y)))
      (bool (vec_cmphs ty x y)
            (floatcc_as_cond (FloatCC.Equal))))
(rule (vall_true_val (has_type ty (icmp (IntCC.SignedLessThanOrEqual) x y)))
      (bool (vec_cmphs ty x y)
            (floatcc_as_cond (FloatCC.Unordered))))
(rule (vall_true_val (has_type ty (icmp (IntCC.SignedLessThan) x y)))
      (bool (vec_cmphs ty y x)
            (floatcc_as_cond (FloatCC.Equal))))
(rule (vall_true_val (has_type ty (icmp (IntCC.SignedGreaterThanOrEqual) x y)))
      (bool (vec_cmphs ty y x)
            (floatcc_as_cond (FloatCC.Unordered))))
(rule (vall_true_val (has_type ty (icmp (IntCC.UnsignedGreaterThan) x y)))
      (bool (vec_cmphls ty x y)
            (floatcc_as_cond (FloatCC.Equal))))
(rule (vall_true_val (has_type ty (icmp (IntCC.UnsignedLessThanOrEqual) x y)))
      (bool (vec_cmphls ty x y)
            (floatcc_as_cond (FloatCC.Unordered))))
(rule (vall_true_val (has_type ty (icmp (IntCC.UnsignedLessThan) x y)))
      (bool (vec_cmphls ty y x)
            (floatcc_as_cond (FloatCC.Equal))))
(rule (vall_true_val (has_type ty (icmp (IntCC.UnsignedGreaterThanOrEqual) x y)))
      (bool (vec_cmphls ty y x)
            (floatcc_as_cond (FloatCC.Unordered))))

;; Short-circuit `vall_true` on the result of an `fcmp` where possible.
(rule (vall_true_val (has_type ty (fcmp (FloatCC.Equal) x y)))
      (bool (vec_fcmpeqs ty x y)
            (floatcc_as_cond (FloatCC.Equal))))
(rule (vall_true_val (has_type ty (fcmp (FloatCC.NotEqual) x y)))
      (bool (vec_fcmpeqs ty x y)
            (floatcc_as_cond (FloatCC.Unordered))))
(rule (vall_true_val (has_type ty (fcmp (FloatCC.GreaterThan) x y)))
      (bool (vec_fcmphs ty x y)
            (floatcc_as_cond (FloatCC.Equal))))
(rule (vall_true_val (has_type ty (fcmp (FloatCC.UnorderedOrLessThanOrEqual) x y)))
      (bool (vec_fcmphs ty x y)
            (floatcc_as_cond (FloatCC.Unordered))))
(rule (vall_true_val (has_type ty (fcmp (FloatCC.GreaterThanOrEqual) x y)))
      (bool (vec_fcmphes ty x y)
            (floatcc_as_cond (FloatCC.Equal))))
(rule (vall_true_val (has_type ty (fcmp (FloatCC.UnorderedOrLessThan) x y)))
      (bool (vec_fcmphes ty x y)
            (floatcc_as_cond (FloatCC.Unordered))))
(rule (vall_true_val (has_type ty (fcmp (FloatCC.LessThan) x y)))
      (bool (vec_fcmphs ty y x)
            (floatcc_as_cond (FloatCC.Equal))))
(rule (vall_true_val (has_type ty (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) x y)))
      (bool (vec_fcmphs ty y x)
            (floatcc_as_cond (FloatCC.Unordered))))
(rule (vall_true_val (has_type ty (fcmp (FloatCC.LessThanOrEqual) x y)))
      (bool (vec_fcmphes ty y x)
            (floatcc_as_cond (FloatCC.Equal))))
(rule (vall_true_val (has_type ty (fcmp (FloatCC.UnorderedOrGreaterThan) x y)))
      (bool (vec_fcmphes ty y x)
            (floatcc_as_cond (FloatCC.Unordered))))


;;;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Main `vany_true` entry point.  Generate a `ProducesBool` capturing the
;; comparison and immediately lower it to a 0/1 integer result.
(rule (lower (has_type (fits_in_64 ty) (vany_true x)))
      (lower_bool ty (vany_true_val x)))

;; Return a `ProducesBool` to implement `vany_true`.
(decl vany_true_val (Value) ProducesBool)
(rule (vany_true_val x @ (value_type ty))
      (bool (vec_cmpeqs ty x (vec_imm ty 0))
            (floatcc_as_cond (FloatCC.NotEqual))))

;; Short-circuit `vany_true` on the result of an `icmp`.
(rule (vany_true_val (has_type ty (icmp (IntCC.Equal) x y)))
      (bool (vec_cmpeqs ty x y)
            (floatcc_as_cond (FloatCC.Ordered))))
(rule (vany_true_val (has_type ty (icmp (IntCC.NotEqual) x y)))
      (bool (vec_cmpeqs ty x y)
            (floatcc_as_cond (FloatCC.NotEqual))))
(rule (vany_true_val (has_type ty (icmp (IntCC.SignedGreaterThan) x y)))
      (bool (vec_cmphs ty x y)
            (floatcc_as_cond (FloatCC.Ordered))))
(rule (vany_true_val (has_type ty (icmp (IntCC.SignedLessThanOrEqual) x y)))
      (bool (vec_cmphs ty x y)
            (floatcc_as_cond (FloatCC.NotEqual))))
(rule (vany_true_val (has_type ty (icmp (IntCC.SignedLessThan) x y)))
      (bool (vec_cmphs ty y x)
            (floatcc_as_cond (FloatCC.Ordered))))
(rule (vany_true_val (has_type ty (icmp (IntCC.SignedGreaterThanOrEqual) x y)))
      (bool (vec_cmphs ty y x)
            (floatcc_as_cond (FloatCC.NotEqual))))
(rule (vany_true_val (has_type ty (icmp (IntCC.UnsignedGreaterThan) x y)))
      (bool (vec_cmphls ty x y)
            (floatcc_as_cond (FloatCC.Ordered))))
(rule (vany_true_val (has_type ty (icmp (IntCC.UnsignedLessThanOrEqual) x y)))
      (bool (vec_cmphls ty x y)
            (floatcc_as_cond (FloatCC.NotEqual))))
(rule (vany_true_val (has_type ty (icmp (IntCC.UnsignedLessThan) x y)))
      (bool (vec_cmphls ty y x)
            (floatcc_as_cond (FloatCC.Ordered))))
(rule (vany_true_val (has_type ty (icmp (IntCC.UnsignedGreaterThanOrEqual) x y)))
      (bool (vec_cmphls ty y x)
            (floatcc_as_cond (FloatCC.NotEqual))))

;; Short-circuit `vany_true` on the result of an `fcmp` where possible.
(rule (vany_true_val (has_type ty (fcmp (FloatCC.Equal) x y)))
      (bool (vec_fcmpeqs ty x y)
            (floatcc_as_cond (FloatCC.Ordered))))
(rule (vany_true_val (has_type ty (fcmp (FloatCC.NotEqual) x y)))
      (bool (vec_fcmpeqs ty x y)
            (floatcc_as_cond (FloatCC.NotEqual))))
(rule (vany_true_val (has_type ty (fcmp (FloatCC.GreaterThan) x y)))
      (bool (vec_fcmphs ty x y)
            (floatcc_as_cond (FloatCC.Ordered))))
(rule (vany_true_val (has_type ty (fcmp (FloatCC.UnorderedOrLessThanOrEqual) x y)))
      (bool (vec_fcmphs ty x y)
            (floatcc_as_cond (FloatCC.NotEqual))))
(rule (vany_true_val (has_type ty (fcmp (FloatCC.GreaterThanOrEqual) x y)))
      (bool (vec_fcmphes ty x y)
            (floatcc_as_cond (FloatCC.Ordered))))
(rule (vany_true_val (has_type ty (fcmp (FloatCC.UnorderedOrLessThan) x y)))
      (bool (vec_fcmphes ty x y)
            (floatcc_as_cond (FloatCC.NotEqual))))
(rule (vany_true_val (has_type ty (fcmp (FloatCC.LessThan) x y)))
      (bool (vec_fcmphs ty y x)
            (floatcc_as_cond (FloatCC.Ordered))))
(rule (vany_true_val (has_type ty (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) x y)))
      (bool (vec_fcmphs ty y x)
            (floatcc_as_cond (FloatCC.NotEqual))))
(rule (vany_true_val (has_type ty (fcmp (FloatCC.LessThanOrEqual) x y)))
      (bool (vec_fcmphes ty y x)
            (floatcc_as_cond (FloatCC.Ordered))))
(rule (vany_true_val (has_type ty (fcmp (FloatCC.UnorderedOrGreaterThan) x y)))
      (bool (vec_fcmphes ty y x)
            (floatcc_as_cond (FloatCC.NotEqual))))


;;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (vhigh_bits x @ (value_type (multi_lane 8 16))))
      (let ((mask Reg (vec_imm $I8X16 (imm8x16 0 8 16 24 32 40 48 56
                                              64 72 80 88 96 104 112 120))))
        (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))
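
;; (Each byte of the permute mask is a bit index into `x`: indices 0, 8, 16,
;; ... select the most-significant bit of each byte lane, so the bit-permute
;; gathers all the lane sign bits into one doubleword.  An out-of-range index
;; of 128 contributes a zero bit, which is how the wider-lane masks below pad
;; their results.)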

(rule (lower (vhigh_bits x @ (value_type (multi_lane 16 8))))
      (let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128
                                              0 16 32 48 64 80 96 112))))
        (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))

(rule (lower (vhigh_bits x @ (value_type (multi_lane 32 4))))
      (let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128
                                              128 128 128 128 0 32 64 96))))
        (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))

(rule (lower (vhigh_bits x @ (value_type (multi_lane 64 2))))
      (let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128
                                              128 128 128 128 128 128 0 64))))
        (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg))))


;;;; Rules for `is_null` and `is_invalid` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Null references are represented by the constant value 0.
(rule (lower (has_type $B1 (is_null x @ (value_type $R64))))
      (lower_bool $B1 (bool (icmps_simm16 $I64 x 0)
                            (intcc_as_cond (IntCC.Equal)))))

;; Invalid references are represented by the constant value -1.
(rule (lower (has_type $B1 (is_invalid x @ (value_type $R64))))
      (lower_bool $B1 (bool (icmps_simm16 $I64 x -1)
                            (intcc_as_cond (IntCC.Equal)))))


;;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Return a `ProducesBool` to capture the fact that the input value is nonzero.
;; In the common case where that input is the result of an `icmp` or `fcmp`
;; instruction (possibly via an intermediate `bint`), directly use that compare.
;; Note that it is not safe to sink memory loads here; see the `icmp` comment.
(decl value_nonzero (Value) ProducesBool)
(rule (value_nonzero (bint val)) (value_nonzero val))
(rule (value_nonzero (icmp int_cc x y)) (icmp_val $false int_cc x y))
(rule (value_nonzero (fcmp float_cc x y)) (fcmp_val float_cc x y))
(rule (value_nonzero val @ (value_type (gpr32_ty ty)))
      (bool (icmps_simm16 $I32 (put_in_reg_sext32 val) 0)
            (intcc_as_cond (IntCC.NotEqual))))
(rule (value_nonzero val @ (value_type (gpr64_ty ty)))
      (bool (icmps_simm16 $I64 (put_in_reg val) 0)
            (intcc_as_cond (IntCC.NotEqual))))

;; Main `select` entry point.  Lower the `value_nonzero` result.
(rule (lower (has_type ty (select val_cond val_true val_false)))
      (select_bool_reg ty (value_nonzero val_cond)
                       (put_in_reg val_true) (put_in_reg val_false)))


;;;; Rules for `selectif_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; We do not support the `iflags` mechanism on our platform.  However, common
;; code will unconditionally emit certain patterns using `iflags`, which we
;; need to handle somehow.  Note that only those specific patterns are
;; recognized by the code below; other uses will fail to lower.

(rule (lower (has_type ty (selectif_spectre_guard int_cc
                                                  (ifcmp x y) val_true val_false)))
      (select_bool_reg ty (icmp_val $false int_cc x y)
                       (put_in_reg val_true) (put_in_reg val_false)))


;;;; Rules for `jump` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Unconditional branch.  The target is found as the first (and only) element
;; in the list of the current block's branch targets passed as `targets`.
(rule (lower_branch (jump _ _) targets)
      (side_effect (jump_impl (vec_element targets 0))))


;;;; Rules for `br_table` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Jump table.  `targets` contains the default target followed by the
;; list of branch targets per index value.
(rule (lower_branch (br_table val_idx _ _) targets)
      (let ((idx Reg (put_in_reg_zext64 val_idx))
            ;; Bounds-check the index and branch to the default target.
            ;; This is an internal branch that is not a terminator insn.
            ;; Instead, the default target is listed as a potential target
            ;; in the final JTSequence, which is the block terminator.
            (cond ProducesBool
                  (bool (icmpu_uimm32 $I64 idx (vec_length_minus1 targets))
                        (intcc_as_cond (IntCC.UnsignedGreaterThanOrEqual))))
            (_ InstOutput (side_effect (oneway_cond_br_bool cond
                                           (vec_element targets 0)))))
        ;; Scale the index by the element size, and then emit the
        ;; compound instruction that does:
        ;;
        ;;   larl %r1, <jt-base>
        ;;   agf  %r1, 0(%r1, %rScaledIndex)
        ;;   br   %r1
        ;;   [jt entries]
        ;;
        ;; This must be *one* instruction in the vcode because
        ;; we cannot allow regalloc to insert any spills/fills
        ;; in the middle of the sequence; otherwise, the LARL's
        ;; PC-rel offset to the jumptable would be incorrect.
        ;; (The alternative is to introduce a relocation pass
        ;; for inlined jumptables, which is much worse, IMHO.)
        (side_effect (jt_sequence (lshl_imm $I64 idx 2) targets))))


;;;; Rules for `brz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Two-way conditional branch on zero.  `targets` contains:
;; - element 0: target if the condition is true (i.e. value is zero)
;; - element 1: target if the condition is false (i.e. value is nonzero)
(rule (lower_branch (brz val_cond _ _) targets)
      (side_effect (cond_br_bool (invert_bool (value_nonzero val_cond))
                                 (vec_element targets 0)
                                 (vec_element targets 1))))


;;;; Rules for `brnz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Two-way conditional branch on nonzero.  `targets` contains:
;; - element 0: target if the condition is true (i.e. value is nonzero)
;; - element 1: target if the condition is false (i.e. value is zero)
(rule (lower_branch (brnz val_cond _ _) targets)
      (side_effect (cond_br_bool (value_nonzero val_cond)
                                 (vec_element targets 0)
                                 (vec_element targets 1))))


;;;; Rules for `brif` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Similarly to `selectif_spectre_guard`, we only recognize specific patterns
;; generated by common code here.  Others will fail to lower.

(rule (lower_branch (brif int_cc (ifcmp x y) _ _) targets)
      (side_effect (cond_br_bool (icmp_val $false int_cc x y)
                                 (vec_element targets 0)
                                 (vec_element targets 1))))


;;;; Rules for `trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (trap trap_code))
      (side_effect (trap_impl trap_code)))


;;;; Rules for `resumable_trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (resumable_trap trap_code))
      (side_effect (trap_impl trap_code)))


;;;; Rules for `trapz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (trapz val trap_code))
      (side_effect (trap_if_bool (invert_bool (value_nonzero val)) trap_code)))


;;;; Rules for `trapnz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (trapnz val trap_code))
      (side_effect (trap_if_bool (value_nonzero val) trap_code)))


;;;; Rules for `resumable_trapnz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (resumable_trapnz val trap_code))
      (side_effect (trap_if_bool (value_nonzero val) trap_code)))


;;;; Rules for `debugtrap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (debugtrap))
      (side_effect (debugtrap_impl)))


;;;; Rules for `trapif` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Similarly to `selectif_spectre_guard`, we only recognize specific patterns
;; generated by common code here.  Others will fail to lower.

;; Recognize the case of `ifcmp` feeding into `trapif`.  Directly generate
;; the desired comparison here; there is no separate `ifcmp` lowering.

(rule (lower (trapif int_cc (ifcmp x y) trap_code))
      (side_effect (trap_if_bool (icmp_val $false int_cc x y) trap_code)))

;; Recognize the case of `iadd_ifcout` feeding into `trapif`.  Note that
;; in this case, the `iadd_ifcout` is generated by a separate lowering
;; (in order to properly handle the register output of that instruction).
;;
;; The flags must not have been clobbered by any other instruction between the
;; iadd_ifcout and this instruction, as verified by the CLIF validator; so we
;; can simply rely on the condition code here.
;;
;; IaddIfcout is implemented via an ADD LOGICAL instruction, which sets
;; the condition code as follows:
;;     0   Result zero; no carry
;;     1   Result not zero; no carry
;;     2   Result zero; carry
;;     3   Result not zero; carry
;; This means "carry" corresponds to condition code 2 or 3, i.e.
;; a condition mask of 2 | 1 == 3, which is the mask used below.
;;
;; As this does not match any of the encodings used with a normal integer
;; comparison, it cannot be represented by any IntCC value.  We need to
;; remap the IntCC::UnsignedGreaterThan value that we have here as the result
;; of the unsigned_add_overflow_condition call to the correct mask.

(rule (lower (trapif (IntCC.UnsignedGreaterThan)
                     (iadd_ifcout x y) trap_code))
      (side_effect (trap_if_impl (mask_as_cond 3) trap_code)))


;;;; Rules for `return` and `fallthrough_return` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (return args))
      (lower_return (range 0 (value_slice_len args)) args))

(rule (lower (fallthrough_return args))
      (lower_return (range 0 (value_slice_len args)) args))

;; Copy each return value into its ABI location, iterating over the range
;; of return-value indices.
(decl lower_return (Range ValueSlice) InstOutput)
(rule (lower_return (range_empty) _) (output_none))
(rule (lower_return (range_unwrap head tail) args)
      (let ((_ Unit (copy_to_regs (retval head) (value_slice_get args head))))
        (lower_return tail args)))


;;;; Rules for `call` and `call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Direct call to an in-range function.
(rule (lower (call (func_ref_data sig_ref name (reloc_distance_near)) args))
      (let ((abi ABISig (abi_sig sig_ref))
            (_ Unit (abi_accumulate_outgoing_args_size abi))
            (_ InstOutput (lower_call_args abi (range 0 (abi_num_args abi)) args))
            (_ InstOutput (side_effect (abi_call abi name (Opcode.Call)))))
        (lower_call_rets abi (range 0 (abi_num_rets abi)) (output_builder_new))))

;; Direct call to an out-of-range function (implicitly via pointer).
(rule (lower (call (func_ref_data sig_ref name _) args))
      (let ((abi ABISig (abi_sig sig_ref))
            (_ Unit (abi_accumulate_outgoing_args_size abi))
            (_ InstOutput (lower_call_args abi (range 0 (abi_num_args abi)) args))
            (target Reg (load_ext_name_far name 0))
            (_ InstOutput (side_effect (abi_call_ind abi target (Opcode.Call)))))
        (lower_call_rets abi (range 0 (abi_num_rets abi)) (output_builder_new))))

;; Indirect call.
(rule (lower (call_indirect sig_ref ptr args))
      (let ((abi ABISig (abi_sig sig_ref))
            (target Reg (put_in_reg ptr))
            (_ Unit (abi_accumulate_outgoing_args_size abi))
            (_ InstOutput (lower_call_args abi (range 0 (abi_num_args abi)) args))
            (_ InstOutput (side_effect (abi_call_ind abi target (Opcode.CallIndirect)))))
        (lower_call_rets abi (range 0 (abi_num_rets abi)) (output_builder_new))))

;; Lower function arguments by loading them into registers / stack slots.
(decl lower_call_args (ABISig Range ValueSlice) InstOutput)
(rule (lower_call_args abi (range_empty) _) (lower_call_ret_arg abi))
(rule (lower_call_args abi (range_unwrap head tail) args)
      (let ((idx usize (abi_copy_to_arg_order abi head))
            (_ Unit (copy_to_arg 0 (abi_get_arg abi idx)
                                 (value_slice_get args idx))))
        (lower_call_args abi tail args)))

;; Lower the implicit return-area pointer argument, if present.
(decl lower_call_ret_arg (ABISig) InstOutput)
(rule (lower_call_ret_arg (abi_no_ret_arg)) (output_none))
(rule (lower_call_ret_arg abi @ (abi_ret_arg (abi_arg_only_slot slot)))
      (let ((ret_arg Reg (load_addr (memarg_stack_off (abi_sized_stack_arg_space abi) 0)))
            (_ Unit (copy_reg_to_arg_slot 0 slot ret_arg)))
        (output_none)))

;; Lower function return values by collecting them from registers / stack slots.
(decl lower_call_rets (ABISig Range InstOutputBuilder) InstOutput)
(rule (lower_call_rets abi (range_empty) builder) (output_builder_finish builder))
(rule (lower_call_rets abi (range_unwrap head tail) builder)
      (let ((ret ValueRegs (copy_from_arg (abi_sized_stack_arg_space abi) (abi_get_ret abi head)))
            (_ Unit (output_builder_push builder ret)))
        (lower_call_rets abi tail builder)))