;; Instruction selection for aarch64: lowering CLIF to MachInst.

;; `lower` is the entry-point constructor for lowering. Given a CLIF `Inst`, it
;; yields the register(s) that hold the result value(s) of the lowered
;; instruction.
(decl lower (Inst) ValueRegs)

;;;; Rules for `iconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Materialize the value extracted by `u64_from_imm64` with `imm`.
(rule (lower (has_type ty (iconst (u64_from_imm64 value))))
      (imm ty value))

;;;; Rules for `bconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `false` is materialized as 0 and `true` as 1.
(rule (lower (has_type ty (bconst $false)))
      (imm ty 0))

(rule (lower (has_type ty (bconst $true)))
      (imm ty 1))

;;;; Rules for `null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `null` is materialized as the immediate 0.
(rule (lower (has_type ty (null)))
      (imm ty 0))

;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller

;; Fallback: both operands live in registers.
(rule (lower (has_type (fits_in_64 ty) (iadd lhs rhs)))
      (add ty lhs rhs))

;; Either operand is an immediate that fits in 12 bits: use the
;; register+immediate form, with the register operand first.
(rule (lower (has_type (fits_in_64 ty) (iadd reg (imm12_from_value small))))
      (add_imm ty reg small))

(rule (lower (has_type (fits_in_64 ty) (iadd (imm12_from_value small) reg)))
      (add_imm ty reg small))

;; As above, but for the case where it is the *negated* immediate that fits in
;; 12 bits: the addition is turned into a subtraction of that negated value.
(rule (lower (has_type (fits_in_64 ty)
                       (iadd reg (imm12_from_negated_value negated))))
      (sub_imm ty reg negated))

(rule (lower (has_type (fits_in_64 ty)
                       (iadd (imm12_from_negated_value negated) reg)))
      (sub_imm ty reg negated))

;; Either operand is an extended register whose extension can be folded into
;; the add itself.
(rule (lower (has_type (fits_in_64 ty)
                       (iadd reg (extended_value_from_value ext))))
      (add_extend ty reg ext))

(rule (lower (has_type (fits_in_64 ty)
                       (iadd (extended_value_from_value ext) reg)))
      (add_extend ty reg ext))

;; Special cases for when we're adding the shift of a different
;; register by a constant amount and the shift can get folded into the add.
;; NOTE(review): the rule on the next line is truncated in this copy of the
;; file. Everything between `lshl_from_imm64` and the stray `ORR_NOT rd, zero,
;; rm` text (which looks like the tail of a pseudo-assembly comment) is
;; missing. The line is kept verbatim; restore the lost text from the original
;; source before using this file.
(rule (lower (has_type (fits_in_64 ty) (iadd x (ishl y (iconst (lshl_from_imm64 ORR_NOT rd, zero, rm

;; Base `bnot` case: emitted as `orr_not` with the zero register as the first
;; operand.
(rule (lower (has_type (fits_in_64 ty) (bnot x)))
      (orr_not ty (zero_reg) x))

;; Special case to use `orr_not_shift` if it's a `bnot` of a const-left-shifted
;; value.
;;
;; NOTE(review): the rule on the next line is also truncated after
;; `lshl_from_imm64`; its remainder, and whatever followed it up to the
;; `small_rotr` helper below, is missing from this copy of the file.
(rule (lower (has_type (fits_in_64 ty) (bnot (ishl x (iconst (lshl_from_imm64

;; `small_rotr ty val amt`: rotate `val` right by the register amount `amt`,
;; built from 32-bit shifts. The emitted sequence is:
;;
;;   and masked_amt, amt, <rotr_mask ty>
;;   sub tmp_sub, masked_amt, <ty_bits ty>
;;   sub neg_amt, zero, tmp_sub  ; neg
;;   lsr val_rshift, val, masked_amt
;;   lsl val_lshift, val, neg_amt
;;   orr rd, val_lshift val_rshift
;;
;; i.e. `neg_amt` is `<ty_bits ty> - masked_amt`.
(decl small_rotr (Type Reg Reg) Reg)
(rule (small_rotr ty val amt)
      (let ((masked_amt Reg (and_imm $I32 amt (rotr_mask ty)))
            (tmp_sub Reg (sub_imm $I32 masked_amt (u8_into_imm12 (ty_bits ty))))
            (neg_amt Reg (sub $I32 (zero_reg) tmp_sub))
            (val_rshift Reg (lsr $I32 val masked_amt))
            (val_lshift Reg (lsl $I32 val neg_amt)))
        (orr $I32 val_lshift val_rshift)))

;; Mask applied to the rotate amount in `small_rotr`; implemented externally.
;; NOTE(review): presumably `bitwidth - 1` for the given type -- confirm
;; against the external definition.
(decl rotr_mask (Type) ImmLogic)
(extern constructor rotr_mask rotr_mask)

;; For a constant amount, we can instead do:
;;
;;   rotr rd, val, #amt
;;
;;       =>
;;
;;   lsr val_rshift, val, #<amt>
;;   lsl val_lshift, val, #<rotr_opposite_amount ty amt>
;;   orr rd, val_lshift, val_rshift
(decl small_rotr_imm (Type Reg ImmShift) Reg)
(rule (small_rotr_imm ty val amt)
      (let ((val_rshift Reg (lsr_imm $I32 val amt))
            (val_lshift Reg (lsl_imm $I32 val (rotr_opposite_amount ty amt))))
        (orr $I32 val_lshift val_rshift)))

;; Left-shift amount paired with a constant right-rotate amount in
;; `small_rotr_imm`; implemented externally.
;; NOTE(review): presumably `bitwidth - amt` -- confirm against the external
;; definition.
(decl rotr_opposite_amount (Type ImmShift) ImmShift)
(extern constructor rotr_opposite_amount rotr_opposite_amount)

;; General 128-bit case.
;;
;; TODO: much better codegen is possible with a constant amount.
(rule (lower (has_type $I128 (rotr x y)))
      (let ((val ValueRegs x)
            ;; Only the low 64 bits of the rotate amount are used.
            (amt Reg (value_regs_get y 0))
            ;; neg_amt = 128 - amt
            (neg_amt Reg (sub $I64 (imm $I64 128) amt))
            (rshift ValueRegs (lower_ushr128 val amt))
            (lshift ValueRegs (lower_shl128 val neg_amt))
            ;; OR the two shifted values together, half by half.
            (hi Reg (orr $I64 (value_regs_get rshift 1) (value_regs_get lshift 1)))
            (lo Reg (orr $I64 (value_regs_get rshift 0) (value_regs_get lshift 0))))
        (value_regs lo hi)))

;;;; Rules for `bitrev` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Reversing an 8-bit value with a 32-bit bitrev instruction will place
;; the reversed result in the highest 8 bits, so we need to shift them down into
;; place.
(rule (lower (has_type $I8 (bitrev x)))
      (lsr_imm $I32 (rbit $I32 x) (imm_shift_from_u8 24)))

;; Reversing a 16-bit value with a 32-bit bitrev instruction will place
;; the reversed result in the highest 16 bits, so we need to shift them down into
;; place.
(rule (lower (has_type $I16 (bitrev x)))
      (lsr_imm $I32 (rbit $I32 x) (imm_shift_from_u8 16)))

;; 128-bit: reverse each 64-bit half and swap the halves, so the reversed high
;; half becomes the low half of the result and vice versa.
(rule (lower (has_type $I128 (bitrev x)))
      (let ((val ValueRegs x)
            (lo_rev Reg (rbit $I64 (value_regs_get val 0)))
            (hi_rev Reg (rbit $I64 (value_regs_get val 1))))
        (value_regs hi_rev lo_rev)))

;; All remaining types map directly onto `rbit`.
(rule (lower (has_type ty (bitrev x)))
      (rbit ty x))

;;;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 8- and 16-bit inputs are zero-extended to 32 bits and counted with the
;; 32-bit instruction; the 24 (resp. 16) leading zeros contributed by the
;; extension are then subtracted from the count.
(rule (lower (has_type $I8 (clz x)))
      (sub_imm $I32 (a64_clz $I32 (put_in_reg_zext32 x)) (u8_into_imm12 24)))

(rule (lower (has_type $I16 (clz x)))
      (sub_imm $I32 (a64_clz $I32 (put_in_reg_zext32 x)) (u8_into_imm12 16)))

(rule (lower (has_type $I128 (clz x)))
      (lower_clz128 x))

(rule (lower (has_type ty (clz x)))
      (a64_clz ty x))

;; Count the leading zeros of a 128-bit value held in two 64-bit registers:
;;
;;   clz hi_clz, hi
;;   clz lo_clz, lo
;;   lsr tmp, hi_clz, #6
;;   madd dst_lo, lo_clz, tmp, hi_clz
;;   mov  dst_hi, 0
;;
;; `tmp` is `hi_clz >> 6`, which is nonzero only once `hi_clz` reaches 64 (the
;; high half is entirely zero). The `madd` therefore computes
;; `lo_clz * tmp + hi_clz`, adding in `lo_clz` only in that case.
(decl lower_clz128 (ValueRegs) ValueRegs)
(rule (lower_clz128 val)
      (let ((hi_clz Reg (a64_clz $I64 (value_regs_get val 1)))
            (lo_clz Reg (a64_clz $I64 (value_regs_get val 0)))
            (tmp Reg (lsr_imm $I64 hi_clz (imm_shift_from_u8 6))))
        (value_regs (madd64 lo_clz tmp hi_clz) (imm $I64 0))))

;;;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Note that all `ctz` instructions are implemented by reversing the bits and
;; then using a `clz` instruction since the tail zeros are the same as the
;; leading zeros of the reversed value.

;; For the narrow types the reversed value sits in the top 8 (resp. 16) bits of
;; the 32-bit register. OR-ing in the bit just below it (bit 23, resp. bit 15)
;; stops the count at 8 (resp. 16) when the input is zero.
(rule (lower (has_type $I8 (ctz x)))
      (a64_clz $I32 (orr_imm $I32 (rbit $I32 x) (u64_into_imm_logic $I32 0x800000))))

(rule (lower (has_type $I16 (ctz x)))
      (a64_clz $I32 (orr_imm $I32 (rbit $I32 x) (u64_into_imm_logic $I32 0x8000))))

;; 128-bit: bit-reverse each half, swap the halves (the `value_regs hi lo`
;; below puts the reversed original high half in the low position), and count
;; leading zeros of the result.
(rule (lower (has_type $I128 (ctz x)))
      (let ((val ValueRegs x)
            (lo Reg (rbit $I64 (value_regs_get val 0)))
            (hi Reg (rbit $I64 (value_regs_get val 1))))
        (lower_clz128 (value_regs hi lo))))

(rule (lower (has_type ty (ctz x)))
      (a64_clz ty (rbit ty x)))

;;;; Rules for `cls` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 8- and 16-bit inputs are widened to 32 bits, counted with the 32-bit
;; instruction, and the 24 (resp. 16) extra sign bits introduced by widening
;; are subtracted.
;;
;; The widening must be a *sign* extension: `cls` counts bits equal to the sign
;; bit, so the bits added on top have to be copies of the narrow value's sign
;; bit. With a zero extension a negative input is seen as non-negative; e.g.
;; the i8 value 0xff would be counted as `cls32(0x000000ff) - 24 = -1` rather
;; than 7.
(rule (lower (has_type $I8 (cls x)))
      (sub_imm $I32 (a64_cls $I32 (put_in_reg_sext32 x)) (u8_into_imm12 24)))

(rule (lower (has_type $I16 (cls x)))
      (sub_imm $I32 (a64_cls $I32 (put_in_reg_sext32 x)) (u8_into_imm12 16)))

;; 128-bit case:
;;
;;   cls lo_cls, lo
;;   cls hi_cls, hi
;;   eon sign_eq_eon, hi, lo
;;   lsr sign_eq, sign_eq_eon, #63
;;   madd lo_sign_bits, lo_cls, sign_eq, sign_eq
;;   cmp hi_cls, #63
;;   csel maybe_lo, lo_sign_bits, xzr, eq
;;   add  out_lo, maybe_lo, hi_cls
;;   mov  out_hi, 0
;;
;; `sign_eq` is the top bit of `eon hi, lo`, i.e. 1 when both halves have the
;; same top bit. `lo_sign_bits` is then `lo_cls * sign_eq + sign_eq`, and it is
;; only added to `hi_cls` when `hi_cls` equals 63.
(rule (lower (has_type $I128 (cls x)))
      (let ((val ValueRegs x)
            (lo Reg (value_regs_get val 0))
            (hi Reg (value_regs_get val 1))
            (lo_cls Reg (a64_cls $I64 lo))
            (hi_cls Reg (a64_cls $I64 hi))
            (sign_eq_eon Reg (eon $I64 hi lo))
            (sign_eq Reg (lsr_imm $I64 sign_eq_eon (imm_shift_from_u8 63)))
            (lo_sign_bits Reg (madd64 lo_cls sign_eq sign_eq))
            (maybe_lo Reg (with_flags_reg
                            (cmp64_imm hi_cls (u8_into_imm12 63))
                            (csel (Cond.Eq) lo_sign_bits (zero_reg)))))
        (value_regs (add $I64 maybe_lo hi_cls) (imm $I64 0))))

(rule (lower (has_type ty (cls x)))
      (a64_cls ty x))

;;;; Rules for `popcnt`
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Scalar `popcnt` goes through the vector unit: the value is moved into a
;; vector register, `cnt` produces per-byte bit counts, the byte counts are
;; summed, and the total is moved back to a general-purpose register.
;;
;; Overall shape of the emitted code:
;;
;;   fmov tmp, in_lo
;;   if ty == i128:
;;     mov tmp.d[1], in_hi
;;
;;   cnt tmp.16b, tmp.16b / cnt tmp.8b, tmp.8b
;;   addv tmp, tmp.16b / addv tmp, tmp.8b / addp tmp.8b, tmp.8b, tmp.8b / (no instruction for 8-bit inputs)
;;
;;   umov out_lo, tmp.b[0]
;;   if ty == i128:
;;     mov out_hi, 0

;; 8-bit: no summing step; byte lane 0 of the `cnt` result is the answer.
(rule (lower (has_type $I8 (popcnt x)))
      (let ((vec Reg (mov_to_fpu x (ScalarSize.Size32)))
            (byte_counts Reg (vec_cnt vec (VectorSize.Size8x8))))
        (mov_from_vec byte_counts 0 (VectorSize.Size8x16))))

;; 16-bit: the byte counts are combined with `addp` rather than `addv`, as
;; it's usually cheaper.
(rule (lower (has_type $I16 (popcnt x)))
      (let ((vec Reg (mov_to_fpu x (ScalarSize.Size32)))
            (byte_counts Reg (vec_cnt vec (VectorSize.Size8x8)))
            (total Reg (addp byte_counts byte_counts (VectorSize.Size8x8))))
        (mov_from_vec total 0 (VectorSize.Size8x16))))

;; 32-bit: 32-bit move in, `cnt` and `addv` over eight byte lanes.
(rule (lower (has_type $I32 (popcnt x)))
      (let ((vec Reg (mov_to_fpu x (ScalarSize.Size32)))
            (byte_counts Reg (vec_cnt vec (VectorSize.Size8x8)))
            (total Reg (addv byte_counts (VectorSize.Size8x8))))
        (mov_from_vec total 0 (VectorSize.Size8x16))))

;; 64-bit: same as the 32-bit case except for the 64-bit move in.
(rule (lower (has_type $I64 (popcnt x)))
      (let ((vec Reg (mov_to_fpu x (ScalarSize.Size64)))
            (byte_counts Reg (vec_cnt vec (VectorSize.Size8x8)))
            (total Reg (addv byte_counts (VectorSize.Size8x8))))
        (mov_from_vec total 0 (VectorSize.Size8x16))))

;; 128-bit: the low half is moved in first, the high half is inserted into
;; 64-bit lane 1, and `cnt`/`addv` run over all sixteen byte lanes. The high
;; half of the result is the constant 0.
(rule (lower (has_type $I128 (popcnt x)))
      (let ((halves ValueRegs x)
            (low_in_vec Reg (mov_to_fpu (value_regs_get halves 0)
                                        (ScalarSize.Size64)))
            (vec Reg (mov_to_vec low_in_vec
                                 (value_regs_get halves 1)
                                 1
                                 (VectorSize.Size64x2)))
            (byte_counts Reg (vec_cnt vec (VectorSize.Size8x16)))
            (total Reg (addv byte_counts (VectorSize.Size8x16))))
        (value_regs (mov_from_vec total 0 (VectorSize.Size8x16))
                    (imm $I64 0))))

;; `i8x16` vectors: a single `cnt` over sixteen byte lanes.
(rule (lower (has_type $I8X16 (popcnt x)))
      (vec_cnt x (VectorSize.Size8x16)))