This commit translates the existing `rotl` and `rotr` lowerings to ISLE. The port was relatively straightforward, with the biggest change being the instructions generated around i128 `rotl`/`rotr`, primarily due to register changes.
;; aarch64 instruction selection and CLIF-to-MachInst lowering.

;; The main lowering constructor term: takes a clif `Inst` and returns the
;; register(s) within which the lowered instruction's result values live.
(decl lower (Inst) ValueRegs)

;;;; Rules for `iconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type ty (iconst (u64_from_imm64 n))))
      (value_reg (imm ty n)))

;;;; Rules for `bconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type ty (bconst $false)))
      (value_reg (imm ty 0)))

(rule (lower (has_type ty (bconst $true)))
      (value_reg (imm ty 1)))

;;;; Rules for `null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type ty (null)))
      (value_reg (imm ty 0)))

;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller

;; Base case, simply adding things in registers.
(rule (lower (has_type (fits_in_64 ty) (iadd x y)))
      (value_reg (alu_rrr (iadd_op ty) (put_in_reg x) (put_in_reg y))))

;; Special cases for when one operand is an immediate that fits in 12 bits.
(rule (lower (has_type (fits_in_64 ty) (iadd x (imm12_from_value y))))
      (value_reg (alu_rr_imm12 (iadd_op ty) (put_in_reg x) y)))

(rule (lower (has_type (fits_in_64 ty) (iadd (imm12_from_value x) y)))
      (value_reg (alu_rr_imm12 (iadd_op ty) (put_in_reg y) x)))

;; Same as the previous special cases, except we can switch the addition to a
;; subtraction if the negated immediate fits in 12 bits.
(rule (lower (has_type (fits_in_64 ty) (iadd x (imm12_from_negated_value y))))
      (value_reg (alu_rr_imm12 (isub_op ty) (put_in_reg x) y)))

(rule (lower (has_type (fits_in_64 ty) (iadd (imm12_from_negated_value x) y)))
      (value_reg (alu_rr_imm12 (isub_op ty) (put_in_reg y) x)))

;; Special cases for when we're adding an extended register where the extending
;; operation can get folded into the add itself.
(rule (lower (has_type (fits_in_64 ty) (iadd x (extended_value_from_value y))))
      (value_reg (alu_rr_extend_reg (iadd_op ty) (put_in_reg x) y)))

(rule (lower (has_type (fits_in_64 ty) (iadd (extended_value_from_value x) y)))
      (value_reg (alu_rr_extend_reg (iadd_op ty) (put_in_reg y) x)))

;; Special cases for when we're adding the shift of a different
;; register by a constant amount and the shift can get folded into the add.
(rule (lower (has_type (fits_in_64 ty)
                       (iadd x (def_inst (ishl y (def_inst (iconst (lshl_from_imm64 <ty amt))))))))
      (value_reg (alu_rrr_shift (iadd_op ty) (put_in_reg x) (put_in_reg y) amt)))

(rule (lower (has_type (fits_in_64 ty)
                       (iadd (def_inst (ishl x (def_inst (iconst (lshl_from_imm64 <ty amt))))) y)))
      (value_reg (alu_rrr_shift (iadd_op ty) (put_in_reg y) (put_in_reg x) amt)))

;; Fold an `iadd` and `imul` combination into a `madd` instruction.
(rule (lower (has_type (fits_in_64 ty) (iadd x (def_inst (imul y z)))))
      (value_reg (alu_rrrr (madd_op ty) (put_in_reg y) (put_in_reg z) (put_in_reg x))))

(rule (lower (has_type (fits_in_64 ty) (iadd (def_inst (imul x y)) z)))
      (value_reg (alu_rrrr (madd_op ty) (put_in_reg x) (put_in_reg y) (put_in_reg z))))

;; Helper to use either a 32 or 64-bit add depending on the input type.
(decl iadd_op (Type) ALUOp)
(rule (iadd_op (fits_in_32 _ty)) (ALUOp.Add32))
(rule (iadd_op $I64) (ALUOp.Add64))

;; Helper to use either a 32 or 64-bit sub depending on the input type.
(decl isub_op (Type) ALUOp)
(rule (isub_op (fits_in_32 _ty)) (ALUOp.Sub32))
(rule (isub_op $I64) (ALUOp.Sub64))

;; Helper to use either a 32 or 64-bit madd depending on the input type.
(decl madd_op (Type) ALUOp3)
(rule (madd_op (fits_in_32 _ty)) (ALUOp3.MAdd32))
(rule (madd_op $I64) (ALUOp3.MAdd64))

;; vectors
(rule (lower (has_type ty @ (multi_lane _ _) (iadd x y)))
      (value_reg (vec_rrr (VecALUOp.Add) (put_in_reg x) (put_in_reg y) (vector_size ty))))

;; `i128`
(rule (lower (has_type $I128 (iadd x y)))
      (let (
            ;; Get the high/low registers for `x`.
            (x_regs ValueRegs (put_in_regs x))
            (x_lo Reg (value_regs_get x_regs 0))
            (x_hi Reg (value_regs_get x_regs 1))

            ;; Get the high/low registers for `y`.
            (y_regs ValueRegs (put_in_regs y))
            (y_lo Reg (value_regs_get y_regs 0))
            (y_hi Reg (value_regs_get y_regs 1))
           )
        ;; The actual addition is `adds` followed by `adc`, which produce the
        ;; low and high halves of the result, respectively.
        (with_flags
          (add64_with_flags x_lo y_lo)
          (adc64 x_hi y_hi))))

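;; As a worked illustration (not from the original source): adding
;; x = 0xFFFF_FFFF_FFFF_FFFF (with x_hi = 0) and y = 1, the `adds`
;; produces a low half of 0 and sets the carry flag, and `adc` then
;; computes 0 + 0 + carry = 1 for the high half, giving the i128 result
;; 0x1_0000_0000_0000_0000 as expected.
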
;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller

;; Base case, simply subtracting things in registers.
(rule (lower (has_type (fits_in_64 ty) (isub x y)))
      (value_reg (alu_rrr (isub_op ty) (put_in_reg x) (put_in_reg y))))

;; Special case for when one operand is an immediate that fits in 12 bits.
(rule (lower (has_type (fits_in_64 ty) (isub x (imm12_from_value y))))
      (value_reg (alu_rr_imm12 (isub_op ty) (put_in_reg x) y)))

;; Same as the previous special case, except we can switch the subtraction to
;; an addition if the negated immediate fits in 12 bits.
(rule (lower (has_type (fits_in_64 ty) (isub x (imm12_from_negated_value y))))
      (value_reg (alu_rr_imm12 (iadd_op ty) (put_in_reg x) y)))

;; Special case for when we're subtracting an extended register where the
;; extending operation can get folded into the sub itself.
(rule (lower (has_type (fits_in_64 ty) (isub x (extended_value_from_value y))))
      (value_reg (alu_rr_extend_reg (isub_op ty) (put_in_reg x) y)))

;; Finally a special case for when we're subtracting the shift of a different
;; register by a constant amount and the shift can get folded into the sub.
(rule (lower (has_type (fits_in_64 ty)
                       (isub x (def_inst (ishl y (def_inst (iconst (lshl_from_imm64 <ty amt))))))))
      (value_reg (alu_rrr_shift (isub_op ty) (put_in_reg x) (put_in_reg y) amt)))

;; vectors
(rule (lower (has_type ty @ (multi_lane _ _) (isub x y)))
      (value_reg (vec_rrr (VecALUOp.Sub) (put_in_reg x) (put_in_reg y) (vector_size ty))))

;; `i128`
(rule (lower (has_type $I128 (isub x y)))
      (let (
            ;; Get the high/low registers for `x`.
            (x_regs ValueRegs (put_in_regs x))
            (x_lo Reg (value_regs_get x_regs 0))
            (x_hi Reg (value_regs_get x_regs 1))

            ;; Get the high/low registers for `y`.
            (y_regs ValueRegs (put_in_regs y))
            (y_lo Reg (value_regs_get y_regs 0))
            (y_hi Reg (value_regs_get y_regs 1))
           )
        ;; The actual subtraction is `subs` followed by `sbc`, which produce
        ;; the low and high halves of the result, respectively.
        (with_flags
          (sub64_with_flags x_lo y_lo)
          (sbc64 x_hi y_hi))))

;;;; Rules for `uadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (vec128 ty) (uadd_sat x y)))
      (value_reg (vec_rrr (VecALUOp.Uqadd) (put_in_reg x) (put_in_reg y) (vector_size ty))))

;;;; Rules for `sadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (vec128 ty) (sadd_sat x y)))
      (value_reg (vec_rrr (VecALUOp.Sqadd) (put_in_reg x) (put_in_reg y) (vector_size ty))))

;;;; Rules for `usub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (vec128 ty) (usub_sat x y)))
      (value_reg (vec_rrr (VecALUOp.Uqsub) (put_in_reg x) (put_in_reg y) (vector_size ty))))

;;;; Rules for `ssub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (vec128 ty) (ssub_sat x y)))
      (value_reg (vec_rrr (VecALUOp.Sqsub) (put_in_reg x) (put_in_reg y) (vector_size ty))))

;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.
(rule (lower (has_type (fits_in_64 ty) (ineg x)))
      (value_reg (alu_rrr (isub_op ty) (zero_reg) (put_in_reg x))))

;; vectors.
(rule (lower (has_type (vec128 ty) (ineg x)))
      (value_reg (vec_misc (VecMisc2.Neg) (put_in_reg x) (vector_size ty))))

;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.
(rule (lower (has_type (fits_in_64 ty) (imul x y)))
      (value_reg (alu_rrrr (madd_op ty) (put_in_reg x) (put_in_reg y) (zero_reg))))

;; `i128`.
(rule (lower (has_type $I128 (imul x y)))
      (let (
            ;; Get the high/low registers for `x`.
            (x_regs ValueRegs (put_in_regs x))
            (x_lo Reg (value_regs_get x_regs 0))
            (x_hi Reg (value_regs_get x_regs 1))

            ;; Get the high/low registers for `y`.
            (y_regs ValueRegs (put_in_regs y))
            (y_lo Reg (value_regs_get y_regs 0))
            (y_hi Reg (value_regs_get y_regs 1))

            ;; 128-bit mul formula:
            ;;   dst_lo = x_lo * y_lo
            ;;   dst_hi = umulhi(x_lo, y_lo) + (x_lo * y_hi) + (x_hi * y_lo)
            ;;
            ;; We can convert the above formula into the following:
            ;;   umulh dst_hi, x_lo, y_lo
            ;;   madd dst_hi, x_lo, y_hi, dst_hi
            ;;   madd dst_hi, x_hi, y_lo, dst_hi
            ;;   madd dst_lo, x_lo, y_lo, zero
            (dst_hi1 Reg (alu_rrr (ALUOp.UMulH) x_lo y_lo))
            (dst_hi2 Reg (alu_rrrr (ALUOp3.MAdd64) x_lo y_hi dst_hi1))
            (dst_hi Reg (alu_rrrr (ALUOp3.MAdd64) x_hi y_lo dst_hi2))
            (dst_lo Reg (alu_rrrr (ALUOp3.MAdd64) x_lo y_lo (zero_reg)))
           )
        (value_regs dst_lo dst_hi)))

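;; A base-10 analogy of the formula above (illustrative only): keeping
;; two digits of 23 * 45 with x_lo=3, x_hi=2, y_lo=5, y_hi=4 gives
;; dst_lo = 3*5 mod 10 = 5 and umulhi(3,5) = 1, so
;; dst_hi = 1 + 3*4 + 2*5 = 23, which mod 10 is 3 -- matching the low
;; two digits of 1035. The analogue of the 2^128(x_hi*y_hi) term never
;; contributes to the truncated result, which is why it is never computed.
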
;; Case for i8x16, i16x8, and i32x4.
(rule (lower (has_type (vec128 ty @ (not_i64x2)) (imul x y)))
      (value_reg (vec_rrr (VecALUOp.Mul) (put_in_reg x) (put_in_reg y) (vector_size ty))))

;; Special lowering for i64x2.
;;
;; This I64X2 multiplication is performed with several 32-bit
;; operations.
;;
;; 64-bit numbers x and y can be represented as:
;;   x = a + 2^32(b)
;;   y = c + 2^32(d)
;;
;; A 64-bit multiplication is:
;;   x * y = ac + 2^32(ad + bc) + 2^64(bd)
;; note: `2^64(bd)` can be ignored, the value is too large to fit in
;; 64 bits.
;;
;; This sequence implements a I64X2 multiply, where the registers
;; `rn` and `rm` are split up into 32-bit components:
;;   rn = |d|c|b|a|
;;   rm = |h|g|f|e|
;;
;;   rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
;;
;; The sequence is:
;;   rev64 rd.4s, rm.4s
;;   mul rd.4s, rd.4s, rn.4s
;;   xtn tmp1.2s, rn.2d
;;   addp rd.4s, rd.4s, rd.4s
;;   xtn tmp2.2s, rm.2d
;;   shll rd.2d, rd.2s, #32
;;   umlal rd.2d, tmp2.2s, tmp1.2s
(rule (lower (has_type $I64X2 (imul x y)))
      (let (
            (rn Reg (put_in_reg x))
            (rm Reg (put_in_reg y))

            ;; Reverse the 32-bit elements in the 64-bit words.
            ;; rd = |g|h|e|f|
            (rev Reg (vec_misc (VecMisc2.Rev64) rm (VectorSize.Size32x4)))

            ;; Calculate the high half components.
            ;; rd = |dg|ch|be|af|
            ;;
            ;; Note that this 32-bit multiply of the high half
            ;; discards the bits that would overflow, same as
            ;; if 64-bit operations were used. Also the Shll
            ;; below would shift out the overflow bits anyway.
            (mul Reg (vec_rrr (VecALUOp.Mul) rev rn (VectorSize.Size32x4)))

            ;; Extract the low half components of rn.
            ;; tmp1 = |c|a|
            (tmp1 Reg (vec_rr_narrow (VecRRNarrowOp.Xtn64) rn $false))

            ;; Sum the respective high half components.
            ;; rd = |dg+ch|be+af||dg+ch|be+af|
            (sum Reg (vec_rrr (VecALUOp.Addp) mul mul (VectorSize.Size32x4)))

            ;; Extract the low half components of rm.
            ;; tmp2 = |g|e|
            (tmp2 Reg (vec_rr_narrow (VecRRNarrowOp.Xtn64) rm $false))

            ;; Shift the high half components, into the high half.
            ;; rd = |dg+ch << 32|be+af << 32|
            (shift Reg (vec_rr_long (VecRRLongOp.Shll32) sum $false))

            ;; Multiply the low components together, and accumulate with the
            ;; high half.
            ;; rd = |rd[1] + cg|rd[0] + ae|
            (result Reg (vec_rrrr_long (VecRRRLongOp.Umlal32) shift tmp2 tmp1 $false))
           )
        (value_reg result)))

;; Special case for `i16x8.extmul_low_i8x16_s`.
(rule (lower (has_type $I16X8
                       (imul (def_inst (swiden_low x @ (value_type $I8X16)))
                             (def_inst (swiden_low y @ (value_type $I8X16))))))
      (value_reg (vec_rrr_long (VecRRRLongOp.Smull8) (put_in_reg x) (put_in_reg y) $false)))

;; Special case for `i16x8.extmul_high_i8x16_s`.
(rule (lower (has_type $I16X8
                       (imul (def_inst (swiden_high x @ (value_type $I8X16)))
                             (def_inst (swiden_high y @ (value_type $I8X16))))))
      (value_reg (vec_rrr_long (VecRRRLongOp.Smull8) (put_in_reg x) (put_in_reg y) $true)))

;; Special case for `i16x8.extmul_low_i8x16_u`.
(rule (lower (has_type $I16X8
                       (imul (def_inst (uwiden_low x @ (value_type $I8X16)))
                             (def_inst (uwiden_low y @ (value_type $I8X16))))))
      (value_reg (vec_rrr_long (VecRRRLongOp.Umull8) (put_in_reg x) (put_in_reg y) $false)))

;; Special case for `i16x8.extmul_high_i8x16_u`.
(rule (lower (has_type $I16X8
                       (imul (def_inst (uwiden_high x @ (value_type $I8X16)))
                             (def_inst (uwiden_high y @ (value_type $I8X16))))))
      (value_reg (vec_rrr_long (VecRRRLongOp.Umull8) (put_in_reg x) (put_in_reg y) $true)))

;; Special case for `i32x4.extmul_low_i16x8_s`.
(rule (lower (has_type $I32X4
                       (imul (def_inst (swiden_low x @ (value_type $I16X8)))
                             (def_inst (swiden_low y @ (value_type $I16X8))))))
      (value_reg (vec_rrr_long (VecRRRLongOp.Smull16) (put_in_reg x) (put_in_reg y) $false)))

;; Special case for `i32x4.extmul_high_i16x8_s`.
(rule (lower (has_type $I32X4
                       (imul (def_inst (swiden_high x @ (value_type $I16X8)))
                             (def_inst (swiden_high y @ (value_type $I16X8))))))
      (value_reg (vec_rrr_long (VecRRRLongOp.Smull16) (put_in_reg x) (put_in_reg y) $true)))

;; Special case for `i32x4.extmul_low_i16x8_u`.
(rule (lower (has_type $I32X4
                       (imul (def_inst (uwiden_low x @ (value_type $I16X8)))
                             (def_inst (uwiden_low y @ (value_type $I16X8))))))
      (value_reg (vec_rrr_long (VecRRRLongOp.Umull16) (put_in_reg x) (put_in_reg y) $false)))

;; Special case for `i32x4.extmul_high_i16x8_u`.
(rule (lower (has_type $I32X4
                       (imul (def_inst (uwiden_high x @ (value_type $I16X8)))
                             (def_inst (uwiden_high y @ (value_type $I16X8))))))
      (value_reg (vec_rrr_long (VecRRRLongOp.Umull16) (put_in_reg x) (put_in_reg y) $true)))

;; Special case for `i64x2.extmul_low_i32x4_s`.
(rule (lower (has_type $I64X2
                       (imul (def_inst (swiden_low x @ (value_type $I32X4)))
                             (def_inst (swiden_low y @ (value_type $I32X4))))))
      (value_reg (vec_rrr_long (VecRRRLongOp.Smull32) (put_in_reg x) (put_in_reg y) $false)))

;; Special case for `i64x2.extmul_high_i32x4_s`.
(rule (lower (has_type $I64X2
                       (imul (def_inst (swiden_high x @ (value_type $I32X4)))
                             (def_inst (swiden_high y @ (value_type $I32X4))))))
      (value_reg (vec_rrr_long (VecRRRLongOp.Smull32) (put_in_reg x) (put_in_reg y) $true)))

;; Special case for `i64x2.extmul_low_i32x4_u`.
(rule (lower (has_type $I64X2
                       (imul (def_inst (uwiden_low x @ (value_type $I32X4)))
                             (def_inst (uwiden_low y @ (value_type $I32X4))))))
      (value_reg (vec_rrr_long (VecRRRLongOp.Umull32) (put_in_reg x) (put_in_reg y) $false)))

;; Special case for `i64x2.extmul_high_i32x4_u`.
(rule (lower (has_type $I64X2
                       (imul (def_inst (uwiden_high x @ (value_type $I32X4)))
                             (def_inst (uwiden_high y @ (value_type $I32X4))))))
      (value_reg (vec_rrr_long (VecRRRLongOp.Umull32) (put_in_reg x) (put_in_reg y) $true)))

;;;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I64 (smulhi x y)))
      (value_reg (alu_rrr (ALUOp.SMulH) (put_in_reg x) (put_in_reg y))))

(rule (lower (has_type (fits_in_32 ty) (smulhi x y)))
      (let (
            (x64 Reg (put_in_reg_sext64 x))
            (y64 Reg (put_in_reg_sext64 y))
            (mul Reg (alu_rrrr (ALUOp3.MAdd64) x64 y64 (zero_reg)))
            (result Reg (alu_rr_imm_shift (ALUOp.Asr64) mul (imm_shift_from_u8 (ty_bits ty))))
           )
        (value_reg result)))

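;; A worked example (illustrative): `smulhi` on i16 inputs -2 and 3
;; sign-extends both to 64 bits, computes -2 * 3 = -6
;; (0xFFFF_FFFF_FFFF_FFFA), and the arithmetic shift right by 16 yields
;; -1 (0xFFFF as an i16), which is the high 16 bits of the 32-bit
;; product 0xFFFF_FFFA.
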
;;;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I64 (umulhi x y)))
      (value_reg (alu_rrr (ALUOp.UMulH) (put_in_reg x) (put_in_reg y))))

(rule (lower (has_type (fits_in_32 ty) (umulhi x y)))
      (let (
            (x64 Reg (put_in_reg_zext64 x))
            (y64 Reg (put_in_reg_zext64 y))
            (mul Reg (alu_rrrr (ALUOp3.MAdd64) x64 y64 (zero_reg)))
            (result Reg (alu_rr_imm_shift (ALUOp.Lsr64) mul (imm_shift_from_u8 (ty_bits ty))))
           )
        (value_reg result)))

;;;; Rules for `udiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; TODO: Add UDiv32 to implement 32-bit directly, rather
;; than extending the input.
;;
;; Note that aarch64's `udiv` doesn't trap, so to respect the semantics of
;; CLIF's `udiv` the check for zero needs to be performed manually.
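;;
;; A sketch of the expected check sequence, mirroring the `sdiv` comment
;; below (illustrative; the exact offsets are up to the emitter):
;;
;; cbnz rm, #8 ; branch over the trap when the divisor is nonzero
;; udf ; divide by zero
;; udiv rd, rn, rm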
(rule (lower (has_type (fits_in_64 ty) (udiv x y)))
      (value_reg (alu_rrr (ALUOp.UDiv64)
                          (put_in_reg_zext64 x)
                          (put_nonzero_in_reg_zext64 y))))

;; Helper for placing a `Value` into a `Reg` and validating that it's nonzero.
(decl put_nonzero_in_reg_zext64 (Value) Reg)
(rule (put_nonzero_in_reg_zext64 val)
      (trap_if_zero_divisor (put_in_reg_zext64 val)))

;; Special case where if a `Value` is known to be nonzero we can trivially
;; move it into a register.
(rule (put_nonzero_in_reg_zext64 (and (value_type ty)
                                      (def_inst (iconst (nonzero_u64_from_imm64 n)))))
      (imm ty n))

;;;; Rules for `sdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; TODO: Add SDiv32 to implement 32-bit directly, rather
;; than extending the input.
;;
;; The sequence of checks here should look like:
;;
;; cbnz rm, #8
;; udf ; divide by zero
;; cmn rm, 1
;; ccmp rn, 1, #nzcv, eq
;; b.vc #8
;; udf ; signed overflow
;;
;; Note that the `div` instruction does not trap on divide by zero or
;; overflow, so the checks need to be inserted manually.
;;
;; TODO: if `y` is -1 then a check that `x` is not INT_MIN is all that's
;; necessary, but right now `y` is checked to not be -1 as well.
(rule (lower (has_type (fits_in_64 ty) (sdiv x y)))
      (let (
            (x64 Reg (put_in_reg_sext64 x))
            (y64 Reg (put_nonzero_in_reg_sext64 y))
            (valid_x64 Reg (trap_if_div_overflow ty x64 y64))
            (result Reg (alu_rrr (ALUOp.SDiv64) valid_x64 y64))
           )
        (value_reg result)))

;; Helper for extracting an immediate that's not 0 and not -1 from an imm64.
(decl safe_divisor_from_imm64 (u64) Imm64)
(extern extractor safe_divisor_from_imm64 safe_divisor_from_imm64)

;; Special case for `sdiv` where no checks are needed: dividing by a safe
;; constant means the checks always pass.
(rule (lower (has_type (fits_in_64 ty) (sdiv x (def_inst (iconst (safe_divisor_from_imm64 y))))))
      (value_reg (alu_rrr (ALUOp.SDiv64)
                          (put_in_reg_sext64 x)
                          (imm ty y))))

;; Helper for placing a `Value` into a `Reg` and validating that it's nonzero.
(decl put_nonzero_in_reg_sext64 (Value) Reg)
(rule (put_nonzero_in_reg_sext64 val)
      (trap_if_zero_divisor (put_in_reg_sext64 val)))

;; Note that this has a special case where if the `Value` is a constant that's
;; not zero we can skip the zero check.
(rule (put_nonzero_in_reg_sext64 (and (value_type ty)
                                      (def_inst (iconst (nonzero_u64_from_imm64 n)))))
      (imm ty n))

;;;; Rules for `urem` and `srem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Remainder (x % y) is implemented as:
;;
;;   tmp = x / y
;;   result = x - (tmp * y)
;;
;; Reusing 'result' for 'tmp', the sequence is:
;;
;; cbnz y, #8 ; branch over trap
;; udf ; divide by zero
;; div rd, x, y ; rd = x / y
;; msub rd, rd, y, x ; rd = x - rd * y
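;;
;; A worked example of the truncating semantics (illustrative only):
;; `srem` of -7 by 3 computes div = -7 / 3 = -2 (truncated toward zero),
;; then result = -7 - (-2 * 3) = -1, so the remainder takes the sign of
;; the dividend.
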
(rule (lower (has_type (fits_in_64 ty) (urem x y)))
      (let (
            (x64 Reg (put_in_reg_zext64 x))
            (y64 Reg (put_nonzero_in_reg_zext64 y))
            (div Reg (alu_rrr (ALUOp.UDiv64) x64 y64))
            (result Reg (alu_rrrr (ALUOp3.MSub64) div y64 x64))
           )
        (value_reg result)))

(rule (lower (has_type (fits_in_64 ty) (srem x y)))
      (let (
            (x64 Reg (put_in_reg_sext64 x))
            (y64 Reg (put_nonzero_in_reg_sext64 y))
            (div Reg (alu_rrr (ALUOp.SDiv64) x64 y64))
            (result Reg (alu_rrrr (ALUOp3.MSub64) div y64 x64))
           )
        (value_reg result)))

;;;; Rules for `uextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; General rule for extending input to an output which fits in a single
;; register.
(rule (lower (has_type (fits_in_64 out) (uextend x @ (value_type in))))
      (value_reg (extend (put_in_reg x) $false (ty_bits in) (ty_bits out))))

;; Extraction of a vector lane automatically extends as necessary, so we can
;; skip an explicit extending instruction.
(rule (lower (has_type (fits_in_64 out)
                       (uextend (def_inst (extractlane vec @ (value_type in)
                                                       (u8_from_uimm8 lane))))))
      (value_reg (mov_from_vec (put_in_reg vec) lane (vector_size in))))

;; Atomic loads will also automatically zero their upper bits, so the
;; `uextend` instruction can effectively be skipped here.
(rule (lower (has_type (fits_in_64 out)
                       (uextend (and (value_type in) (sinkable_atomic_load addr)))))
      (value_reg (load_acquire in (sink_atomic_load addr))))

;; Conversion to 128-bit zero-extends the value into the low 64 bits; the
;; upper 64 bits are all zero.
(rule (lower (has_type $I128 (uextend x)))
      (value_regs (put_in_reg_zext64 x) (imm $I64 0)))

;; As above, vector extraction automatically zero-extends, so extending to
;; i128 only requires generating a 0 constant for the upper bits.
(rule (lower (has_type $I128
                       (uextend (def_inst (extractlane vec @ (value_type in)
                                                       (u8_from_uimm8 lane))))))
      (value_regs (mov_from_vec (put_in_reg vec) lane (vector_size in)) (imm $I64 0)))

;;;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; General rule for extending input to an output which fits in a single
;; register.
(rule (lower (has_type (fits_in_64 out) (sextend x @ (value_type in))))
      (value_reg (extend (put_in_reg x) $true (ty_bits in) (ty_bits out))))

;; Extraction of a vector lane automatically extends as necessary, so we can
;; skip an explicit extending instruction.
(rule (lower (has_type (fits_in_64 out)
                       (sextend (def_inst (extractlane vec @ (value_type in)
                                                       (u8_from_uimm8 lane))))))
      (value_reg (mov_from_vec_signed (put_in_reg vec)
                                      lane
                                      (vector_size in)
                                      (size_from_ty out))))

;; 64-bit to 128-bit only needs to sign-extend the input to the upper bits.
(rule (lower (has_type $I128 (sextend x)))
      (let (
            (lo Reg (put_in_reg_sext64 x))
            (hi Reg (alu_rr_imm_shift (ALUOp.Asr64) lo (imm_shift_from_u8 63)))
           )
        (value_regs lo hi)))

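;; For example (illustrative): sign-extending the i64 value -1 produces
;; lo = 0xFFFF_FFFF_FFFF_FFFF, and the arithmetic shift right by 63
;; replicates the sign bit so hi is also all ones, i.e. the i128
;; value -1.
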
;; Like above, but for lane extraction into an i128: the lane is moved out
;; with a sign-extending `mov`, and the upper 64 bits are produced by
;; arithmetically shifting the sign bit across a second register.
;;
;; Note that `mov_from_vec_signed` doesn't exist for i64x2, so that's
;; specifically excluded here.
(rule (lower (has_type $I128
                       (sextend (def_inst (extractlane vec @ (value_type in @ (not_i64x2))
                                                       (u8_from_uimm8 lane))))))
      (let (
            (lo Reg (mov_from_vec_signed (put_in_reg vec)
                                         lane
                                         (vector_size in)
                                         (size_from_ty $I64)))
            (hi Reg (alu_rr_imm_shift (ALUOp.Asr64) lo (imm_shift_from_u8 63)))
           )
        (value_regs lo hi)))

;; Extension from an extraction of i64x2 into i128.
(rule (lower (has_type $I128
                       (sextend (def_inst (extractlane vec @ (value_type $I64X2)
                                                       (u8_from_uimm8 lane))))))
      (let (
            (lo Reg (mov_from_vec (put_in_reg vec)
                                  lane
                                  (VectorSize.Size64x2)))
            (hi Reg (alu_rr_imm_shift (ALUOp.Asr64) lo (imm_shift_from_u8 63)))
           )
        (value_regs lo hi)))

;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(decl orr_not_op (Type) ALUOp)
(rule (orr_not_op (fits_in_32 _ty)) (ALUOp.OrrNot32))
(rule (orr_not_op $I64) (ALUOp.OrrNot64))

;; Base case using `orn` between two registers.
;;
;; Note that bitwise negation is implemented here as
;;
;; NOT rd, rm ==> ORR_NOT rd, zero, rm
(rule (lower (has_type (fits_in_64 ty) (bnot x)))
      (value_reg (alu_rrr (orr_not_op ty) (zero_reg) (put_in_reg x))))

;; Special case to use `AluRRRShift` if it's a `bnot` of a const-left-shifted
;; value.
(rule (lower (has_type (fits_in_64 ty)
                       (bnot (def_inst (ishl x (def_inst (iconst (lshl_from_imm64 <ty amt))))))))
      (value_reg (alu_rrr_shift (orr_not_op ty) (zero_reg) (put_in_reg x) amt)))

;; Implementation of `bnot` for `i128`.
(rule (lower (has_type $I128 (bnot x)))
      (let (
            (x_regs ValueRegs (put_in_regs x))
            (x_lo Reg (value_regs_get x_regs 0))
            (x_hi Reg (value_regs_get x_regs 1))
            (new_lo Reg (alu_rrr (ALUOp.OrrNot64) (zero_reg) x_lo))
            (new_hi Reg (alu_rrr (ALUOp.OrrNot64) (zero_reg) x_hi))
           )
        (value_regs new_lo new_hi)))

;; Implementation of `bnot` for vector types.
(rule (lower (has_type (vec128 ty) (bnot x)))
      (value_reg (vec_misc (VecMisc2.Not) (put_in_reg x) (vector_size ty))))

;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (fits_in_32 ty) (band x y)))
      (value_reg (alu_rs_imm_logic_commutative (ALUOp.And32) ty x y)))

(rule (lower (has_type $I64 (band x y)))
      (value_reg (alu_rs_imm_logic_commutative (ALUOp.And64) $I64 x y)))

(rule (lower (has_type $I128 (band x y))) (i128_alu_bitop (ALUOp.And64) x y))

(rule (lower (has_type (vec128 ty) (band x y)))
      (value_reg (vec_rrr (VecALUOp.And) (put_in_reg x) (put_in_reg y) (vector_size ty))))

;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (fits_in_32 ty) (bor x y)))
      (value_reg (alu_rs_imm_logic_commutative (ALUOp.Orr32) ty x y)))

(rule (lower (has_type $I64 (bor x y)))
      (value_reg (alu_rs_imm_logic_commutative (ALUOp.Orr64) $I64 x y)))

(rule (lower (has_type $I128 (bor x y))) (i128_alu_bitop (ALUOp.Orr64) x y))

(rule (lower (has_type (vec128 ty) (bor x y)))
      (value_reg (vec_rrr (VecALUOp.Orr) (put_in_reg x) (put_in_reg y) (vector_size ty))))

;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (fits_in_32 ty) (bxor x y)))
      (value_reg (alu_rs_imm_logic_commutative (ALUOp.Eor32) ty x y)))

(rule (lower (has_type $I64 (bxor x y)))
      (value_reg (alu_rs_imm_logic_commutative (ALUOp.Eor64) $I64 x y)))

(rule (lower (has_type $I128 (bxor x y))) (i128_alu_bitop (ALUOp.Eor64) x y))

(rule (lower (has_type (vec128 ty) (bxor x y)))
      (value_reg (vec_rrr (VecALUOp.Eor) (put_in_reg x) (put_in_reg y) (vector_size ty))))

;;;; Rules for `band_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (fits_in_32 ty) (band_not x y)))
      (value_reg (alu_rs_imm_logic (ALUOp.AndNot32) ty x y)))

(rule (lower (has_type $I64 (band_not x y)))
      (value_reg (alu_rs_imm_logic (ALUOp.AndNot64) $I64 x y)))

(rule (lower (has_type $I128 (band_not x y))) (i128_alu_bitop (ALUOp.AndNot64) x y))

(rule (lower (has_type (vec128 ty) (band_not x y)))
      (value_reg (vec_rrr (VecALUOp.Bic) (put_in_reg x) (put_in_reg y) (vector_size ty))))

;;;; Rules for `bor_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (fits_in_32 ty) (bor_not x y)))
      (value_reg (alu_rs_imm_logic (ALUOp.OrrNot32) ty x y)))

(rule (lower (has_type $I64 (bor_not x y)))
      (value_reg (alu_rs_imm_logic (ALUOp.OrrNot64) $I64 x y)))

(rule (lower (has_type $I128 (bor_not x y))) (i128_alu_bitop (ALUOp.OrrNot64) x y))

;;;; Rules for `bxor_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (fits_in_32 ty) (bxor_not x y)))
      (value_reg (alu_rs_imm_logic (ALUOp.EorNot32) ty x y)))

(rule (lower (has_type $I64 (bxor_not x y)))
      (value_reg (alu_rs_imm_logic (ALUOp.EorNot64) $I64 x y)))

(rule (lower (has_type $I128 (bxor_not x y))) (i128_alu_bitop (ALUOp.EorNot64) x y))

;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Shift for i8/i16/i32.
(rule (lower (has_type (fits_in_32 ty) (ishl x y)))
      (value_reg (do_shift (ALUOp.Lsl32) ty (put_in_reg x) y)))

;; Shift for i64.
(rule (lower (has_type $I64 (ishl x y)))
      (value_reg (do_shift (ALUOp.Lsl64) $I64 (put_in_reg x) y)))

;; Shift for i128.
(rule (lower (has_type $I128 (ishl x y)))
      (lower_shl128 (put_in_regs x) (value_regs_get (put_in_regs y) 0)))

;; lsl lo_lshift, src_lo, amt
;; lsl hi_lshift, src_hi, amt
;; mvn inv_amt, amt
;; lsr lo_rshift, src_lo, #1
;; lsr lo_rshift, lo_rshift, inv_amt
;; orr maybe_hi, hi_lshift, lo_rshift
;; tst amt, #0x40
;; csel dst_hi, lo_lshift, maybe_hi, ne
;; csel dst_lo, xzr, lo_lshift, ne
(decl lower_shl128 (ValueRegs Reg) ValueRegs)
(rule (lower_shl128 src amt)
      (let (
            (src_lo Reg (value_regs_get src 0))
            (src_hi Reg (value_regs_get src 1))
            (lo_lshift Reg (alu_rrr (ALUOp.Lsl64) src_lo amt))
            (hi_lshift Reg (alu_rrr (ALUOp.Lsl64) src_hi amt))
            (inv_amt Reg (alu_rrr (ALUOp.OrrNot32) (zero_reg) amt))
            (lo_rshift Reg (alu_rrr (ALUOp.Lsr64)
                                    (alu_rr_imm_shift (ALUOp.Lsr64)
                                                      src_lo
                                                      (imm_shift_from_u8 1))
                                    inv_amt))
            (maybe_hi Reg (alu_rrr (ALUOp.Orr64) hi_lshift lo_rshift))
           )
        (with_flags_2
          (tst64_imm amt (u64_into_imm_logic $I64 64))
          (csel (Cond.Ne) (zero_reg) lo_lshift)
          (csel (Cond.Ne) lo_lshift maybe_hi))))

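;; Two illustrative data points for the sequence above (not from the
;; original source): with amt = 1, bit 6 of the amount is clear, so the
;; `csel`s pick dst_lo = src_lo << 1 and
;; dst_hi = (src_hi << 1) | (src_lo >> 63). With amt = 64, the hardware
;; shifts use amt mod 64 = 0, the `tst` result is nonzero, and the
;; `csel`s pick dst_lo = 0 and dst_hi = lo_lshift = src_lo, as required.
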
;; Shift for vector types.
(rule (lower (has_type (vec128 ty) (ishl x y)))
      (let (
            (size VectorSize (vector_size ty))
            (shift Reg (vec_dup (put_in_reg y) size))
           )
        (value_reg (vec_rrr (VecALUOp.Sshl) (put_in_reg x) shift size))))

;; Helper function to emit a shift operation with the specified opcode and
;; output type. The `Reg` provided is shifted by the `Value` given.
;;
;; Note that this automatically handles the clif semantics of masking the
;; shift amount where necessary.
(decl do_shift (ALUOp Type Reg Value) Reg)

;; 8/16-bit shift base case.
;;
;; When shifting by amounts larger than the size of the type, the CLIF shift
;; instructions implement a "wrapping" behaviour, such that `i8 << 8` is
;; equivalent to `i8 << 0`.
;;
;; On i32 and i64 types this matches what the aarch64 spec does, but on
;; smaller types (i16, i8) we need to do this manually, so we wrap the shift
;; amount with an AND instruction.
(rule (do_shift op (fits_in_16 ty) x y)
      (let (
            (shift_amt Reg (value_regs_get (put_in_regs y) 0))
            (masked_shift_amt Reg (alu_rr_imm_logic (ALUOp.And32) shift_amt (shift_mask ty)))
           )
        (alu_rrr op x masked_shift_amt)))

(decl shift_mask (Type) ImmLogic)
(extern constructor shift_mask shift_mask)

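;; For instance (illustrative): an i8 shifted left by 9 masks the amount
;; with 0b111, giving 9 & 7 = 1, so `i8 << 9` behaves like `i8 << 1`,
;; matching the wrapping semantics described above.
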
;; 32/64-bit shift base cases.
(rule (do_shift op $I32 x y) (alu_rrr op x (value_regs_get (put_in_regs y) 0)))
(rule (do_shift op $I64 x y) (alu_rrr op x (value_regs_get (put_in_regs y) 0)))

;; Special case for shifting by a constant value where the value can fit into
;; an `ImmShift`.
;;
;; Note that this rule explicitly has a higher priority than the others
;; to ensure it's attempted first, otherwise the type-based filters on the
;; previous rules seem to take priority over this rule.
(rule 1 (do_shift op ty x (def_inst (iconst (imm_shift_from_imm64 <ty shift))))
      (alu_rr_imm_shift op x shift))

;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Shift for i8/i16/i32.
(rule (lower (has_type (fits_in_32 ty) (ushr x y)))
      (value_reg (do_shift (ALUOp.Lsr32) ty (put_in_reg_zext32 x) y)))

;; Shift for i64.
(rule (lower (has_type $I64 (ushr x y)))
      (value_reg (do_shift (ALUOp.Lsr64) $I64 (put_in_reg_zext64 x) y)))

;; Shift for i128.
(rule (lower (has_type $I128 (ushr x y)))
      (lower_ushr128 (put_in_regs x) (value_regs_get (put_in_regs y) 0)))

;; Vector shifts.
(rule (lower (has_type (vec128 ty) (ushr x y)))
      (let (
            (size VectorSize (vector_size ty))
            (shift Reg (vec_dup (alu_rrr (ALUOp.Sub32) (zero_reg) (put_in_reg y)) size))
           )
        (value_reg (vec_rrr (VecALUOp.Ushl) (put_in_reg x) shift size))))

;; lsr lo_rshift, src_lo, amt
;; lsr hi_rshift, src_hi, amt
;; mvn inv_amt, amt
;; lsl hi_lshift, src_hi, #1
;; lsl hi_lshift, hi_lshift, inv_amt
;; tst amt, #0x40
;; orr maybe_lo, lo_rshift, hi_lshift
;; csel dst_hi, xzr, hi_rshift, ne
;; csel dst_lo, hi_rshift, maybe_lo, ne
(decl lower_ushr128 (ValueRegs Reg) ValueRegs)
(rule (lower_ushr128 src amt)
      (let (
            (src_lo Reg (value_regs_get src 0))
            (src_hi Reg (value_regs_get src 1))
            (lo_rshift Reg (alu_rrr (ALUOp.Lsr64) src_lo amt))
            (hi_rshift Reg (alu_rrr (ALUOp.Lsr64) src_hi amt))

            (inv_amt Reg (alu_rrr (ALUOp.OrrNot32) (zero_reg) amt))
            (hi_lshift Reg (alu_rrr (ALUOp.Lsl64)
                                    (alu_rr_imm_shift (ALUOp.Lsl64)
                                                      src_hi
                                                      (imm_shift_from_u8 1))
                                    inv_amt))
            (maybe_lo Reg (alu_rrr (ALUOp.Orr64) lo_rshift hi_lshift))
           )
        (with_flags_2
          (tst64_imm amt (u64_into_imm_logic $I64 64))
          (csel (Cond.Ne) hi_rshift maybe_lo)
          (csel (Cond.Ne) (zero_reg) hi_rshift))))

;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Shift for i8/i16/i32.
(rule (lower (has_type (fits_in_32 ty) (sshr x y)))
      (value_reg (do_shift (ALUOp.Asr32) ty (put_in_reg_sext32 x) y)))

;; Shift for i64.
(rule (lower (has_type $I64 (sshr x y)))
      (value_reg (do_shift (ALUOp.Asr64) $I64 (put_in_reg_sext64 x) y)))

;; Shift for i128.
(rule (lower (has_type $I128 (sshr x y)))
      (lower_sshr128 (put_in_regs x) (value_regs_get (put_in_regs y) 0)))

;; Vector shifts.
;;
;; Note that right shifts are implemented with a negative left shift.
(rule (lower (has_type (vec128 ty) (sshr x y)))
      (let (
            (size VectorSize (vector_size ty))
            (shift Reg (vec_dup (alu_rrr (ALUOp.Sub32) (zero_reg) (put_in_reg y)) size))
           )
        (value_reg (vec_rrr (VecALUOp.Sshl) (put_in_reg x) shift size))))

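;; As an illustrative note: the element-wise `sshl`/`ushl` instructions
;; interpret each lane's shift amount as a signed value, so duplicating
;; `0 - y` into every lane makes the "left" shift instruction shift each
;; lane right by `y`.
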
;; lsr lo_rshift, src_lo, amt
;; asr hi_rshift, src_hi, amt
;; mvn inv_amt, amt
;; lsl hi_lshift, src_hi, #1
;; lsl hi_lshift, hi_lshift, inv_amt
;; asr hi_sign, src_hi, #63
;; orr maybe_lo, lo_rshift, hi_lshift
;; tst amt, #0x40
;; csel dst_hi, hi_sign, hi_rshift, ne
;; csel dst_lo, hi_rshift, maybe_lo, ne
(decl lower_sshr128 (ValueRegs Reg) ValueRegs)
(rule (lower_sshr128 src amt)
      (let (
            (src_lo Reg (value_regs_get src 0))
            (src_hi Reg (value_regs_get src 1))
            (lo_rshift Reg (alu_rrr (ALUOp.Lsr64) src_lo amt))
            (hi_rshift Reg (alu_rrr (ALUOp.Asr64) src_hi amt))

            (inv_amt Reg (alu_rrr (ALUOp.OrrNot32) (zero_reg) amt))
            (hi_lshift Reg (alu_rrr (ALUOp.Lsl64)
                                    (alu_rr_imm_shift (ALUOp.Lsl64)
                                                      src_hi
                                                      (imm_shift_from_u8 1))
                                    inv_amt))
            (hi_sign Reg (alu_rr_imm_shift (ALUOp.Asr64) src_hi (imm_shift_from_u8 63)))
            (maybe_lo Reg (alu_rrr (ALUOp.Orr64) lo_rshift hi_lshift))
           )
        (with_flags_2
          (tst64_imm amt (u64_into_imm_logic $I64 64))
          (csel (Cond.Ne) hi_rshift maybe_lo)
          (csel (Cond.Ne) hi_sign hi_rshift))))

;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; General 8/16-bit case.
(rule (lower (has_type (fits_in_16 ty) (rotl x y)))
      (let ((neg_shift Reg (alu_rrr (ALUOp.Sub32) (zero_reg) (put_in_reg y))))
        (value_reg (small_rotr ty (put_in_reg_zext32 x) neg_shift))))

;; Specialization for the 8/16-bit case when the rotation amount is an
;; immediate.
(rule (lower (has_type (fits_in_16 ty) (rotl x (def_inst (iconst (imm_shift_from_imm64 <ty n))))))
      (value_reg (small_rotr_imm ty (put_in_reg_zext32 x) (negate_imm_shift ty n))))

;; aarch64 doesn't have a left-rotate instruction, but a left rotation of K
;; places is effectively a right rotation of N - K places, if N is the
;; integer's bit size. We implement left rotations with this trick.
;;
;; Note that when negating the shift amount here, the upper bits are ignored
;; by the rotr instruction, meaning that we'll still rotate by the desired
;; amount.

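;; A concrete example (illustrative): a 32-bit rotate left by 8 becomes
;; `rotr` by neg(8) = 0xFFFF_FFF8. The `rotr` instruction only consumes
;; the low five bits of the amount, i.e. 0b11000 = 24, and a right
;; rotation by 24 is exactly a left rotation by 8.
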
;; General 32-bit case.
(rule (lower (has_type $I32 (rotl x y)))
      (let ((neg_shift Reg (alu_rrr (ALUOp.Sub32) (zero_reg) (put_in_reg y))))
        (value_reg (alu_rrr (ALUOp.RotR32) (put_in_reg x) neg_shift))))

;; General 64-bit case.
(rule (lower (has_type $I64 (rotl x y)))
      (let ((neg_shift Reg (alu_rrr (ALUOp.Sub64) (zero_reg) (put_in_reg y))))
        (value_reg (alu_rrr (ALUOp.RotR64) (put_in_reg x) neg_shift))))

;; Specialization for the 32-bit case when the rotation amount is an immediate.
(rule (lower (has_type $I32 (rotl x (def_inst (iconst (imm_shift_from_imm64 <$I32 n))))))
      (value_reg (alu_rr_imm_shift (ALUOp.RotR32) (put_in_reg x) (negate_imm_shift $I32 n))))

;; Specialization for the 64-bit case when the rotation amount is an immediate.
(rule (lower (has_type $I64 (rotl x (def_inst (iconst (imm_shift_from_imm64 <$I64 n))))))
      (value_reg (alu_rr_imm_shift (ALUOp.RotR64) (put_in_reg x) (negate_imm_shift $I64 n))))

(decl negate_imm_shift (Type ImmShift) ImmShift)
(extern constructor negate_imm_shift negate_imm_shift)

;; General 128-bit case.
;;
;; TODO: much better codegen is possible with a constant amount.
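;;
;; The lowering below is an instance of the usual identity (stated here
;; for illustration): rotl(x, amt) == (x << amt) | (x >> (128 - amt)),
;; with both 128-bit shifts reusing the helpers defined above.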
(rule (lower (has_type $I128 (rotl x y)))
      (let (
            (val ValueRegs (put_in_regs x))
            (amt Reg (value_regs_get (put_in_regs y) 0))
            (neg_amt Reg (alu_rrr (ALUOp.Sub64) (imm $I64 128) amt))
            (lshift ValueRegs (lower_shl128 val amt))
            (rshift ValueRegs (lower_ushr128 val neg_amt))
           )
        (value_regs
          (alu_rrr (ALUOp.Orr64) (value_regs_get lshift 0) (value_regs_get rshift 0))
          (alu_rrr (ALUOp.Orr64) (value_regs_get lshift 1) (value_regs_get rshift 1)))))

;;;; Rules for `rotr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; General 8/16-bit case.
(rule (lower (has_type (fits_in_16 ty) (rotr x y)))
      (value_reg (small_rotr ty (put_in_reg_zext32 x) (put_in_reg y))))

;; General 32-bit case.
(rule (lower (has_type $I32 (rotr x y)))
      (value_reg (alu_rrr (ALUOp.RotR32) (put_in_reg x) (put_in_reg y))))

;; General 64-bit case.
(rule (lower (has_type $I64 (rotr x y)))
      (value_reg (alu_rrr (ALUOp.RotR64) (put_in_reg x) (put_in_reg y))))

;; Specialization for the 8/16-bit case when the rotation amount is an
;; immediate.
(rule (lower (has_type (fits_in_16 ty) (rotr x (def_inst (iconst (imm_shift_from_imm64 <ty n))))))
      (value_reg (small_rotr_imm ty (put_in_reg_zext32 x) n)))

;; Specialization for the 32-bit case when the rotation amount is an immediate.
(rule (lower (has_type $I32 (rotr x (def_inst (iconst (imm_shift_from_imm64 <$I32 n))))))
      (value_reg (alu_rr_imm_shift (ALUOp.RotR32) (put_in_reg x) n)))

;; Specialization for the 64-bit case when the rotation amount is an immediate.
(rule (lower (has_type $I64 (rotr x (def_inst (iconst (imm_shift_from_imm64 <$I64 n))))))
      (value_reg (alu_rr_imm_shift (ALUOp.RotR64) (put_in_reg x) n)))

;; For a < 32-bit rotate-right, we synthesize this as:
;;
;; rotr rd, val, amt
;;
;; =>
;;
;; and masked_amt, amt, <bitwidth - 1>
;; sub tmp_sub, masked_amt, <bitwidth>
;; sub neg_amt, zero, tmp_sub ; neg
;; lsr val_rshift, val, masked_amt
;; lsl val_lshift, val, neg_amt
;; orr rd, val_lshift, val_rshift
(decl small_rotr (Type Reg Reg) Reg)
(rule (small_rotr ty val amt)
      (let (
            (masked_amt Reg (alu_rr_imm_logic (ALUOp.And32) amt (rotr_mask ty)))
            (tmp_sub Reg (alu_rr_imm12 (ALUOp.Sub32) masked_amt (u8_into_imm12 (ty_bits ty))))
            (neg_amt Reg (alu_rrr (ALUOp.Sub32) (zero_reg) tmp_sub))
            (val_rshift Reg (alu_rrr (ALUOp.Lsr32) val masked_amt))
            (val_lshift Reg (alu_rrr (ALUOp.Lsl32) val neg_amt))
           )
        (alu_rrr (ALUOp.Orr32) val_lshift val_rshift)))

(decl rotr_mask (Type) ImmLogic)
(extern constructor rotr_mask rotr_mask)

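;; A worked example of the sequence above (illustrative only): for an i8
;; rotated right by 11, masked_amt = 11 & 7 = 3, tmp_sub = 3 - 8 = -5,
;; and neg_amt = 5, so the result is (val >> 3) | (val << 5), which is
;; precisely an 8-bit rotate right by 3.
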
;; For a constant amount, we can instead do:
;;
;; rotr rd, val, #amt
;;
;; =>
;;
;; lsr val_rshift, val, #<amt>
;; lsl val_lshift, val, <bitwidth - amt>
;; orr rd, val_lshift, val_rshift
(decl small_rotr_imm (Type Reg ImmShift) Reg)
(rule (small_rotr_imm ty val amt)
      (let (
            (val_rshift Reg (alu_rr_imm_shift (ALUOp.Lsr32) val amt))
            (val_lshift Reg (alu_rr_imm_shift (ALUOp.Lsl32) val (rotr_opposite_amount ty amt)))
           )
        (alu_rrr (ALUOp.Orr32) val_lshift val_rshift)))

(decl rotr_opposite_amount (Type ImmShift) ImmShift)
(extern constructor rotr_opposite_amount rotr_opposite_amount)

;; General 128-bit case.
;;
;; TODO: much better codegen is possible with a constant amount.
(rule (lower (has_type $I128 (rotr x y)))
      (let (
            (val ValueRegs (put_in_regs x))
            (amt Reg (value_regs_get (put_in_regs y) 0))
            (neg_amt Reg (alu_rrr (ALUOp.Sub64) (imm $I64 128) amt))
            (rshift ValueRegs (lower_ushr128 val amt))
            (lshift ValueRegs (lower_shl128 val neg_amt))
            (hi Reg (alu_rrr (ALUOp.Orr64) (value_regs_get rshift 1) (value_regs_get lshift 1)))
            (lo Reg (alu_rrr (ALUOp.Orr64) (value_regs_get rshift 0) (value_regs_get lshift 0)))
           )
        (value_regs lo hi)))