aarch64: fix up regalloc2 semantics. (#4830)

This PR removes all uses of modify-operands in the aarch64 backend,
replacing them with reused-input operands instead. This has the nice
effect of removing a bunch of move instructions and more clearly
representing inputs and outputs.

This PR also removes the explicit use of pinned vregs in the aarch64
backend, instead using fixed-register constraints on the operands when
insts or pseudo-inst sequences require certain registers.

This is the second PR in the regalloc-semantics cleanup series; after
the remaining backend (s390x) and the ABI code are cleaned up as well,
we'll be able to simplify the regalloc2 frontend.
This commit is contained in:
Chris Fallin
2022-09-01 14:25:20 -07:00
committed by GitHub
parent ac2d4c4818
commit ae5fe8a728
25 changed files with 1098 additions and 886 deletions

View File

@@ -171,13 +171,23 @@
(rd WritableReg)
(rm PReg))
;; A MOV[Z,N,K] with a 16-bit immediate.
;; A MOV[Z,N] with a 16-bit immediate.
(MovWide
(op MoveWideOp)
(rd WritableReg)
(imm MoveWideConst)
(size OperandSize))
;; A MOVK with a 16-bit immediate. Modifies its register; we
;; model this with a separate input `rn` and output `rd` virtual
;; register, with a regalloc constraint to tie them together.
(MovK
;; Output: the register value after the instruction.
(rd WritableReg)
;; Input: the register value before modification; tied to `rd`.
(rn Reg)
;; The 16-bit immediate.
(imm MoveWideConst)
;; Operand size.
(size OperandSize))
;; A sign- or zero-extend operation.
(Extend
(rd WritableReg)
@@ -240,7 +250,12 @@
;; x28 (wr) scratch reg; value afterwards has no meaning
(AtomicRMWLoop
(ty Type) ;; I8, I16, I32 or I64
(op AtomicRMWLoopOp))
(op AtomicRMWLoopOp)
(addr Reg)
(operand Reg)
(oldval WritableReg)
(scratch1 WritableReg)
(scratch2 WritableReg))
;; Similar to AtomicRMWLoop, a compare-and-swap operation implemented using a load-linked
;; store-conditional loop, with acquire-release semantics.
@@ -253,7 +268,11 @@
;; x24 (wr) scratch reg; value afterwards has no meaning
(AtomicCASLoop
(ty Type) ;; I8, I16, I32 or I64
)
(addr Reg)
(expected Reg)
(replacement Reg)
(oldval WritableReg)
(scratch WritableReg))
;; An atomic read-modify-write operation. These instructions require the
;; Large System Extension (LSE) ISA support (FEAT_LSE). The instructions have
@@ -269,7 +288,10 @@
;; Large System Extension (LSE) ISA support (FEAT_LSE). The instructions have
;; acquire-release semantics.
(AtomicCAS
(rs WritableReg)
;; `rd` is really `rs` in the encoded instruction (so `rd` == `rs`); we separate
;; them here to have separate use and def vregs for regalloc.
(rd WritableReg)
(rs Reg)
(rt Reg)
(rn Reg)
(ty Type))
@@ -342,6 +364,16 @@
(rd WritableReg)
(rn Reg))
;; Variant of FpuRRI that modifies its `rd`, and so we name the
;; input state `ri` (for "input") and constrain the two
;; together.
(FpuRRIMod
;; Which modifying operation to perform.
(fpu_op FPUOpRIMod)
;; Output: the modified register.
(rd WritableReg)
;; Input: prior state of the register being modified; constrained to `rd`.
(ri Reg)
;; The remaining register source operand.
(rn Reg))
;; 3-op FPU instruction.
;; 16-bit scalars require half-precision floating-point support (FEAT_FP16).
(FpuRRRR
@@ -479,6 +511,7 @@
;; Move to a vector element from a GPR.
(MovToVec
(rd WritableReg)
(ri Reg)
(rn Reg)
(idx u8)
(size VectorSize))
@@ -534,6 +567,7 @@
;; Move vector element to another vector element.
(VecMovElement
(rd WritableReg)
(ri Reg)
(rn Reg)
(dest_idx u8)
(src_idx u8)
@@ -546,12 +580,19 @@
(rn Reg)
(high_half bool))
;; Vector narrowing operation.
(VecRRNarrow
;; Vector narrowing operation -- low half.
(VecRRNarrowLow
(op VecRRNarrowOp)
(rd WritableReg)
(rn Reg)
(high_half bool)
(lane_size ScalarSize))
;; Vector narrowing operation -- high half.
;; The `vec_rr_narrow_high` helper passes the vector being updated as `ri`
;; and the vector to narrow as `rn`.
;; NOTE(review): `ri` is presumably constrained to the same allocation as
;; `rd`, as for the other variants with an `ri` field -- confirm in the
;; operand-collection code.
(VecRRNarrowHigh
(op VecRRNarrowOp)
(rd WritableReg)
(ri Reg)
(rn Reg)
(lane_size ScalarSize))
;; 1-operand vector instruction that operates on a pair of elements.
@@ -569,6 +610,17 @@
(rm Reg)
(high_half bool))
;; 2-operand vector instruction that produces a result with
;; twice the lane width and half the number of lanes. Variant
;; that modifies `rd` (so takes its initial state as `ri`).
(VecRRRLongMod
;; Which modifying long operation to perform.
(alu_op VecRRRLongModOp)
;; Output: the modified register.
(rd WritableReg)
;; Input: initial state of `rd`.
(ri Reg)
;; The two vector source operands.
(rn Reg)
(rm Reg)
(high_half bool))
;; 1-operand vector instruction that extends elements of the input
;; register and operates on a pair of elements. The output lane width
;; is double that of the input.
@@ -589,6 +641,7 @@
(VecRRRMod
(alu_op VecALUModOp)
(rd WritableReg)
(ri Reg)
(rn Reg)
(rm Reg)
(size VectorSize))
@@ -623,6 +676,7 @@
(VecShiftImmMod
(op VecShiftImmModOp)
(rd WritableReg)
(ri Reg)
(rn Reg)
(size VectorSize)
(imm u8))
@@ -635,29 +689,55 @@
(rm Reg)
(imm4 u8))
;; Table vector lookup - single register table. The table consists of 8-bit elements and is
;; stored in `rn`, while `rm` contains 8-bit element indices. `is_extension` specifies whether
;; to emit a TBX or a TBL instruction, i.e. whether to leave the elements in the destination
;; vector that correspond to out-of-range indices (greater than 15) unmodified or to set them
;; to 0.
;; Table vector lookup - single register table. The table
;; consists of 8-bit elements and is stored in `rn`, while `rm`
;; contains 8-bit element indices. This variant emits `TBL`,
;; which sets elements that correspond to out-of-range indices
;; (greater than 15) to 0.
(VecTbl
(rd WritableReg)
(rn Reg)
(rm Reg)
(is_extension bool))
(rm Reg))
;; Table vector lookup - two register table. The table consists of 8-bit elements and is
;; stored in `rn` and `rn2`, while `rm` contains 8-bit element indices. `is_extension`
;; specifies whether to emit a TBX or a TBL instruction, i.e. whether to leave the elements in
;; the destination vector that correspond to out-of-range indices (greater than 31) unmodified
;; or to set them to 0. The table registers `rn` and `rn2` must have consecutive numbers
;; modulo 32, that is v31 and v0 (in that order) are consecutive registers.
;; Table vector lookup - single register table. The table
;; consists of 8-bit elements and is stored in `rn`, while `rm`
;; contains 8-bit element indices. This variant emits `TBX`,
;; which leaves elements that correspond to out-of-range indices
;; (greater than 15) unmodified. Hence, it takes an input vreg in
;; `ri` that is constrained to the same allocation as `rd`.
(VecTblExt
;; Output: the lookup result.
(rd WritableReg)
;; Input: initial contents of the result; kept for out-of-range indices.
(ri Reg)
;; The table.
(rn Reg)
;; The element indices.
(rm Reg))
;; Table vector lookup - two register table. The table consists
;; of 8-bit elements and is stored in `rn` and `rn2`, while
;; `rm` contains 8-bit element indices. The table registers
;; `rn` and `rn2` must have consecutive numbers modulo 32, that
;; is v31 and v0 (in that order) are consecutive registers.
;; This variant emits `TBL`, which sets out-of-range results to
;; 0.
(VecTbl2
(rd WritableReg)
(rn Reg)
(rn2 Reg)
(rm Reg)
(is_extension bool))
(rm Reg))
;; Table vector lookup - two register table. The table consists
;; of 8-bit elements and is stored in `rn` and `rn2`, while
;; `rm` contains 8-bit element indices. The table registers
;; `rn` and `rn2` must have consecutive numbers modulo 32, that
;; is v31 and v0 (in that order) are consecutive registers.
;; This variant emits `TBX`, which leaves out-of-range results
;; unmodified, hence takes the initial state of the result
;; register in vreg `ri`.
(VecTbl2Ext
;; Output: the lookup result.
(rd WritableReg)
;; Input: initial state of the result register.
(ri Reg)
;; First and second table registers (must be consecutive, see above).
(rn Reg)
(rn2 Reg)
;; The element indices.
(rm Reg))
;; Load an element and replicate to all lanes of a vector.
(VecLoadReplicate
@@ -888,7 +968,6 @@
(enum
(MovZ)
(MovN)
(MovK)
))
(type UImm5 (primitive UImm5))
@@ -934,6 +1013,7 @@
(type AMode extern (enum))
(type PairAMode extern (enum))
(type FPUOpRI extern (enum))
(type FPUOpRIMod extern (enum))
(type OperandSize extern
(enum Size32
@@ -1287,6 +1367,10 @@
(Umull8)
(Umull16)
(Umull32)
))
(type VecRRRLongModOp
(enum
;; Unsigned multiply add long
(Umlal8)
(Umlal16)
@@ -1447,9 +1531,9 @@
(decl fpu_op_ri_ushr (u8 u8) FPUOpRI)
(extern constructor fpu_op_ri_ushr fpu_op_ri_ushr)
;; Constructs an FPUOpRI.Sli* given the size in bits of the value (or lane)
;; Constructs an FPUOpRIMod.Sli* given the size in bits of the value (or lane)
;; and the amount to shift by.
(decl fpu_op_ri_sli (u8 u8) FPUOpRI)
(decl fpu_op_ri_sli (u8 u8) FPUOpRIMod)
(extern constructor fpu_op_ri_sli fpu_op_ri_sli)
(decl imm12_from_negated_u64 (Imm12) u64)
@@ -1524,29 +1608,6 @@
(decl writable_zero_reg () WritableReg)
(extern constructor writable_zero_reg writable_zero_reg)
;; Helpers for getting a particular real register
(decl xreg (u8) Reg)
(extern constructor xreg xreg)
(decl writable_vreg (u8) WritableReg)
(extern constructor writable_vreg writable_vreg)
(decl writable_xreg (u8) WritableReg)
(extern constructor writable_xreg writable_xreg)
;; Helper for emitting `MInst.Mov64` instructions.
(decl mov64_to_real (u8 Reg) Reg)
(rule (mov64_to_real num src)
(let ((dst WritableReg (writable_xreg num))
(_ Unit (emit (MInst.Mov (operand_size $I64) dst src))))
dst))
(decl mov64_from_real (u8) Reg)
(rule (mov64_from_real num)
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (MInst.Mov (operand_size $I64) dst (xreg num)))))
dst))
;; Helper for emitting `MInst.MovZ` instructions.
(decl movz (MoveWideConst OperandSize) Reg)
(rule (movz imm size)
@@ -1601,8 +1662,7 @@
(decl vec_rrr_mod (VecALUModOp Reg Reg Reg VectorSize) Reg)
(rule (vec_rrr_mod op src1 src2 src3 size)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_1 Unit (emit (MInst.FpuMove128 dst src1)))
(_2 Unit (emit (MInst.VecRRRMod op dst src2 src3 size))))
(_1 Unit (emit (MInst.VecRRRMod op dst src1 src2 src3 size))))
dst))
(decl fpu_rri (FPUOpRI Reg) Reg)
@@ -1611,6 +1671,12 @@
(_ Unit (emit (MInst.FpuRRI op dst src))))
dst))
;; Helper for emitting `MInst.FpuRRIMod` instructions.
;;
;; Allocates a fresh `$F64` temporary for the result. The second argument
;; is passed as the instruction's `ri` operand and the third as `rn`.
(decl fpu_rri_mod (FPUOpRIMod Reg Reg) Reg)
(rule (fpu_rri_mod fpu_op initial rn)
      (let ((result WritableReg (temp_writable_reg $F64))
            (_ Unit (emit (MInst.FpuRRIMod fpu_op result initial rn))))
        result))
;; Helper for emitting `MInst.FpuRRR` instructions.
(decl fpu_rrr (FPUOp2 Reg Reg ScalarSize) Reg)
(rule (fpu_rrr op src1 src2 size)
@@ -1790,29 +1856,33 @@
dst))
;; Helper for emitting `MInst.VecTbl` instructions.
(decl vec_tbl (Reg Reg bool) Reg)
(rule (vec_tbl rn rm is_extension)
(decl vec_tbl (Reg Reg) Reg)
(rule (vec_tbl rn rm)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.VecTbl dst rn rm is_extension))))
(_ Unit (emit (MInst.VecTbl dst rn rm))))
dst))
;; Helper for emitting `MInst.VecTblExt` instructions.
;;
;; Arguments, in order: the initial state of the result (`ri`), the table
;; (`rn`) and the indices (`rm`). The result goes to a fresh `$I8X16`
;; temporary.
(decl vec_tbl_ext (Reg Reg Reg) Reg)
(rule (vec_tbl_ext initial table indices)
      (let ((result WritableReg (temp_writable_reg $I8X16))
            (_ Unit (emit (MInst.VecTblExt result initial table indices))))
        result))
;; Helper for emitting `MInst.VecTbl2` instructions.
;; - 2 register table vector lookups require consecutive table registers;
;; we satisfy this constraint by hardcoding the usage of v30 and v31.
;; - Make sure that both args are in virtual regs, since it is not guaranteed
;; that we can get them safely to the temporaries if either is in a real
;; register.
(decl vec_tbl2 (Reg Reg Reg bool Type) Reg)
(rule (vec_tbl2 rn rn2 rm is_extension ty)
(decl vec_tbl2 (Reg Reg Reg Type) Reg)
(rule (vec_tbl2 rn rn2 rm ty)
(let (
(temp WritableReg (writable_vreg 30))
(temp2 WritableReg (writable_vreg 31))
(dst WritableReg (temp_writable_reg $I8X16))
(rn Reg (ensure_in_vreg rn ty))
(rn2 Reg (ensure_in_vreg rn2 ty))
(_ Unit (emit (MInst.FpuMove128 temp rn)))
(_ Unit (emit (MInst.FpuMove128 temp2 rn2)))
(_ Unit (emit (MInst.VecTbl2 dst temp temp2 rm is_extension)))
(_ Unit (emit (MInst.VecTbl2 dst rn rn2 rm)))
)
dst))
;; Helper for emitting `MInst.VecTbl2Ext` instructions.
;;
;; Arguments, in order: the initial state of the result (`ri`), the two
;; table registers (`rn`, `rn2`), the indices (`rm`) and a type. The type
;; argument is not used by this rule. The result goes to a fresh `$I8X16`
;; temporary.
(decl vec_tbl2_ext (Reg Reg Reg Reg Type) Reg)
(rule (vec_tbl2_ext initial table_lo table_hi indices ty)
      (let ((result WritableReg (temp_writable_reg $I8X16))
            (_ Unit (emit (MInst.VecTbl2Ext result initial table_lo table_hi indices))))
        result))
@@ -1830,22 +1900,18 @@
(_ Unit (emit (MInst.VecRRPairLong op dst src))))
dst))
;; Helper for emitting `MInst.VecRRRLong` instructions, but for variants
;; where the operation both reads and modifies the destination register.
;;
;; Currently this is only used for `VecRRRLongOp.Umlal*`
(decl vec_rrrr_long (VecRRRLongOp Reg Reg Reg bool) Reg)
;; Helper for emitting `MInst.VecRRRLongMod` instructions.
(decl vec_rrrr_long (VecRRRLongModOp Reg Reg Reg bool) Reg)
(rule (vec_rrrr_long op src1 src2 src3 high_half)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.FpuMove128 dst src1)))
(_ Unit (emit (MInst.VecRRRLong op dst src2 src3 high_half))))
(_ Unit (emit (MInst.VecRRRLongMod op dst src1 src2 src3 high_half))))
dst))
;; Helper for emitting `MInst.VecRRNarrowLow` instructions.
(decl vec_rr_narrow (VecRRNarrowOp Reg ScalarSize) Reg)
(rule (vec_rr_narrow op src size)
(decl vec_rr_narrow_low (VecRRNarrowOp Reg ScalarSize) Reg)
(rule (vec_rr_narrow_low op src size)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.VecRRNarrow op dst src $false size))))
(_ Unit (emit (MInst.VecRRNarrowLow op dst src size))))
dst))
;; Helper for emitting `MInst.VecRRNarrow` instructions which update the
@@ -1853,8 +1919,7 @@
(decl vec_rr_narrow_high (VecRRNarrowOp Reg Reg ScalarSize) Reg)
(rule (vec_rr_narrow_high op mod src size)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.FpuMove128 dst mod)))
(_ Unit (emit (MInst.VecRRNarrow op dst src $true size))))
(_ Unit (emit (MInst.VecRRNarrowHigh op dst mod src size))))
dst))
;; Helper for emitting `MInst.VecRRLong` instructions.
@@ -1897,16 +1962,14 @@
(decl mov_to_vec (Reg Reg u8 VectorSize) Reg)
(rule (mov_to_vec src1 src2 lane size)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.FpuMove128 dst src1)))
(_ Unit (emit (MInst.MovToVec dst src2 lane size))))
(_ Unit (emit (MInst.MovToVec dst src1 src2 lane size))))
dst))
;; Helper for emitting `MInst.VecMovElement` instructions.
(decl mov_vec_elem (Reg Reg u8 u8 VectorSize) Reg)
(rule (mov_vec_elem src1 src2 dst_idx src_idx size)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.FpuMove128 dst src1)))
(_ Unit (emit (MInst.VecMovElement dst src2 dst_idx src_idx size))))
(_ Unit (emit (MInst.VecMovElement dst src1 src2 dst_idx src_idx size))))
dst))
;; Helper for emitting `MInst.MovFromVec` instructions.
@@ -2104,15 +2167,15 @@
;; Helper for generating `xtn` instructions.
(decl xtn (Reg ScalarSize) Reg)
(rule (xtn x size) (vec_rr_narrow (VecRRNarrowOp.Xtn) x size))
(rule (xtn x size) (vec_rr_narrow_low (VecRRNarrowOp.Xtn) x size))
;; Helper for generating `fcvtn` instructions.
(decl fcvtn (Reg ScalarSize) Reg)
(rule (fcvtn x size) (vec_rr_narrow (VecRRNarrowOp.Fcvtn) x size))
(rule (fcvtn x size) (vec_rr_narrow_low (VecRRNarrowOp.Fcvtn) x size))
;; Helper for generating `sqxtn` instructions.
(decl sqxtn (Reg ScalarSize) Reg)
(rule (sqxtn x size) (vec_rr_narrow (VecRRNarrowOp.Sqxtn) x size))
(rule (sqxtn x size) (vec_rr_narrow_low (VecRRNarrowOp.Sqxtn) x size))
;; Helper for generating `sqxtn2` instructions.
(decl sqxtn2 (Reg Reg ScalarSize) Reg)
@@ -2120,7 +2183,7 @@
;; Helper for generating `sqxtun` instructions.
(decl sqxtun (Reg ScalarSize) Reg)
(rule (sqxtun x size) (vec_rr_narrow (VecRRNarrowOp.Sqxtun) x size))
(rule (sqxtun x size) (vec_rr_narrow_low (VecRRNarrowOp.Sqxtun) x size))
;; Helper for generating `sqxtun2` instructions.
(decl sqxtun2 (Reg Reg ScalarSize) Reg)
@@ -2128,7 +2191,7 @@
;; Helper for generating `uqxtn` instructions.
(decl uqxtn (Reg ScalarSize) Reg)
(rule (uqxtn x size) (vec_rr_narrow (VecRRNarrowOp.Uqxtn) x size))
(rule (uqxtn x size) (vec_rr_narrow_low (VecRRNarrowOp.Uqxtn) x size))
;; Helper for generating `uqxtn2` instructions.
(decl uqxtn2 (Reg Reg ScalarSize) Reg)
@@ -2187,7 +2250,7 @@
;; Helper for generating `umlal32` instructions.
(decl umlal32 (Reg Reg Reg bool) Reg)
(rule (umlal32 x y z high_half) (vec_rrrr_long (VecRRRLongOp.Umlal32) x y z high_half))
(rule (umlal32 x y z high_half) (vec_rrrr_long (VecRRRLongModOp.Umlal32) x y z high_half))
;; Helper for generating `smull8` instructions.
(decl smull8 (Reg Reg bool) Reg)
@@ -2719,8 +2782,7 @@
(rule (lse_atomic_cas addr expect replace ty)
(let (
(dst WritableReg (temp_writable_reg ty))
(_ Unit (emit (MInst.Mov (operand_size ty) dst expect)))
(_ Unit (emit (MInst.AtomicCAS dst replace addr ty)))
(_ Unit (emit (MInst.AtomicCAS dst expect replace addr ty)))
)
dst))
@@ -2730,16 +2792,13 @@
;; regs, and that's not guaranteed safe if either is in a real reg.
;; - Move the args to the preordained AtomicRMW input regs
;; - And finally, copy the preordained AtomicRMW output reg to its destination.
(decl atomic_rmw_loop (AtomicRMWLoopOp Value Value Type) Reg)
(rule (atomic_rmw_loop op p arg2 ty)
(let (
(v_addr Reg (ensure_in_vreg p $I64))
(v_arg2 Reg (ensure_in_vreg arg2 $I64))
(r_addr Reg (mov64_to_real 25 v_addr))
(r_arg2 Reg (mov64_to_real 26 v_arg2))
(_ Unit (emit (MInst.AtomicRMWLoop ty op)))
)
(mov64_from_real 27)))
;; Emits an `MInst.AtomicRMWLoop` using fresh `$I64` temporaries for the
;; old-value result and the two scratch registers, and returns the
;; old-value register.
(decl atomic_rmw_loop (AtomicRMWLoopOp Reg Reg Type) Reg)
(rule (atomic_rmw_loop rmw_op address value ty)
      (let ((old WritableReg (temp_writable_reg $I64))
            (tmp_a WritableReg (temp_writable_reg $I64))
            (tmp_b WritableReg (temp_writable_reg $I64))
            (_ Unit (emit (MInst.AtomicRMWLoop ty rmw_op address value old tmp_a tmp_b))))
        old))
;; Helper for emitting `MInst.AtomicCASLoop` instructions.
;; This is very similar to, but not identical to, the AtomicRmw case. Note
@@ -2749,21 +2808,10 @@
;; for `atomic_rmw_loop` above.
(decl atomic_cas_loop (Reg Reg Reg Type) Reg)
(rule (atomic_cas_loop addr expect replace ty)
(let (
(v_addr Reg (ensure_in_vreg addr $I64))
(v_exp Reg (ensure_in_vreg expect $I64))
(v_rep Reg (ensure_in_vreg replace $I64))
;; Move the args to the preordained AtomicCASLoop input regs
(r_addr Reg (mov64_to_real 25 v_addr))
(r_exp Reg (mov64_to_real 26 v_exp))
(r_rep Reg (mov64_to_real 28 v_rep))
;; Now the AtomicCASLoop itself, implemented in the normal way, with a
;; load-exclusive, store-exclusive loop
(_ Unit (emit (MInst.AtomicCASLoop ty)))
)
;; And finally, copy the preordained AtomicCASLoop output reg to its destination.
;; Also, x24 and x28 are trashed.
(mov64_from_real 27)))
(let ((dst WritableReg (temp_writable_reg $I64))
(scratch WritableReg (temp_writable_reg $I64))
(_ Unit (emit (MInst.AtomicCASLoop ty addr expect replace dst scratch))))
dst))
;; Helper for emitting `MInst.MovPReg` instructions.
(decl mov_preg (PReg) Reg)
@@ -2811,15 +2859,13 @@
(decl fcopy_sign (Reg Reg Type) Reg)
(rule (fcopy_sign x y (ty_scalar_float ty))
(let ((dst WritableReg (temp_writable_reg $F64))
(_ Unit (emit (MInst.FpuMove64 dst x)))
(tmp Reg (fpu_rri (fpu_op_ri_ushr (ty_bits ty) (max_shift ty)) y))
(_ Unit (emit (MInst.FpuRRI (fpu_op_ri_sli (ty_bits ty) (max_shift ty)) dst tmp))))
(_ Unit (emit (MInst.FpuRRIMod (fpu_op_ri_sli (ty_bits ty) (max_shift ty)) dst x tmp))))
dst))
(rule (fcopy_sign x y ty @ (multi_lane _ _))
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.FpuMove128 dst x)))
(tmp Reg (vec_shift_imm (VecShiftImmOp.Ushr) (max_shift (lane_type ty)) y (vector_size ty)))
(_ Unit (emit (MInst.VecShiftImmMod (VecShiftImmModOp.Sli) dst tmp (vector_size ty) (max_shift (lane_type ty))))))
(_ Unit (emit (MInst.VecShiftImmMod (VecShiftImmModOp.Sli) dst x tmp (vector_size ty) (max_shift (lane_type ty))))))
dst))
;; Helpers for generating `MInst.FpuToInt` instructions.