From d617d5e0f3664b6129ed0aba96d18e411f931678 Mon Sep 17 00:00:00 2001 From: Jakob Stoklund Olesen Date: Wed, 13 Dec 2017 09:22:44 -0600 Subject: [PATCH] Use a domtree pre-order instead of a CFG RPO for coalescing. The stack implementation if the Budimlic dominator forest doesn't work correctly with a CFG RPO. It needs the domtree pre-order. Also handle EBB pre-order vs inst-level preorder. Manage the stack according to EBB dominance. Look for a dominating value by searching the stack. This is different from the Budimlic algorithm because we're computing the dominator tree pre-order with EBB granularity only. Fixes #207. --- .../filetests/regalloc/coalescing-207.cton | 1263 +++++++++++++++++ lib/cretonne/src/regalloc/coalescing.rs | 113 +- 2 files changed, 1343 insertions(+), 33 deletions(-) create mode 100644 cranelift/filetests/regalloc/coalescing-207.cton diff --git a/cranelift/filetests/regalloc/coalescing-207.cton b/cranelift/filetests/regalloc/coalescing-207.cton new file mode 100644 index 0000000000..9485c89f36 --- /dev/null +++ b/cranelift/filetests/regalloc/coalescing-207.cton @@ -0,0 +1,1263 @@ +test regalloc +set is_64bit +isa intel haswell + +; Reported as https://github.com/stoklund/cretonne/issues/207 +; +; The coalescer creates a virtual register with two interfering values. +function %pr207(i64 vmctx, i32, i32) -> i32 native { + gv0 = vmctx-8 + heap0 = static gv0, min 0, bound 0x5000, guard 0x0040_0000 + sig0 = (i64 vmctx, i32, i32) -> i32 native + sig1 = (i64 vmctx, i32, i32, i32) -> i32 native + sig2 = (i64 vmctx, i32, i32, i32) -> i32 native + fn0 = sig0 u0:2 + fn1 = sig1 u0:0 + fn2 = sig2 u0:1 + +ebb0(v0: i64, v1: i32, v2: i32): + v3 = iconst.i32 0 + v4 = iconst.i32 0 + v5 = iconst.i32 0 + v6 = iconst.i32 0x4ffe + v7 = icmp uge v5, v6 + brz v7, ebb1 + trap heap_oob + +ebb1: + v8 = uextend.i64 v5 + v9 = iadd_imm.i64 v0, -8 + v10 = load.i64 v9 + v11 = iadd v10, v8 + v12 = load.i32 v11+4 + v13 = iconst.i32 1056 + v14 = isub v12, v13 + v15 = iconst.i32 0x4ffe + v16 = icmp.i32 uge v4, v15 + brz v16, ebb2 + trap heap_oob + +ebb2: + v17 = uextend.i64 v4 + v18 = iadd_imm.i64 v0, -8 + v19 = load.i64 v18 + v20 = iadd v19, v17 + store.i32 v14, v20+4 + v21 = iconst.i32 0x4ffe + v22 = icmp.i32 uge v2, v21 + brz v22, ebb3 + trap heap_oob + +ebb3: + v23 = uextend.i64 v2 + v24 = iadd_imm.i64 v0, -8 + v25 = load.i64 v24 + v26 = iadd v25, v23 + v27 = sload8.i32 v26 + v28 = iconst.i32 255 + v29 = band v27, v28 + v30 = iconst.i32 0 + v31 = icmp eq v29, v30 + v32 = bint.i32 v31 + brnz v32, ebb90(v14, v1) + v33 = call fn0(v0, v1, v27) + v34 = iconst.i32 0 + v35 = iconst.i32 0 + v36 = icmp eq v33, v35 + v37 = bint.i32 v36 + brnz v37, ebb90(v14, v34) + v38 = iconst.i32 0x4ffe + v39 = icmp.i32 uge v2, v38 + brz v39, ebb4 + trap heap_oob + +ebb4: + v40 = uextend.i64 v2 + v41 = iadd_imm.i64 v0, -8 + v42 = load.i64 v41 + v43 = iadd v42, v40 + v44 = uload8.i32 v43+1 + v45 = iconst.i32 0 + v46 = icmp eq v44, v45 + v47 = bint.i32 v46 + brnz v47, ebb56(v33, v14) + v48 = iconst.i32 0x4ffe + v49 = icmp.i32 uge v33, v48 + brz v49, ebb5 + trap heap_oob + +ebb5: + v50 = uextend.i64 v33 + v51 = iadd_imm.i64 v0, -8 + v52 = load.i64 v51 + v53 = iadd v52, v50 + v54 = uload8.i32 v53+1 + v55 = iconst.i32 0 + v56 = icmp eq v54, v55 + v57 = bint.i32 v56 + brnz v57, ebb90(v14, v34) + v58 = iconst.i32 0x4ffe + v59 = icmp.i32 uge v2, v58 + brz v59, ebb6 + trap heap_oob + +ebb6: + v60 = uextend.i64 v2 + v61 = iadd_imm.i64 v0, -8 + v62 = load.i64 v61 + v63 = iadd v62, v60 + v64 = uload8.i32 v63+2 + v65 = iconst.i32 0 + v66 = icmp eq v64, v65 + v67 = bint.i32 v66 + brnz v67, ebb42 + v68 = iconst.i32 0x4ffe + v69 = icmp.i32 uge v33, v68 + brz v69, ebb7 + trap heap_oob + +ebb7: + v70 = uextend.i64 v33 + v71 = iadd_imm.i64 v0, -8 + v72 = load.i64 v71 + v73 = iadd v72, v70 + v74 = uload8.i32 v73+2 + v75 = iconst.i32 0 + v76 = icmp eq v74, v75 + v77 = bint.i32 v76 + brnz v77, ebb90(v14, v34) + v78 = iconst.i32 0x4ffe + v79 = icmp.i32 uge v2, v78 + brz v79, ebb8 + trap heap_oob + +ebb8: + v80 = uextend.i64 v2 + v81 = iadd_imm.i64 v0, -8 + v82 = load.i64 v81 + v83 = iadd v82, v80 + v84 = uload8.i32 v83+3 + v85 = iconst.i32 0 + v86 = icmp eq v84, v85 + v87 = bint.i32 v86 + brnz v87, ebb46 + v88 = iconst.i32 0x4ffe + v89 = icmp.i32 uge v33, v88 + brz v89, ebb9 + trap heap_oob + +ebb9: + v90 = uextend.i64 v33 + v91 = iadd_imm.i64 v0, -8 + v92 = load.i64 v91 + v93 = iadd v92, v90 + v94 = uload8.i32 v93+3 + v95 = iconst.i32 0 + v96 = icmp eq v94, v95 + v97 = bint.i32 v96 + brnz v97, ebb90(v14, v34) + v98 = iconst.i32 0x4ffe + v99 = icmp.i32 uge v2, v98 + brz v99, ebb10 + trap heap_oob + +ebb10: + v100 = uextend.i64 v2 + v101 = iadd_imm.i64 v0, -8 + v102 = load.i64 v101 + v103 = iadd v102, v100 + v104 = uload8.i32 v103+4 + v105 = iconst.i32 0 + v106 = icmp eq v104, v105 + v107 = bint.i32 v106 + brnz v107, ebb54 + v108 = iconst.i32 1 + v109 = iadd.i32 v2, v108 + v110 = iconst.i32 1048 + v111 = iadd.i32 v14, v110 + v112 = iconst.i64 0 + v113 = iconst.i32 0x4ffe + v114 = icmp uge v111, v113 + brz v114, ebb11 + trap heap_oob + +ebb11: + v115 = uextend.i64 v111 + v116 = iadd_imm.i64 v0, -8 + v117 = load.i64 v116 + v118 = iadd v117, v115 + store.i64 v112, v118 + v119 = iconst.i32 1040 + v120 = iadd.i32 v14, v119 + v121 = iconst.i64 0 + v122 = iconst.i32 0x4ffe + v123 = icmp uge v120, v122 + brz v123, ebb12 + trap heap_oob + +ebb12: + v124 = uextend.i64 v120 + v125 = iadd_imm.i64 v0, -8 + v126 = load.i64 v125 + v127 = iadd v126, v124 + store.i64 v121, v127 + v128 = iconst.i64 0 + v129 = iconst.i32 0x4ffe + v130 = icmp.i32 uge v14, v129 + brz v130, ebb13 + trap heap_oob + +ebb13: + v131 = uextend.i64 v14 + v132 = iadd_imm.i64 v0, -8 + v133 = load.i64 v132 + v134 = iadd v133, v131 + store.i64 v128, v134+1032 + v135 = iconst.i64 0 + v136 = iconst.i32 0x4ffe + v137 = icmp.i32 uge v14, v136 + brz v137, ebb14 + trap heap_oob + +ebb14: + v138 = uextend.i64 v14 + v139 = iadd_imm.i64 v0, -8 + v140 = load.i64 v139 + v141 = iadd v140, v138 + store.i64 v135, v141+1024 + v142 = iconst.i32 -1 + jump ebb15(v142, v27) + +ebb15(v143: i32, v144: i32): + v145 = iadd.i32 v33, v143 + v146 = iconst.i32 1 + v147 = iadd v145, v146 + v148 = iconst.i32 0x4ffe + v149 = icmp uge v147, v148 + brz v149, ebb16 + trap heap_oob + +ebb16: + v150 = uextend.i64 v147 + v151 = iadd_imm.i64 v0, -8 + v152 = load.i64 v151 + v153 = iadd v152, v150 + v154 = uload8.i32 v153 + v155 = iconst.i32 0 + v156 = icmp eq v154, v155 + v157 = bint.i32 v156 + brnz v157, ebb89(v14) + v158 = iconst.i32 255 + v159 = band.i32 v144, v158 + v160 = iconst.i32 2 + v161 = ishl v159, v160 + v162 = iadd.i32 v14, v161 + v163 = iconst.i32 2 + v164 = iadd.i32 v143, v163 + v165 = iconst.i32 0x4ffe + v166 = icmp uge v162, v165 + brz v166, ebb17 + trap heap_oob + +ebb17: + v167 = uextend.i64 v162 + v168 = iadd_imm.i64 v0, -8 + v169 = load.i64 v168 + v170 = iadd v169, v167 + store.i32 v164, v170 + v171 = iconst.i32 1024 + v172 = iadd.i32 v14, v171 + v173 = iconst.i32 3 + v174 = ushr.i32 v159, v173 + v175 = iconst.i32 28 + v176 = band v174, v175 + v177 = iadd v172, v176 + v178 = iconst.i32 0x4ffe + v179 = icmp uge v177, v178 + brz v179, ebb18 + trap heap_oob + +ebb18: + v180 = uextend.i64 v177 + v181 = iadd_imm.i64 v0, -8 + v182 = load.i64 v181 + v183 = iadd v182, v180 + v184 = load.i32 v183 + v185 = iconst.i32 1 + v186 = iconst.i32 31 + v187 = band.i32 v144, v186 + v188 = ishl v185, v187 + v189 = bor v184, v188 + v190 = iconst.i32 0x4ffe + v191 = icmp.i32 uge v177, v190 + brz v191, ebb19 + trap heap_oob + +ebb19: + v192 = uextend.i64 v177 + v193 = iadd_imm.i64 v0, -8 + v194 = load.i64 v193 + v195 = iadd v194, v192 + store.i32 v189, v195 + v196 = iadd.i32 v109, v143 + v197 = iconst.i32 1 + v198 = iadd.i32 v143, v197 + v199 = iconst.i32 1 + v200 = iadd v196, v199 + v201 = iconst.i32 0x4ffe + v202 = icmp uge v200, v201 + brz v202, ebb20 + trap heap_oob + +ebb20: + v203 = uextend.i64 v200 + v204 = iadd_imm.i64 v0, -8 + v205 = load.i64 v204 + v206 = iadd v205, v203 + v207 = uload8.i32 v206 + brnz v207, ebb15(v198, v207) + jump ebb21 + +ebb21: + v208 = iconst.i32 -1 + v209 = iconst.i32 1 + v210 = iconst.i32 -1 + v211 = iconst.i32 1 + v212 = iconst.i32 1 + v213 = iadd.i32 v198, v212 + v214 = iconst.i32 2 + v215 = icmp ult v213, v214 + v216 = bint.i32 v215 + brnz v216, ebb38(v2, v211, v209, v210, v208, v198, v213, v33, v14) + v217 = iconst.i32 -1 + v218 = iconst.i32 0 + v219 = iconst.i32 1 + v220 = iconst.i32 1 + v221 = iconst.i32 1 + v222 = copy.i32 v44 + jump ebb22(v217, v221, v44, v220, v218, v219, v213, v222, v198, v33, v14) + +ebb22(v223: i32, v224: i32, v225: i32, v226: i32, v227: i32, v228: i32, v229: i32, v230: i32, v231: i32, v232: i32, v233: i32): + v234 = copy v228 + v235 = iadd v223, v224 + v236 = iadd.i32 v2, v235 + v237 = iconst.i32 0x4ffe + v238 = icmp uge v236, v237 + brz v238, ebb23 + trap heap_oob + +ebb23: + v239 = uextend.i64 v236 + v240 = iadd_imm.i64 v0, -8 + v241 = load.i64 v240 + v242 = iadd v241, v239 + v243 = uload8.i32 v242 + v244 = iconst.i32 255 + v245 = band.i32 v225, v244 + v246 = icmp ne v243, v245 + v247 = bint.i32 v246 + brnz v247, ebb24 + v248 = icmp.i32 ne v224, v226 + v249 = bint.i32 v248 + brnz v249, ebb25 + v250 = iadd.i32 v227, v226 + v251 = iconst.i32 1 + jump ebb27(v251, v250, v223, v226) + +ebb24: + v252 = icmp.i32 ule v243, v245 + v253 = bint.i32 v252 + brnz v253, ebb26 + v254 = isub.i32 v234, v223 + v255 = iconst.i32 1 + jump ebb27(v255, v234, v223, v254) + +ebb25: + v256 = iconst.i32 1 + v257 = iadd.i32 v224, v256 + v258 = copy.i32 v227 + jump ebb27(v257, v258, v223, v226) + +ebb26: + v259 = iconst.i32 1 + v260 = iconst.i32 1 + v261 = iadd.i32 v227, v260 + v262 = iconst.i32 1 + v263 = copy.i32 v227 + jump ebb27(v259, v261, v263, v262) + +ebb27(v264: i32, v265: i32, v266: i32, v267: i32): + v268 = iadd v264, v265 + v269 = icmp uge v268, v229 + v270 = bint.i32 v269 + brnz v270, ebb29 + v271 = iadd.i32 v2, v268 + v272 = iconst.i32 0x4ffe + v273 = icmp uge v271, v272 + brz v273, ebb28 + trap heap_oob + +ebb28: + v274 = uextend.i64 v271 + v275 = iadd_imm.i64 v0, -8 + v276 = load.i64 v275 + v277 = iadd v276, v274 + v278 = uload8.i32 v277 + v279 = copy.i32 v265 + jump ebb22(v266, v264, v278, v267, v279, v268, v229, v230, v231, v232, v233) + +ebb29: + jump ebb30 + +ebb30: + v280 = iconst.i32 -1 + v281 = iconst.i32 0 + v282 = iconst.i32 1 + v283 = iconst.i32 1 + v284 = iconst.i32 1 + jump ebb31(v280, v284, v230, v283, v281, v282, v229, v267, v266, v231, v232, v233) + +ebb31(v285: i32, v286: i32, v287: i32, v288: i32, v289: i32, v290: i32, v291: i32, v292: i32, v293: i32, v294: i32, v295: i32, v296: i32): + v297 = copy v290 + v298 = iadd v285, v286 + v299 = iadd.i32 v2, v298 + v300 = iconst.i32 0x4ffe + v301 = icmp uge v299, v300 + brz v301, ebb32 + trap heap_oob + +ebb32: + v302 = uextend.i64 v299 + v303 = iadd_imm.i64 v0, -8 + v304 = load.i64 v303 + v305 = iadd v304, v302 + v306 = uload8.i32 v305 + v307 = iconst.i32 255 + v308 = band.i32 v287, v307 + v309 = icmp ne v306, v308 + v310 = bint.i32 v309 + brnz v310, ebb33 + v311 = icmp.i32 ne v286, v288 + v312 = bint.i32 v311 + brnz v312, ebb34 + v313 = iadd.i32 v289, v288 + v314 = iconst.i32 1 + jump ebb36(v314, v313, v285, v288) + +ebb33: + v315 = icmp.i32 uge v306, v308 + v316 = bint.i32 v315 + brnz v316, ebb35 + v317 = isub.i32 v297, v285 + v318 = iconst.i32 1 + jump ebb36(v318, v297, v285, v317) + +ebb34: + v319 = iconst.i32 1 + v320 = iadd.i32 v286, v319 + v321 = copy.i32 v289 + jump ebb36(v320, v321, v285, v288) + +ebb35: + v322 = iconst.i32 1 + v323 = iconst.i32 1 + v324 = iadd.i32 v289, v323 + v325 = iconst.i32 1 + v326 = copy.i32 v289 + jump ebb36(v322, v324, v326, v325) + +ebb36(v327: i32, v328: i32, v329: i32, v330: i32): + v331 = iadd v327, v328 + v332 = icmp uge v331, v291 + v333 = bint.i32 v332 + brnz v333, ebb38(v2, v330, v292, v329, v293, v294, v291, v295, v296) + v334 = iadd.i32 v2, v331 + v335 = iconst.i32 0x4ffe + v336 = icmp uge v334, v335 + brz v336, ebb37 + trap heap_oob + +ebb37: + v337 = uextend.i64 v334 + v338 = iadd_imm.i64 v0, -8 + v339 = load.i64 v338 + v340 = iadd v339, v337 + v341 = uload8.i32 v340 + v342 = copy.i32 v328 + jump ebb31(v329, v327, v341, v330, v342, v331, v291, v292, v293, v294, v295, v296) + +ebb38(v343: i32, v344: i32, v345: i32, v346: i32, v347: i32, v348: i32, v349: i32, v350: i32, v351: i32): + v352 = iconst.i32 1 + v353 = iadd v346, v352 + v354 = iconst.i32 1 + v355 = iadd v347, v354 + v356 = icmp ugt v353, v355 + v357 = bint.i32 v356 + brnz v357, ebb39(v344) + v358 = copy v345 + jump ebb39(v358) + +ebb39(v359: i32): + v360 = iadd.i32 v343, v359 + brnz.i32 v357, ebb40(v346) + v361 = copy.i32 v347 + jump ebb40(v361) + +ebb40(v362: i32): + v363 = iconst.i32 1 + v364 = iadd v362, v363 + v365 = call fn1(v0, v343, v360, v364) + v366 = iconst.i32 0 + v367 = icmp eq v365, v366 + v368 = bint.i32 v367 + brnz v368, ebb63 + v369 = iconst.i32 1 + v370 = iadd v362, v369 + v371 = isub.i32 v348, v370 + v372 = iconst.i32 1 + v373 = iadd v371, v372 + v374 = icmp ugt v362, v373 + v375 = bint.i32 v374 + v376 = copy v362 + brnz v375, ebb41(v376) + v377 = copy v373 + jump ebb41(v377) + +ebb41(v378: i32): + v379 = iconst.i32 1 + v380 = iadd v378, v379 + v381 = iconst.i32 0 + jump ebb64(v380, v381) + +ebb42: + v382 = iconst.i32 8 + v383 = ishl.i32 v29, v382 + v384 = bor v383, v44 + v385 = iconst.i32 0x4ffe + v386 = icmp.i32 uge v33, v385 + brz v386, ebb43 + trap heap_oob + +ebb43: + v387 = uextend.i64 v33 + v388 = iadd_imm.i64 v0, -8 + v389 = load.i64 v388 + v390 = iadd v389, v387 + v391 = uload8.i32 v390 + jump ebb44(v391, v54, v33) + +ebb44(v392: i32, v393: i32, v394: i32): + v395 = iconst.i32 8 + v396 = ishl v392, v395 + v397 = iconst.i32 0xff00 + v398 = band v396, v397 + v399 = iconst.i32 255 + v400 = band v393, v399 + v401 = bor v398, v400 + v402 = icmp eq v401, v384 + v403 = bint.i32 v402 + brnz v403, ebb56(v394, v14) + v404 = iconst.i32 2 + v405 = iadd v394, v404 + v406 = iconst.i32 1 + v407 = iadd v394, v406 + v408 = iconst.i32 0x4ffe + v409 = icmp uge v405, v408 + brz v409, ebb45 + trap heap_oob + +ebb45: + v410 = uextend.i64 v405 + v411 = iadd_imm.i64 v0, -8 + v412 = load.i64 v411 + v413 = iadd v412, v410 + v414 = uload8.i32 v413 + brnz v414, ebb44(v401, v414, v407) + jump ebb90(v14, v34) + +ebb46: + v415 = iconst.i32 8 + v416 = ishl.i32 v74, v415 + v417 = iconst.i32 16 + v418 = ishl.i32 v54, v417 + v419 = bor v416, v418 + v420 = iconst.i32 0x4ffe + v421 = icmp.i32 uge v33, v420 + brz v421, ebb47 + trap heap_oob + +ebb47: + v422 = uextend.i64 v33 + v423 = iadd_imm.i64 v0, -8 + v424 = load.i64 v423 + v425 = iadd v424, v422 + v426 = uload8.i32 v425 + v427 = iconst.i32 24 + v428 = ishl v426, v427 + v429 = bor.i32 v419, v428 + v430 = iconst.i32 16 + v431 = ishl.i32 v44, v430 + v432 = iconst.i32 24 + v433 = ishl.i32 v29, v432 + v434 = bor v431, v433 + v435 = iconst.i32 8 + v436 = ishl.i32 v64, v435 + v437 = bor v434, v436 + v438 = icmp eq v429, v437 + v439 = bint.i32 v438 + brnz v439, ebb56(v33, v14) + jump ebb48(v33, v429) + +ebb48(v440: i32, v441: i32): + v442 = iconst.i32 1 + v443 = iadd v440, v442 + v444 = iconst.i32 3 + v445 = iadd v440, v444 + v446 = iconst.i32 0x4ffe + v447 = icmp uge v445, v446 + brz v447, ebb49 + trap heap_oob + +ebb49: + v448 = uextend.i64 v445 + v449 = iadd_imm.i64 v0, -8 + v450 = load.i64 v449 + v451 = iadd v450, v448 + v452 = uload8.i32 v451 + v453 = iconst.i32 0 + v454 = icmp eq v452, v453 + v455 = bint.i32 v454 + brnz v455, ebb51(v14) + v456 = bor.i32 v441, v452 + v457 = iconst.i32 8 + v458 = ishl v456, v457 + v459 = icmp ne v458, v437 + v460 = bint.i32 v459 + v461 = copy.i32 v443 + brnz v460, ebb48(v461, v458) + jump ebb50 + +ebb50: + jump ebb51(v14) + +ebb51(v462: i32): + v463 = iconst.i32 0 + v464 = iconst.i32 1056 + v465 = iadd v462, v464 + v466 = iconst.i32 0x4ffe + v467 = icmp uge v463, v466 + brz v467, ebb52 + trap heap_oob + +ebb52: + v468 = uextend.i64 v463 + v469 = iadd_imm.i64 v0, -8 + v470 = load.i64 v469 + v471 = iadd v470, v468 + store.i32 v465, v471+4 + v472 = iconst.i32 0 + brnz.i32 v452, ebb53(v443) + v473 = copy v472 + jump ebb53(v473) + +ebb53(v474: i32): + return v474 + +ebb54: + v475 = iconst.i32 8 + v476 = ishl.i32 v74, v475 + v477 = iconst.i32 16 + v478 = ishl.i32 v54, v477 + v479 = bor v476, v478 + v480 = bor v479, v94 + v481 = iconst.i32 0x4ffe + v482 = icmp.i32 uge v33, v481 + brz v482, ebb55 + trap heap_oob + +ebb55: + v483 = uextend.i64 v33 + v484 = iadd_imm.i64 v0, -8 + v485 = load.i64 v484 + v486 = iadd v485, v483 + v487 = uload8.i32 v486 + v488 = iconst.i32 24 + v489 = ishl v487, v488 + v490 = bor.i32 v480, v489 + v491 = iconst.i32 16 + v492 = ishl.i32 v44, v491 + v493 = iconst.i32 24 + v494 = ishl.i32 v29, v493 + v495 = bor v492, v494 + v496 = iconst.i32 8 + v497 = ishl.i32 v64, v496 + v498 = bor v495, v497 + v499 = bor v498, v84 + v500 = icmp ne v490, v499 + v501 = bint.i32 v500 + brnz v501, ebb57 + jump ebb56(v33, v14) + +ebb56(v502: i32, v503: i32): + v504 = copy v502 + jump ebb90(v503, v504) + +ebb57: + jump ebb58(v33, v490) + +ebb58(v505: i32, v506: i32): + v507 = iconst.i32 4 + v508 = iadd v505, v507 + v509 = iconst.i32 1 + v510 = iadd v505, v509 + v511 = iconst.i32 0x4ffe + v512 = icmp uge v508, v511 + brz v512, ebb59 + trap heap_oob + +ebb59: + v513 = uextend.i64 v508 + v514 = iadd_imm.i64 v0, -8 + v515 = load.i64 v514 + v516 = iadd v515, v513 + v517 = uload8.i32 v516 + v518 = iconst.i32 0 + v519 = icmp eq v517, v518 + v520 = bint.i32 v519 + brnz v520, ebb61(v14) + v521 = iconst.i32 8 + v522 = ishl.i32 v506, v521 + v523 = bor v522, v517 + v524 = icmp ne v523, v499 + v525 = bint.i32 v524 + brnz v525, ebb58(v510, v523) + jump ebb60 + +ebb60: + jump ebb61(v14) + +ebb61(v526: i32): + v527 = iconst.i32 0 + brnz.i32 v517, ebb62(v510) + v528 = copy v527 + jump ebb62(v528) + +ebb62(v529: i32): + v530 = copy v529 + jump ebb90(v526, v530) + +ebb63: + v531 = isub.i32 v348, v359 + v532 = iconst.i32 1 + v533 = iadd v531, v532 + jump ebb64(v359, v533) + +ebb64(v534: i32, v535: i32): + v536 = iconst.i32 1 + v537 = iadd.i32 v343, v536 + v538 = iconst.i32 0 + v539 = isub v538, v362 + v540 = iconst.i32 63 + v541 = bor.i32 v349, v540 + v542 = isub.i32 v348, v534 + v543 = iconst.i32 1 + v544 = iadd v542, v543 + v545 = iconst.i32 0 + v546 = copy.i32 v350 + jump ebb65(v350, v546, v349, v541, v348, v351, v544, v534, v545, v535, v343, v364, v537, v539, v362) + +ebb65(v547: i32, v548: i32, v549: i32, v550: i32, v551: i32, v552: i32, v553: i32, v554: i32, v555: i32, v556: i32, v557: i32, v558: i32, v559: i32, v560: i32, v561: i32): + v562 = copy v556 + v563 = isub v547, v548 + v564 = icmp uge v563, v549 + v565 = bint.i32 v564 + brnz v565, ebb67(v547) + v566 = iconst.i32 0 + v567 = call fn2(v0, v547, v566, v550) + brnz v567, ebb66 + v568 = iadd v547, v550 + jump ebb67(v568) + +ebb66: + v569 = isub.i32 v567, v548 + v570 = icmp ult v569, v549 + v571 = bint.i32 v570 + brnz v571, ebb89(v552) + v572 = copy.i32 v567 + jump ebb67(v572) + +ebb67(v573: i32): + v574 = iconst.i32 1 + v575 = iadd.i32 v548, v551 + v576 = iconst.i32 0x4ffe + v577 = icmp uge v575, v576 + brz v577, ebb68 + trap heap_oob + +ebb68: + v578 = uextend.i64 v575 + v579 = iadd_imm.i64 v0, -8 + v580 = load.i64 v579 + v581 = iadd v580, v578 + v582 = uload8.i32 v581 + v583 = iconst.i32 31 + v584 = band v582, v583 + v585 = ishl.i32 v574, v584 + v586 = iconst.i32 1024 + v587 = iadd.i32 v552, v586 + v588 = iconst.i32 3 + v589 = ushr v582, v588 + v590 = iconst.i32 28 + v591 = band v589, v590 + v592 = iadd v587, v591 + v593 = iconst.i32 0x4ffe + v594 = icmp uge v592, v593 + brz v594, ebb69 + trap heap_oob + +ebb69: + v595 = uextend.i64 v592 + v596 = iadd_imm.i64 v0, -8 + v597 = load.i64 v596 + v598 = iadd v597, v595 + v599 = load.i32 v598 + v600 = band.i32 v585, v599 + v601 = iconst.i32 0 + v602 = icmp eq v600, v601 + v603 = bint.i32 v602 + brnz v603, ebb74 + v604 = iconst.i32 2 + v605 = ishl.i32 v582, v604 + v606 = iadd.i32 v552, v605 + v607 = iconst.i32 0x4ffe + v608 = icmp uge v606, v607 + brz v608, ebb70 + trap heap_oob + +ebb70: + v609 = uextend.i64 v606 + v610 = iadd_imm.i64 v0, -8 + v611 = load.i64 v610 + v612 = iadd v611, v609 + v613 = load.i32 v612 + v614 = isub.i32 v551, v613 + v615 = iconst.i32 -1 + v616 = icmp eq v614, v615 + v617 = bint.i32 v616 + brnz v617, ebb75 + v618 = iconst.i32 1 + v619 = iadd v614, v618 + v620 = icmp ult v619, v554 + v621 = bint.i32 v620 + v622 = copy.i32 v553 + brnz v621, ebb71(v622) + v623 = copy v619 + jump ebb71(v623) + +ebb71(v624: i32): + v625 = copy v624 + brnz.i32 v555, ebb72(v625) + jump ebb72(v619) + +ebb72(v626: i32): + brnz.i32 v562, ebb73(v626) + jump ebb73(v619) + +ebb73(v627: i32): + v628 = copy.i32 v554 + v629 = copy.i32 v562 + jump ebb87(v548, v627, v573, v549, v550, v551, v552, v553, v628, v629, v557, v558, v559, v560, v561) + +ebb74: + v630 = copy.i32 v549 + v631 = copy.i32 v554 + v632 = copy.i32 v562 + jump ebb87(v548, v630, v573, v549, v550, v551, v552, v553, v631, v632, v557, v558, v559, v560, v561) + +ebb75: + v633 = icmp.i32 ugt v558, v555 + v634 = bint.i32 v633 + v635 = copy.i32 v558 + brnz v634, ebb76(v635) + v636 = copy.i32 v555 + jump ebb76(v636) + +ebb76(v637: i32): + v638 = iadd.i32 v557, v637 + v639 = iconst.i32 0x4ffe + v640 = icmp uge v638, v639 + brz v640, ebb77 + trap heap_oob + +ebb77: + v641 = uextend.i64 v638 + v642 = iadd_imm.i64 v0, -8 + v643 = load.i64 v642 + v644 = iadd v643, v641 + v645 = uload8.i32 v644 + v646 = iconst.i32 0 + v647 = icmp eq v645, v646 + v648 = bint.i32 v647 + brnz v648, ebb82(v548, v549, v551, v552) + v649 = iadd.i32 v548, v637 + v650 = iadd.i32 v559, v637 + v651 = iadd.i32 v560, v637 + jump ebb78(v645, v649, v651, v650) + +ebb78(v652: i32, v653: i32, v654: i32, v655: i32): + v656 = iconst.i32 255 + v657 = band v652, v656 + v658 = iconst.i32 0x4ffe + v659 = icmp uge v653, v658 + brz v659, ebb79 + trap heap_oob + +ebb79: + v660 = uextend.i64 v653 + v661 = iadd_imm.i64 v0, -8 + v662 = load.i64 v661 + v663 = iadd v662, v660 + v664 = uload8.i32 v663 + v665 = icmp.i32 ne v657, v664 + v666 = bint.i32 v665 + v667 = copy.i32 v554 + v668 = copy.i32 v562 + brnz v666, ebb87(v548, v654, v573, v549, v550, v551, v552, v553, v667, v668, v557, v558, v559, v560, v561) + v669 = iconst.i32 1 + v670 = iadd.i32 v653, v669 + v671 = iconst.i32 1 + v672 = iadd.i32 v654, v671 + v673 = iconst.i32 0x4ffe + v674 = icmp.i32 uge v655, v673 + brz v674, ebb80 + trap heap_oob + +ebb80: + v675 = uextend.i64 v655 + v676 = iadd_imm.i64 v0, -8 + v677 = load.i64 v676 + v678 = iadd v677, v675 + v679 = uload8.i32 v678 + v680 = iconst.i32 1 + v681 = iadd.i32 v655, v680 + brnz v679, ebb78(v679, v670, v672, v681) + jump ebb81 + +ebb81: + jump ebb82(v548, v549, v551, v552) + +ebb82(v682: i32, v683: i32, v684: i32, v685: i32): + v686 = icmp.i32 ule v558, v555 + v687 = bint.i32 v686 + brnz v687, ebb90(v685, v682) + v688 = copy.i32 v561 + jump ebb83(v688) + +ebb83(v689: i32): + v690 = iadd.i32 v557, v689 + v691 = iconst.i32 0x4ffe + v692 = icmp uge v690, v691 + brz v692, ebb84 + trap heap_oob + +ebb84: + v693 = uextend.i64 v690 + v694 = iadd_imm.i64 v0, -8 + v695 = load.i64 v694 + v696 = iadd v695, v693 + v697 = uload8.i32 v696 + v698 = iadd.i32 v682, v689 + v699 = iconst.i32 0x4ffe + v700 = icmp uge v698, v699 + brz v700, ebb85 + trap heap_oob + +ebb85: + v701 = uextend.i64 v698 + v702 = iadd_imm.i64 v0, -8 + v703 = load.i64 v702 + v704 = iadd v703, v701 + v705 = uload8.i32 v704 + v706 = icmp.i32 ne v697, v705 + v707 = bint.i32 v706 + brnz v707, ebb86 + v708 = icmp.i32 ule v689, v555 + v709 = bint.i32 v708 + v710 = iconst.i32 -1 + v711 = iadd.i32 v689, v710 + v712 = iconst.i32 0 + v713 = icmp eq v709, v712 + v714 = bint.i32 v713 + brnz v714, ebb83(v711) + jump ebb90(v685, v682) + +ebb86: + v715 = copy.i32 v554 + v716 = copy.i32 v562 + jump ebb88(v682, v554, v573, v683, v550, v684, v685, v553, v715, v562, v716, v557, v558, v559, v560, v561) + +ebb87(v717: i32, v718: i32, v719: i32, v720: i32, v721: i32, v722: i32, v723: i32, v724: i32, v725: i32, v726: i32, v727: i32, v728: i32, v729: i32, v730: i32, v731: i32): + v732 = copy v718 + v733 = iconst.i32 0 + jump ebb88(v717, v732, v719, v720, v721, v722, v723, v724, v725, v733, v726, v727, v728, v729, v730, v731) + +ebb88(v734: i32, v735: i32, v736: i32, v737: i32, v738: i32, v739: i32, v740: i32, v741: i32, v742: i32, v743: i32, v744: i32, v745: i32, v746: i32, v747: i32, v748: i32, v749: i32): + v750 = iadd v734, v735 + v751 = copy v742 + v752 = copy v743 + v753 = copy v744 + jump ebb65(v736, v750, v737, v738, v739, v740, v741, v751, v752, v753, v745, v746, v747, v748, v749) + +ebb89(v754: i32): + v755 = iconst.i32 0 + jump ebb90(v754, v755) + +ebb90(v756: i32, v757: i32): + v758 = iconst.i32 0 + v759 = iconst.i32 1056 + v760 = iadd v756, v759 + v761 = iconst.i32 0x4ffe + v762 = icmp uge v758, v761 + brz v762, ebb91 + trap heap_oob + +ebb91: + v763 = uextend.i64 v758 + v764 = iadd_imm.i64 v0, -8 + v765 = load.i64 v764 + v766 = iadd v765, v763 + store.i32 v760, v766+4 + jump ebb92(v757) + +ebb92(v767: i32): + return v767 +} + +; Same problem from musl.wasm. +function %musl(f64 [%xmm0], i64 vmctx [%rdi]) -> f64 [%xmm0] native { + gv0 = vmctx + heap0 = static gv0, min 0, bound 0x0001_0000_0000, guard 0x8000_0000 + sig0 = (f64 [%xmm0], i32 [%rdi], i64 vmctx [%rsi]) -> f64 [%xmm0] native + fn0 = sig0 u0:517 + +ebb0(v0: f64, v1: i64): + v3 = iconst.i64 0 + v4 = iconst.i32 0 + v131 = iconst.i64 0 + v5 = bitcast.f64 v131 + v6 = iconst.i32 0 + v7 = iconst.i32 0 + v8 = iconst.i32 0 + v132 = uextend.i64 v8 + v133 = iadd_imm v1, 0 + v134 = load.i64 v133 + v9 = iadd v134, v132 + v10 = load.i32 v9+4 + v11 = iconst.i32 16 + v12 = isub v10, v11 + v135 = uextend.i64 v7 + v136 = iadd_imm v1, 0 + v137 = load.i64 v136 + v13 = iadd v137, v135 + store v12, v13+4 + v14 = bitcast.i64 v0 + v15 = iconst.i64 63 + v16 = ushr v14, v15 + v17 = ireduce.i32 v16 + v18 = iconst.i64 32 + v19 = ushr v14, v18 + v20 = ireduce.i32 v19 + v21 = iconst.i32 0x7fff_ffff + v22 = band v20, v21 + v23 = iconst.i32 0x4086_232b + v24 = icmp ult v22, v23 + v25 = bint.i32 v24 + brnz v25, ebb10 + v26 = iconst.i64 0x7fff_ffff_ffff_ffff + v27 = band v14, v26 + v28 = iconst.i64 0x7ff0_0000_0000_0000 + v29 = icmp ule v27, v28 + v30 = bint.i32 v29 + brnz v30, ebb9 + jump ebb2(v12, v0) + +ebb10: + v31 = iconst.i32 0x3fd6_2e43 + v32 = icmp.i32 ult v22, v31 + v33 = bint.i32 v32 + brnz v33, ebb8 + v34 = iconst.i32 0x3ff0_a2b2 + v35 = icmp.i32 uge v22, v34 + v36 = bint.i32 v35 + brnz v36, ebb6 + v37 = iconst.i32 1 + v38 = bxor.i32 v17, v37 + v39 = isub v38, v17 + jump ebb5(v0, v39) + +ebb9: + v138 = iconst.i64 0x4086_2e42_fefa_39ef + v40 = bitcast.f64 v138 + v41 = fcmp ge v40, v0 + v42 = bint.i32 v41 + v139 = fcmp.f64 uno v0, v0 + v140 = fcmp.f64 one v0, v0 + v43 = bor v139, v140 + v44 = bint.i32 v43 + v45 = bor v42, v44 + brnz v45, ebb7 + v141 = iconst.i64 0x7fe0_0000_0000_0000 + v46 = bitcast.f64 v141 + v47 = fmul.f64 v0, v46 + jump ebb2(v12, v47) + +ebb8: + v48 = iconst.i32 0x3e30_0000 + v49 = icmp.i32 ule v22, v48 + v50 = bint.i32 v49 + brnz v50, ebb3 + v51 = iconst.i32 0 + v142 = iconst.i64 0 + v52 = bitcast.f64 v142 + v178 = copy.f64 v0 + jump ebb4(v0, v178, v52, v51) + +ebb7: + v143 = iconst.i64 0xc086_232b_dd7a_bcd2 + v53 = bitcast.f64 v143 + v54 = fcmp.f64 ge v0, v53 + v55 = bint.i32 v54 + v56 = bor v55, v44 + brnz v56, ebb6 + v144 = iconst.i64 0xb6a0_0000_0000_0000 + v57 = bitcast.f64 v144 + v58 = fdiv v57, v0 + v59 = fdemote.f32 v58 + v145 = uextend.i64 v12 + v146 = iadd_imm.i64 v1, 0 + v147 = load.i64 v146 + v60 = iadd v147, v145 + store v59, v60+12 + v148 = iconst.i64 0 + v61 = bitcast.f64 v148 + v149 = iconst.i64 0xc087_4910_d52d_3051 + v62 = bitcast.f64 v149 + v63 = fcmp gt v62, v0 + v64 = bint.i32 v63 + brnz v64, ebb2(v12, v61) + jump ebb6 + +ebb6: + v150 = iconst.i64 0x3ff7_1547_652b_82fe + v66 = bitcast.f64 v150 + v67 = fmul.f64 v0, v66 + v69 = iconst.i32 3 + v70 = ishl.i32 v17, v69 + v71 = iconst.i32 5040 + v72 = iadd v70, v71 + v151 = uextend.i64 v72 + v152 = iadd_imm.i64 v1, 0 + v153 = load.i64 v152 + v73 = iadd v153, v151 + v74 = load.f64 v73 + v75 = fadd v67, v74 + v76 = x86_cvtt2si.i32 v75 + v158 = iconst.i32 0x8000_0000 + v154 = icmp ne v76, v158 + brnz v154, ebb11 + v155 = fcmp uno v75, v75 + brz v155, ebb12 + trap bad_toint + +ebb12: + v159 = iconst.i64 0xc1e0_0000_0020_0000 + v156 = bitcast.f64 v159 + v157 = fcmp ge v156, v75 + brz v157, ebb13 + trap int_ovf + +ebb13: + jump ebb11 + +ebb11: + jump ebb5(v0, v76) + +ebb5(v77: f64, v78: i32): + v79 = fcvt_from_sint.f64 v78 + v160 = iconst.i64 0xbfe6_2e42_fee0_0000 + v80 = bitcast.f64 v160 + v81 = fmul v79, v80 + v82 = fadd v77, v81 + v161 = iconst.i64 0x3dea_39ef_3579_3c76 + v83 = bitcast.f64 v161 + v84 = fmul v79, v83 + v85 = fsub v82, v84 + jump ebb4(v82, v85, v84, v78) + +ebb4(v86: f64, v87: f64, v108: f64, v113: i32): + v88 = fmul v87, v87 + v162 = iconst.i64 0x3e66_3769_72be_a4d0 + v89 = bitcast.f64 v162 + v90 = fmul v88, v89 + v163 = iconst.i64 0xbebb_bd41_c5d2_6bf1 + v91 = bitcast.f64 v163 + v92 = fadd v90, v91 + v93 = fmul v88, v92 + v164 = iconst.i64 0x3f11_566a_af25_de2c + v94 = bitcast.f64 v164 + v95 = fadd v93, v94 + v96 = fmul v88, v95 + v165 = iconst.i64 0xbf66_c16c_16be_bd93 + v97 = bitcast.f64 v165 + v98 = fadd v96, v97 + v99 = fmul v88, v98 + v166 = iconst.i64 0x3fc5_5555_5555_553e + v100 = bitcast.f64 v166 + v101 = fadd v99, v100 + v102 = fmul v88, v101 + v103 = fsub v87, v102 + v104 = fmul v87, v103 + v167 = iconst.i64 0x4000_0000_0000_0000 + v105 = bitcast.f64 v167 + v106 = fsub v105, v103 + v107 = fdiv v104, v106 + v109 = fsub v107, v108 + v110 = fadd v86, v109 + v168 = iconst.i64 0x3ff0_0000_0000_0000 + v111 = bitcast.f64 v168 + v112 = fadd v110, v111 + v169 = iconst.i32 0 + v114 = icmp eq v113, v169 + v115 = bint.i32 v114 + brnz v115, ebb2(v12, v112) + v116 = call fn0(v112, v113, v1) + jump ebb2(v12, v116) + +ebb3: + v170 = iconst.i64 0x7fe0_0000_0000_0000 + v117 = bitcast.f64 v170 + v118 = fadd.f64 v0, v117 + v171 = uextend.i64 v12 + v172 = iadd_imm.i64 v1, 0 + v173 = load.i64 v172 + v119 = iadd v173, v171 + store v118, v119 + v174 = iconst.i64 0x3ff0_0000_0000_0000 + v120 = bitcast.f64 v174 + v121 = fadd.f64 v0, v120 + jump ebb2(v12, v121) + +ebb2(v123: i32, v130: f64): + v122 = iconst.i32 0 + v127 = iconst.i32 16 + v128 = iadd v123, v127 + v175 = uextend.i64 v122 + v176 = iadd_imm.i64 v1, 0 + v177 = load.i64 v176 + v129 = iadd v177, v175 + store v128, v129+4 + jump ebb1(v130) + +ebb1(v2: f64): + return v2 +} diff --git a/lib/cretonne/src/regalloc/coalescing.rs b/lib/cretonne/src/regalloc/coalescing.rs index 3ce58821bc..4819969605 100644 --- a/lib/cretonne/src/regalloc/coalescing.rs +++ b/lib/cretonne/src/regalloc/coalescing.rs @@ -7,14 +7,15 @@ use cursor::{Cursor, EncCursor}; use dbg::DisplayList; -use dominator_tree::DominatorTree; +use dominator_tree::{DominatorTree, DominatorTreePreorder}; use flowgraph::ControlFlowGraph; -use ir::{DataFlowGraph, Layout, InstBuilder, ValueDef}; +use ir::{Layout, InstBuilder, ValueDef}; use ir::{Function, Ebb, Inst, Value, ExpandedProgramPoint}; use regalloc::affinity::Affinity; use regalloc::liveness::Liveness; use regalloc::virtregs::VirtRegs; use std::cmp::Ordering; +use std::fmt; use std::iter::Peekable; use std::mem; use isa::{TargetIsa, EncInfo}; @@ -23,10 +24,10 @@ use timing; /// Dominator forest. /// /// This is a utility type used for merging virtual registers, where each virtual register is a -/// list of values ordered according to `DomTree::rpo_cmp`. +/// list of values ordered according to the dominator tree pre-order. /// /// A `DomForest` object is used as a buffer for building virtual registers. It lets you merge two -/// sorted lists of values while checking for interference only whee necessary. +/// sorted lists of values while checking for interference only where necessary. /// /// The idea of a dominator forest was introduced here: /// @@ -37,14 +38,20 @@ use timing; /// The linear stack representation here: /// /// Boissinot, B., Darte, A., & Rastello, F. (2009). Revisiting out-of-SSA translation for -/// correctness, code quality and efficiency. Presented at the Proceedings of the 7th …. +/// correctness, code quality and efficiency. +/// +/// Our version of the linear stack is slightly modified because we have a pre-order of the +/// dominator tree at the EBB granularity, not basic block granularity. struct DomForest { - // The sequence of values that have been merged so far. In RPO order of their defs. + // The sequence of values that have been merged so far. + // In domtree pre-order order of their definitions. values: Vec, // Stack representing the rightmost edge of the dominator forest so far, ending in the last - // element of `values`. At all times, each element in the stack dominates the next one, and all - // elements dominating the end of `values` are on the stack. + // element of `values`. + // + // At all times, the EBB of each element in the stack dominates the EBB of the next one, and + // all elements dominating the end of `values` are on the stack. stack: Vec, } @@ -56,22 +63,29 @@ struct Node { set: u8, /// The program point where `value` is defined. def: ExpandedProgramPoint, + /// EBB containing `def`. + ebb: Ebb, } impl Node { /// Create a node for `value`. - pub fn new(value: Value, set: u8, dfg: &DataFlowGraph) -> Node { + pub fn new(value: Value, set: u8, func: &Function) -> Node { + let def = func.dfg.value_def(value).into(); + let ebb = func.layout.pp_ebb(def); Node { value, set, - def: dfg.value_def(value).into(), + def, + ebb, } } } -/// Push a node to `stack` and update `stack` so it contains all dominator forest ancestors of -/// the pushed value. -/// +impl fmt::Display for Node { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}@{}:{}", self.value, self.ebb, self.set) + } +} impl DomForest { /// Create a new empty dominator forest. @@ -103,23 +117,51 @@ impl DomForest { /// /// If the pushed node's parent in the dominator forest belongs to a different set, returns /// `Some(parent)`. - fn push_node(&mut self, node: Node, layout: &Layout, domtree: &DominatorTree) -> Option { + fn push_node( + &mut self, + node: Node, + layout: &Layout, + domtree: &DominatorTree, + preorder: &DominatorTreePreorder, + ) -> Option { self.values.push(node.value); // The stack contains the current sequence of dominating defs. Pop elements until we - // find one that dominates `node`. + // find one whose EBB dominates `node.ebb`. while let Some(top) = self.stack.pop() { - if domtree.dominates(top.def, node.def, layout) { + if preorder.dominates(top.ebb, node.ebb) { // This is the right insertion spot for `node`. self.stack.push(top); self.stack.push(node); + + // We know here that `top.ebb` dominates `node.ebb`, and thus `node.def`. This does + // not necessarily mean that `top.def` dominates `node.def`, though. The `top.def` + // program point may be below the last branch in `top.ebb` that dominates + // `node.def`. + debug_assert!(domtree.dominates(top.ebb, node.def, layout)); + + // We do know, though, that if there is a nearest value dominating `node.def`, it + // will be on the stack. We just need to find the last stack entry that actually + // dominates. + // + // TODO: This search could be more efficient if we had access to + // `domtree.last_dominator()`. Each call to `dominates()` here ends up walking up + // the dominator tree starting from `node.ebb`. + let dom = self.stack[0..self.stack.len() - 1].iter().rposition(|n| { + domtree.dominates(n.def, node.def, layout) + }); + // If the parent value comes from a different set, return it for interference // checking. If the sets are equal, assume that interference is already handled. - if top.set != node.set { - return Some(top.value); - } else { - return None; + if let Some(pos) = dom { + let parent = &self.stack[pos]; + if parent.set != node.set { + return Some(parent.value); + } } + + // There was no opposite-set value dominating `node.def`. + return None; } } @@ -143,9 +185,9 @@ impl DomForest { &mut self, va: &[Value], vb: &[Value], - dfg: &DataFlowGraph, - layout: &Layout, + func: &Function, domtree: &DominatorTree, + preorder: &DominatorTreePreorder, liveness: &Liveness, ) -> Result<(), (Value, Value)> { self.clear(); @@ -153,16 +195,16 @@ impl DomForest { // Convert the two value lists into a merged sequence of nodes. let merged = MergedNodes { - a: va.iter().map(|&value| Node::new(value, 0, dfg)).peekable(), - b: vb.iter().map(|&value| Node::new(value, 1, dfg)).peekable(), - layout, - domtree, + a: va.iter().map(|&value| Node::new(value, 0, func)).peekable(), + b: vb.iter().map(|&value| Node::new(value, 1, func)).peekable(), + layout: &func.layout, + preorder, }; - let ctx = liveness.context(layout); + let ctx = liveness.context(&func.layout); for node in merged { - if let Some(parent) = self.push_node(node, layout, domtree) { + if let Some(parent) = self.push_node(node, &func.layout, domtree, preorder) { // Check if `parent` live range contains `node.def`. - if liveness[parent].overlaps_def(node.def, layout.pp_ebb(node.def), ctx) { + if liveness[parent].overlaps_def(node.def, func.layout.pp_ebb(node.def), ctx) { // Interference detected. Get the `(a, b)` order right in the error. return Err(if node.set == 0 { (node.value, parent) @@ -189,7 +231,7 @@ where a: Peekable, b: Peekable, layout: &'a Layout, - domtree: &'a DominatorTree, + preorder: &'a DominatorTreePreorder, } impl<'a, IA, IB> Iterator for MergedNodes<'a, IA, IB> @@ -205,7 +247,7 @@ where // If the two values are defined at the same point, compare value numbers instead // this is going to cause an interference conflict unless its actually the same // value appearing in both streams. - self.domtree.rpo_cmp(a.def, b.def, self.layout).then( + self.preorder.pre_cmp(a.def, b.def, self.layout).then( Ord::cmp( &a.value, &b.value, @@ -231,6 +273,7 @@ where /// Data structures to be used by the coalescing pass. pub struct Coalescing { forest: DomForest, + preorder: DominatorTreePreorder, // Current set of coalesced values. Kept sorted and interference free. values: Vec, @@ -247,6 +290,7 @@ struct Context<'a> { func: &'a mut Function, cfg: &'a ControlFlowGraph, domtree: &'a DominatorTree, + preorder: &'a DominatorTreePreorder, liveness: &'a mut Liveness, virtregs: &'a mut VirtRegs, @@ -260,6 +304,7 @@ impl Coalescing { pub fn new() -> Self { Self { forest: DomForest::new(), + preorder: DominatorTreePreorder::new(), values: Vec::new(), split_values: Vec::new(), } @@ -285,12 +330,14 @@ impl Coalescing { ) { let _tt = timing::ra_cssa(); dbg!("Coalescing for:\n{}", func.display(isa)); + self.preorder.compute(domtree, &func.layout); let mut context = Context { isa, encinfo: isa.encoding_info(), func, cfg, domtree, + preorder: &self.preorder, liveness, virtregs, forest: &mut self.forest, @@ -486,9 +533,9 @@ impl<'a> Context<'a> { self.forest.try_merge( self.values, self.virtregs.congruence_class(&value), - &self.func.dfg, - &self.func.layout, + self.func, self.domtree, + self.preorder, self.liveness, )?; self.forest.swap(&mut self.values);