Path: blob/21.2-virgl/src/freedreno/ir3/ir3_ra.c
/*
 * Copyright (C) 2021 Valve Corporation
 * Copyright (C) 2014 Rob Clark <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_ra.h"
#include "util/rb_tree.h"
#include "util/u_math.h"
#include "ir3_shader.h"

/* This file implements an SSA-based register allocator. Unlike other
 * SSA-based allocators, it handles vector split/collect "smartly," meaning
 * that multiple values may share the same register interval. From the
 * perspective of the allocator itself, only the top-level intervals matter,
 * and the allocator is only concerned with allocating top-level intervals,
 * which may mean moving other top-level intervals around. Other intervals,
 * like the destination of a split instruction or the source of a collect
 * instruction, are "locked" to their parent interval. The details of this are
 * mostly handled by ir3_merge_regs and ir3_reg_ctx.
 *
 * We currently don't do any backtracking, but we do use the merge sets as a
 * form of affinity to try to avoid moves from phis/splits/collects. Each
 * merge set is what a more "classic" graph-coloring or live-range based
 * allocator would consider a single register, but here we use it as merely a
 * hint, except when multiple overlapping values are live at the same time.
 * Each merge set has a "preferred" register, and we try to honor that when
 * allocating values in the merge set.
 */
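
/* As a purely illustrative sketch (hypothetical IR; the names ssa_1/ssa_2/
 * ssa_3 are made up and not taken from any real shader), the interval
 * nesting described above looks roughly like:
 *
 *    ssa_1 (wrmask=0xf) = ...
 *    ssa_2 = split ssa_1 off:0
 *    ssa_3 = split ssa_1 off:1
 *
 * ssa_2 and ssa_3 share ssa_1's merge set, so their intervals become
 * children of ssa_1's top-level interval. Only ssa_1 is placed by the
 * allocator; the splits inherit their registers from their position inside
 * the parent interval and never require moves of their own.
 */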

/* ir3_reg_ctx implementation. */

static int
ir3_reg_interval_cmp(const struct rb_node *node, const void *data)
{
   physreg_t reg = *(const physreg_t *)data;
   const struct ir3_reg_interval *interval =
      ir3_rb_node_to_interval_const(node);
   if (interval->reg->interval_start > reg)
      return -1;
   else if (interval->reg->interval_end <= reg)
      return 1;
   else
      return 0;
}

static struct ir3_reg_interval *
ir3_reg_interval_search(struct rb_tree *tree, unsigned offset)
{
   struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp);
   return node ? ir3_rb_node_to_interval(node) : NULL;
}

static struct ir3_reg_interval *
ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset)
{
   struct rb_node *node =
      rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp);
   return node ? ir3_rb_node_to_interval(node) : NULL;
}

/* Get the interval covering the reg, or the closest to the right if it
 * doesn't exist.
 */
static struct ir3_reg_interval *
ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset)
{
   struct ir3_reg_interval *interval =
      ir3_reg_interval_search_sloppy(tree, offset);
   if (!interval) {
      return NULL;
   } else if (interval->reg->interval_end > offset) {
      return interval;
   } else {
      /* There is no interval covering reg, and ra_file_search_sloppy()
       * returned the closest range to the left, so the next interval to the
       * right should be the closest to the right.
       */
      return ir3_reg_interval_next_or_null(interval);
   }
}

static int
ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
{
   const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a);
   const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b);
   return b->reg->interval_start - a->reg->interval_start;
}

static void
interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree,
                struct ir3_reg_interval *interval)
{
   struct ir3_reg_interval *right =
      ir3_reg_interval_search_right(tree, interval->reg->interval_start);
   if (right && right->reg->interval_start < interval->reg->interval_end) {
      /* We disallow trees where different members have different half-ness.
       * This means that we can't treat bitcasts as copies like normal
       * split/collect, so something like this would require an extra copy
       * in mergedregs mode, and count as 4 half-units of register pressure
       * instead of 2:
       *
       * f16vec2 foo = unpackFloat2x16(bar)
       * ... = foo.x
       * ... = bar
       *
       * However, relaxing this rule would open a huge can of worms. What
       * happens when there's a vector of 16 things, and the fifth element
       * has been bitcasted as a half-reg? Would that element alone have to
       * be small enough to be used as a half-reg source? Let's keep that
       * can of worms firmly shut for now.
       */
      assert((interval->reg->flags & IR3_REG_HALF) ==
             (right->reg->flags & IR3_REG_HALF));

      if (right->reg->interval_end <= interval->reg->interval_end &&
          right->reg->interval_start >= interval->reg->interval_start) {
         /* Check if we're inserting something that's already inserted */
         assert(interval != right);

         /* "right" is contained in "interval" and must become a child of
          * it. There may be further children too.
          */
         for (struct ir3_reg_interval *next = ir3_reg_interval_next(right);
              right && right->reg->interval_start < interval->reg->interval_end;
              right = next, next = ir3_reg_interval_next_or_null(next)) {
            /* "right" must be contained in "interval."
*/146assert(right->reg->interval_end <= interval->reg->interval_end);147assert((interval->reg->flags & IR3_REG_HALF) ==148(right->reg->flags & IR3_REG_HALF));149if (!right->parent)150ctx->interval_delete(ctx, right);151right->parent = interval;152rb_tree_remove(tree, &right->node);153rb_tree_insert(&interval->children, &right->node,154ir3_reg_interval_insert_cmp);155}156} else {157/* "right" must contain "interval," since intervals must form a158* tree.159*/160assert(right->reg->interval_start <= interval->reg->interval_start);161interval->parent = right;162interval_insert(ctx, &right->children, interval);163return;164}165}166167if (!interval->parent)168ctx->interval_add(ctx, interval);169rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp);170interval->inserted = true;171}172173void174ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,175struct ir3_reg_interval *interval)176{177interval_insert(ctx, &ctx->intervals, interval);178}179180void181ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,182struct ir3_reg_interval *interval)183{184if (interval->parent) {185rb_tree_remove(&interval->parent->children, &interval->node);186} else {187ctx->interval_delete(ctx, interval);188rb_tree_remove(&ctx->intervals, &interval->node);189}190191rb_tree_foreach_safe (struct ir3_reg_interval, child, &interval->children,192node) {193rb_tree_remove(&interval->children, &child->node);194child->parent = interval->parent;195196if (interval->parent) {197rb_tree_insert(&child->parent->children, &child->node,198ir3_reg_interval_insert_cmp);199} else {200ctx->interval_readd(ctx, interval, child);201rb_tree_insert(&ctx->intervals, &child->node,202ir3_reg_interval_insert_cmp);203}204}205206interval->inserted = false;207}208209void210ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,211struct ir3_reg_interval *interval)212{213assert(!interval->parent);214215ctx->interval_delete(ctx, interval);216rb_tree_remove(&ctx->intervals, &interval->node);217}218219static void220interval_dump(struct ir3_reg_interval *interval, unsigned indent)221{222for (unsigned i = 0; i < indent; i++)223printf("\t");224printf("reg %u start %u\n", interval->reg->name,225interval->reg->interval_start);226227rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {228interval_dump(child, indent + 1);229}230231for (unsigned i = 0; i < indent; i++)232printf("\t");233printf("reg %u end %u\n", interval->reg->name, interval->reg->interval_end);234}235236void237ir3_reg_interval_dump(struct ir3_reg_interval *interval)238{239interval_dump(interval, 0);240}241242/* These are the core datastructures used by the register allocator. First243* ra_interval and ra_file, which are used for intra-block tracking and use244* the ir3_reg_ctx infrastructure:245*/246247struct ra_interval {248struct ir3_reg_interval interval;249250struct rb_node physreg_node;251physreg_t physreg_start, physreg_end;252253/* True if this is a source of the current instruction which is entirely254* killed. This means we can allocate the dest over it, but we can't break255* it up.256*/257bool is_killed;258259/* True if this interval cannot be moved from its position. 
This is only260* used for precolored inputs to ensure that other inputs don't get261* allocated on top of them.262*/263bool frozen;264};265266struct ra_file {267struct ir3_reg_ctx reg_ctx;268269BITSET_DECLARE(available, RA_MAX_FILE_SIZE);270BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);271272struct rb_tree physreg_intervals;273274unsigned size;275unsigned start;276};277278/* State for inter-block tracking. When we split a live range to make space279* for a vector, we may need to insert fixup code when a block has multiple280* predecessors that have moved the same live value to different registers.281* This keeps track of state required to do that.282*/283284struct ra_block_state {285/* Map of defining ir3_register -> physreg it was allocated to at the end286* of the block.287*/288struct hash_table *renames;289290/* For loops, we need to process a block before all its predecessors have291* been processed. In particular, we need to pick registers for values292* without knowing if all the predecessors have been renamed. This keeps293* track of the registers we chose so that when we visit the back-edge we294* can move them appropriately. If all predecessors have been visited295* before this block is visited then we don't need to fill this out. This296* is a map from ir3_register -> physreg.297*/298struct hash_table *entry_regs;299300/* True if the block has been visited and "renames" is complete.301*/302bool visited;303304/* True if the block is unreachable via the logical CFG. This happens for305* blocks after an if where both sides end in a break/continue. We ignore306* it for everything but shared registers.307*/308bool logical_unreachable;309};310311struct ra_parallel_copy {312struct ra_interval *interval;313physreg_t src;314};315316/* The main context: */317318struct ra_ctx {319/* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom320* half of this file too.321*/322struct ra_file full;323324/* hr0.x - hr63.w, only used without merged-regs. */325struct ra_file half;326327/* Shared regs. */328struct ra_file shared;329330struct ir3 *ir;331332struct ir3_liveness *live;333334struct ir3_block *block;335336const struct ir3_compiler *compiler;337gl_shader_stage stage;338339/* Pending moves of top-level intervals that will be emitted once we're340* finished:341*/342DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies);343344struct ra_interval *intervals;345struct ra_block_state *blocks;346347bool merged_regs;348};349350#define foreach_interval(interval, file) \351rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals, \352physreg_node)353#define foreach_interval_rev(interval, file) \354rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals, \355physreg_node)356#define foreach_interval_safe(interval, file) \357rb_tree_foreach_safe (struct ra_interval, interval, \358&(file)->physreg_intervals, physreg_node)359#define foreach_interval_rev_safe(interval, file) \360rb_tree_foreach_rev_safe(struct ra_interval, interval, \361&(file)->physreg_intervals, physreg_node)362363static struct ra_interval *364rb_node_to_interval(struct rb_node *node)365{366return rb_node_data(struct ra_interval, node, physreg_node);367}368369static const struct ra_interval *370rb_node_to_interval_const(const struct rb_node *node)371{372return rb_node_data(struct ra_interval, node, physreg_node);373}374375static struct ra_interval *376ra_interval_next(struct ra_interval *interval)377{378struct rb_node *next = rb_node_next(&interval->physreg_node);379return next ? 
rb_node_to_interval(next) : NULL;380}381382static struct ra_interval *383ra_interval_next_or_null(struct ra_interval *interval)384{385return interval ? ra_interval_next(interval) : NULL;386}387388static int389ra_interval_cmp(const struct rb_node *node, const void *data)390{391physreg_t reg = *(const physreg_t *)data;392const struct ra_interval *interval = rb_node_to_interval_const(node);393if (interval->physreg_start > reg)394return -1;395else if (interval->physreg_end <= reg)396return 1;397else398return 0;399}400401static struct ra_interval *402ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)403{404struct rb_node *node = rb_tree_search_sloppy(tree, ®, ra_interval_cmp);405return node ? rb_node_to_interval(node) : NULL;406}407408/* Get the interval covering the reg, or the closest to the right if it409* doesn't exist.410*/411static struct ra_interval *412ra_interval_search_right(struct rb_tree *tree, physreg_t reg)413{414struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);415if (!interval) {416return NULL;417} else if (interval->physreg_end > reg) {418return interval;419} else {420/* There is no interval covering reg, and ra_file_search_sloppy()421* returned the closest range to the left, so the next interval to the422* right should be the closest to the right.423*/424return ra_interval_next_or_null(interval);425}426}427428static struct ra_interval *429ra_file_search_right(struct ra_file *file, physreg_t reg)430{431return ra_interval_search_right(&file->physreg_intervals, reg);432}433434static int435ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)436{437const struct ra_interval *a = rb_node_to_interval_const(_a);438const struct ra_interval *b = rb_node_to_interval_const(_b);439return b->physreg_start - a->physreg_start;440}441442static struct ra_interval *443ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)444{445return rb_node_data(struct ra_interval, interval, interval);446}447448static struct ra_file *449ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx)450{451return rb_node_data(struct ra_file, ctx, reg_ctx);452}453454static void455interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)456{457struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);458struct ra_file *file = ir3_reg_ctx_to_file(ctx);459460/* We can assume in this case that physreg_start/physreg_end is already461* initialized.462*/463for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {464BITSET_CLEAR(file->available, i);465BITSET_CLEAR(file->available_to_evict, i);466}467468rb_tree_insert(&file->physreg_intervals, &interval->physreg_node,469ra_interval_insert_cmp);470}471472static void473interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)474{475struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);476struct ra_file *file = ir3_reg_ctx_to_file(ctx);477478for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {479BITSET_SET(file->available, i);480BITSET_SET(file->available_to_evict, i);481}482483rb_tree_remove(&file->physreg_intervals, &interval->physreg_node);484}485486static void487interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,488struct ir3_reg_interval *_child)489{490struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);491struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);492493child->physreg_start =494parent->physreg_start + (child->interval.reg->interval_start 
-495parent->interval.reg->interval_start);496child->physreg_end =497child->physreg_start +498(child->interval.reg->interval_end - child->interval.reg->interval_start);499500interval_add(ctx, _child);501}502503static void504ra_file_init(struct ra_file *file)505{506for (unsigned i = 0; i < file->size; i++) {507BITSET_SET(file->available, i);508BITSET_SET(file->available_to_evict, i);509}510511file->start = 0;512513rb_tree_init(&file->reg_ctx.intervals);514rb_tree_init(&file->physreg_intervals);515516file->reg_ctx.interval_add = interval_add;517file->reg_ctx.interval_delete = interval_delete;518file->reg_ctx.interval_readd = interval_readd;519}520521static void522ra_file_insert(struct ra_file *file, struct ra_interval *interval)523{524assert(interval->physreg_start < interval->physreg_end);525assert(interval->physreg_end <= file->size);526if (interval->interval.reg->flags & IR3_REG_HALF)527assert(interval->physreg_end <= RA_HALF_SIZE);528529ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);530}531532static void533ra_file_remove(struct ra_file *file, struct ra_interval *interval)534{535ir3_reg_interval_remove(&file->reg_ctx, &interval->interval);536}537538static void539ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval)540{541assert(!interval->interval.parent);542543for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {544BITSET_SET(file->available, i);545}546547interval->is_killed = true;548}549550static physreg_t551ra_interval_get_physreg(const struct ra_interval *interval)552{553unsigned child_start = interval->interval.reg->interval_start;554555while (interval->interval.parent) {556interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);557}558559return interval->physreg_start +560(child_start - interval->interval.reg->interval_start);561}562563static unsigned564ra_interval_get_num(const struct ra_interval *interval)565{566return ra_physreg_to_num(ra_interval_get_physreg(interval),567interval->interval.reg->flags);568}569570static void571ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)572{573ir3_reg_interval_init(&interval->interval, reg);574interval->is_killed = false;575interval->frozen = false;576}577578static void579ra_interval_dump(struct ra_interval *interval)580{581printf("physreg %u ", interval->physreg_start);582583ir3_reg_interval_dump(&interval->interval);584}585586static void587ra_file_dump(struct ra_file *file)588{589rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,590physreg_node) {591ra_interval_dump(interval);592}593594unsigned start, end;595printf("available:\n");596BITSET_FOREACH_RANGE (start, end, file->available, file->size) {597printf("%u-%u ", start, end);598}599printf("\n");600601printf("available to evict:\n");602BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) {603printf("%u-%u ", start, end);604}605printf("\n");606printf("start: %u\n", file->start);607}608609static void610ra_ctx_dump(struct ra_ctx *ctx)611{612printf("full:\n");613ra_file_dump(&ctx->full);614printf("half:\n");615ra_file_dump(&ctx->half);616printf("shared:\n");617ra_file_dump(&ctx->shared);618}619620static unsigned621reg_file_size(struct ra_file *file, struct ir3_register *reg)622{623/* Half-regs can only take up the first half of the combined regfile */624if (reg->flags & IR3_REG_HALF)625return MIN2(file->size, RA_HALF_SIZE);626else627return file->size;628}629630/* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple631* top-level intervals 
at once. Pop multiple intervals, then push them back in632* any order.633*/634635struct ra_removed_interval {636struct ra_interval *interval;637unsigned size;638};639640static struct ra_removed_interval641ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,642struct ra_interval *interval)643{644assert(!interval->interval.parent);645646/* Check if we've already moved this reg before */647unsigned pcopy_index;648for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count;649pcopy_index++) {650if (ctx->parallel_copies[pcopy_index].interval == interval)651break;652}653654if (pcopy_index == ctx->parallel_copies_count) {655array_insert(ctx, ctx->parallel_copies,656(struct ra_parallel_copy){657.interval = interval,658.src = interval->physreg_start,659});660}661662ir3_reg_interval_remove_all(&file->reg_ctx, &interval->interval);663664return (struct ra_removed_interval){665.interval = interval,666.size = interval->physreg_end - interval->physreg_start,667};668}669670static void671ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,672const struct ra_removed_interval *removed, physreg_t dst)673{674struct ra_interval *interval = removed->interval;675676interval->physreg_start = dst;677interval->physreg_end = dst + removed->size;678679ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);680}681682/* Pick up the interval and place it at "dst". */683static void684ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,685struct ra_interval *interval, physreg_t dst)686{687struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval);688ra_push_interval(ctx, file, &temp, dst);689}690691static bool692get_reg_specified(struct ra_file *file, struct ir3_register *reg,693physreg_t physreg, bool is_source)694{695for (unsigned i = 0; i < reg_size(reg); i++) {696if (!BITSET_TEST(is_source ? file->available_to_evict : file->available,697physreg + i))698return false;699}700701return true;702}703704/* Try to evict any registers conflicting with the proposed spot "physreg" for705* "reg". 
That is, move them to other places so that we can allocate "physreg"706* here.707*/708709static bool710try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,711struct ir3_register *reg, physreg_t physreg,712unsigned *_eviction_count, bool is_source, bool speculative)713{714BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);715memcpy(available_to_evict, file->available_to_evict,716sizeof(available_to_evict));717718for (unsigned i = 0; i < reg_size(reg); i++)719BITSET_CLEAR(available_to_evict, physreg + i);720721unsigned eviction_count = 0;722/* Iterate over each range conflicting with physreg */723for (struct ra_interval *conflicting = ra_file_search_right(file, physreg),724*next = ra_interval_next_or_null(conflicting);725conflicting != NULL &&726conflicting->physreg_start < physreg + reg_size(reg);727conflicting = next, next = ra_interval_next_or_null(next)) {728if (!is_source && conflicting->is_killed)729continue;730731if (conflicting->frozen) {732assert(speculative);733return false;734}735736unsigned avail_start, avail_end;737bool evicted = false;738BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict,739reg_file_size(file, conflicting->interval.reg)) {740unsigned size = avail_end - avail_start;741742/* non-half registers must be aligned */743if (!(conflicting->interval.reg->flags & IR3_REG_HALF) &&744avail_start % 2 == 1) {745avail_start++;746size--;747}748749if (size >= conflicting->physreg_end - conflicting->physreg_start) {750for (unsigned i = 0;751i < conflicting->physreg_end - conflicting->physreg_start; i++)752BITSET_CLEAR(available_to_evict, avail_start + i);753eviction_count +=754conflicting->physreg_end - conflicting->physreg_start;755if (!speculative)756ra_move_interval(ctx, file, conflicting, avail_start);757evicted = true;758break;759}760}761762if (!evicted)763return false;764}765766*_eviction_count = eviction_count;767return true;768}769770static int771removed_interval_cmp(const void *_i1, const void *_i2)772{773const struct ra_removed_interval *i1 = _i1;774const struct ra_removed_interval *i2 = _i2;775776/* We sort the registers as follows:777*778* |--------------------------------------------------------------------|779* | | | | |780* | Half live-through | Half killed | Full killed | Full live-through |781* | | | | |782* |--------------------------------------------------------------------|783* | |784* | Destination |785* | |786* |-----------------|787*788* Half-registers have to be first so that they stay in the low half of789* the register file. Then half and full killed must stay together so that790* there's a contiguous range where we can put the register. With this791* structure we should be able to accomodate any collection of intervals792* such that the total number of half components is within the half limit793* and the combined components are within the full limit.794*/795796unsigned i1_align = reg_elem_size(i1->interval->interval.reg);797unsigned i2_align = reg_elem_size(i2->interval->interval.reg);798if (i1_align > i2_align)799return 1;800if (i1_align < i2_align)801return -1;802803if (i1_align == 1) {804if (i2->interval->is_killed)805return -1;806if (i1->interval->is_killed)807return 1;808} else {809if (i2->interval->is_killed)810return 1;811if (i1->interval->is_killed)812return -1;813}814815return 0;816}817818/* "Compress" all the live intervals so that there is enough space for the819* destination register. 
As there can be gaps when a more-aligned interval820* follows a less-aligned interval, this also sorts them to remove such821* "padding", which may be required when space is very tight. This isn't822* amazing, but should be used only as a last resort in case the register file823* is almost full and badly fragmented.824*825* Return the physreg to use.826*/827static physreg_t828compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size,829unsigned align, bool is_source)830{831DECLARE_ARRAY(struct ra_removed_interval, intervals);832intervals_count = intervals_sz = 0;833intervals = NULL;834835unsigned removed_full_size = 0;836unsigned removed_half_size = 0;837unsigned file_size =838align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;839physreg_t start_reg = 0;840841foreach_interval_rev_safe (interval, file) {842/* Check if we can sort the intervals *after* this one and have843* enough space leftover to accomodate "size" units.844*/845if (align == 1) {846if (interval->physreg_end + removed_half_size <= file_size - size) {847start_reg = interval->physreg_end;848break;849}850} else {851if (interval->physreg_end + removed_half_size <=852file_size - removed_full_size - size) {853start_reg = interval->physreg_end;854break;855}856}857858/* We assume that all frozen intervals are at the start and that we859* can avoid popping them.860*/861assert(!interval->frozen);862863/* Killed sources don't count because they go at the end and can864* overlap the register we're trying to add.865*/866if (!interval->is_killed && !is_source) {867if (interval->interval.reg->flags & IR3_REG_HALF)868removed_half_size +=869interval->physreg_end - interval->physreg_start;870else871removed_full_size +=872interval->physreg_end - interval->physreg_start;873}874875/* Now that we've done the accounting, pop this off */876d("popping interval %u physreg %u\n", interval->interval.reg->name,877interval->physreg_start);878array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));879}880881/* TODO: In addition to skipping registers at the beginning that are882* well-packed, we should try to skip registers at the end.883*/884885qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);886887physreg_t physreg = start_reg;888physreg_t ret_reg = (physreg_t)~0;889for (unsigned i = 0; i < intervals_count; i++) {890if (ret_reg == (physreg_t)~0 &&891((intervals[i].interval->is_killed && !is_source) ||892!(intervals[i].interval->interval.reg->flags & IR3_REG_HALF))) {893ret_reg = ALIGN(physreg, align);894}895896if (ret_reg != (physreg_t)~0 &&897(is_source || !intervals[i].interval->is_killed)) {898physreg = MAX2(physreg, ret_reg + size);899}900901if (!(intervals[i].interval->interval.reg->flags & IR3_REG_HALF)) {902physreg = ALIGN(physreg, 2);903}904905if (physreg + intervals[i].size >906reg_file_size(file, intervals[i].interval->interval.reg)) {907d("ran out of room for interval %u!\n",908intervals[i].interval->interval.reg->name);909unreachable("reg pressure calculation was wrong!");910return 0;911}912913d("pushing interval %u physreg %u\n",914intervals[i].interval->interval.reg->name, physreg);915ra_push_interval(ctx, file, &intervals[i], physreg);916917physreg += intervals[i].size;918}919920if (ret_reg == (physreg_t)~0)921ret_reg = physreg;922923ret_reg = ALIGN(ret_reg, align);924if (ret_reg + size > file_size) {925d("ran out of room for the new interval!\n");926unreachable("reg pressure calculation was wrong!");927return 0;928}929930return ret_reg;931}932933static 
void934update_affinity(struct ir3_register *reg, physreg_t physreg)935{936if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0)937return;938939if (physreg < reg->merge_set_offset)940return;941942reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;943}944945/* Try to find free space for a register without shuffling anything. This uses946* a round-robin algorithm to reduce false dependencies.947*/948static physreg_t949find_best_gap(struct ra_file *file, unsigned file_size, unsigned size,950unsigned align, bool is_source)951{952BITSET_WORD *available =953is_source ? file->available_to_evict : file->available;954955unsigned start = ALIGN(file->start, align) % (file_size - size + align);956unsigned candidate = start;957do {958bool is_available = true;959for (unsigned i = 0; i < size; i++) {960if (!BITSET_TEST(available, candidate + i)) {961is_available = false;962break;963}964}965966if (is_available) {967file->start = (candidate + size) % file_size;968return candidate;969}970971candidate += align;972if (candidate + size > file_size)973candidate = 0;974} while (candidate != start);975976return (physreg_t)~0;977}978979static struct ra_file *980ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)981{982if (reg->flags & IR3_REG_SHARED)983return &ctx->shared;984else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))985return &ctx->full;986else987return &ctx->half;988}989990/* This is the main entrypoint for picking a register. Pick a free register991* for "reg", shuffling around sources if necessary. In the normal case where992* "is_source" is false, this register can overlap with killed sources993* (intervals with "is_killed == true"). If "is_source" is true, then994* is_killed is ignored and the register returned must not overlap with killed995* sources. 
This must be used for tied registers, because we're actually996* allocating the destination and the tied source at the same time.997*/998999static physreg_t1000get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,1001bool is_source)1002{1003unsigned file_size = reg_file_size(file, reg);1004if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {1005physreg_t preferred_reg =1006reg->merge_set->preferred_reg + reg->merge_set_offset;1007if (preferred_reg < file_size &&1008preferred_reg % reg_elem_size(reg) == 0 &&1009get_reg_specified(file, reg, preferred_reg, is_source))1010return preferred_reg;1011}10121013/* If this register is a subset of a merge set which we have not picked a1014* register for, first try to allocate enough space for the entire merge1015* set.1016*/1017unsigned size = reg_size(reg);1018if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&1019size < reg->merge_set->size) {1020physreg_t best_reg = find_best_gap(file, file_size, reg->merge_set->size,1021reg->merge_set->alignment, is_source);1022if (best_reg != (physreg_t)~0u) {1023best_reg += reg->merge_set_offset;1024return best_reg;1025}1026}10271028/* For ALU and SFU instructions, if the src reg is avail to pick, use it.1029* Because this doesn't introduce unnecessary dependencies, and it1030* potentially avoids needing (ss) syncs for write after read hazards for1031* SFU instructions:1032*/1033if (is_sfu(reg->instr) || is_alu(reg->instr)) {1034for (unsigned i = 0; i < reg->instr->srcs_count; i++) {1035struct ir3_register *src = reg->instr->srcs[i];1036if (!ra_reg_is_src(src))1037continue;1038if (ra_get_file(ctx, src) == file && reg_size(src) >= size) {1039struct ra_interval *src_interval = &ctx->intervals[src->def->name];1040physreg_t src_physreg = ra_interval_get_physreg(src_interval);1041if (src_physreg % reg_elem_size(reg) == 0 &&1042src_physreg + size <= file_size &&1043get_reg_specified(file, reg, src_physreg, is_source))1044return src_physreg;1045}1046}1047}10481049physreg_t best_reg =1050find_best_gap(file, file_size, size, reg_elem_size(reg), is_source);1051if (best_reg != (physreg_t)~0u) {1052return best_reg;1053}10541055/* Ok, we couldn't find anything that fits. Here is where we have to start1056* moving things around to make stuff fit. First try solely evicting1057* registers in the way.1058*/1059unsigned best_eviction_count = ~0;1060for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {1061unsigned eviction_count;1062if (try_evict_regs(ctx, file, reg, i, &eviction_count, is_source, true)) {1063if (eviction_count < best_eviction_count) {1064best_eviction_count = eviction_count;1065best_reg = i;1066}1067}1068}10691070if (best_eviction_count != ~0) {1071ASSERTED bool result = try_evict_regs(1072ctx, file, reg, best_reg, &best_eviction_count, is_source, false);1073assert(result);1074return best_reg;1075}10761077/* Use the dumb fallback only if try_evict_regs() fails. 
*/1078return compress_regs_left(ctx, file, reg_size(reg), reg_elem_size(reg),1079is_source);1080}10811082static void1083assign_reg(struct ir3_instruction *instr, struct ir3_register *reg,1084unsigned num)1085{1086if (reg->flags & IR3_REG_ARRAY) {1087reg->array.base = num;1088if (reg->flags & IR3_REG_RELATIV)1089reg->array.offset += num;1090else1091reg->num = num + reg->array.offset;1092} else {1093reg->num = num;1094}1095}10961097static void1098mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src)1099{1100struct ra_interval *interval = &ctx->intervals[src->def->name];11011102if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed ||1103interval->interval.parent ||1104!rb_tree_is_empty(&interval->interval.children))1105return;11061107ra_file_mark_killed(ra_get_file(ctx, src), interval);1108}11091110static void1111insert_dst(struct ra_ctx *ctx, struct ir3_register *dst)1112{1113struct ra_file *file = ra_get_file(ctx, dst);1114struct ra_interval *interval = &ctx->intervals[dst->name];11151116d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval));11171118if (!(dst->flags & IR3_REG_UNUSED))1119ra_file_insert(file, interval);11201121assign_reg(dst->instr, dst, ra_interval_get_num(interval));1122}11231124static void1125allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst,1126physreg_t physreg)1127{1128struct ra_interval *interval = &ctx->intervals[dst->name];1129update_affinity(dst, physreg);11301131ra_interval_init(interval, dst);1132interval->physreg_start = physreg;1133interval->physreg_end = physreg + reg_size(dst);1134}11351136static void1137allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)1138{1139struct ra_file *file = ra_get_file(ctx, dst);11401141struct ir3_register *tied = dst->tied;1142if (tied) {1143struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];1144struct ra_interval *dst_interval = &ctx->intervals[dst->name];1145physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);1146if (tied_interval->is_killed) {1147/* The easy case: the source is killed, so we can just reuse it1148* for the destination.1149*/1150allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));1151} else {1152/* The source is live-through, so we need to get a free register1153* (which is free for both the source and destination!), copy the1154* original source to it, then use that for the source and1155* destination.1156*/1157physreg_t physreg = get_reg(ctx, file, dst, true);1158allocate_dst_fixed(ctx, dst, physreg);1159array_insert(ctx, ctx->parallel_copies,1160(struct ra_parallel_copy){1161.interval = dst_interval,1162.src = tied_physreg,1163});1164}11651166return;1167}11681169/* All the hard work is done by get_reg here. 
*/1170physreg_t physreg = get_reg(ctx, file, dst, false);11711172allocate_dst_fixed(ctx, dst, physreg);1173}11741175static void1176assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr,1177struct ir3_register *src)1178{1179struct ra_interval *interval = &ctx->intervals[src->def->name];1180struct ra_file *file = ra_get_file(ctx, src);11811182struct ir3_register *tied = src->tied;1183physreg_t physreg;1184if (tied) {1185struct ra_interval *tied_interval = &ctx->intervals[tied->name];1186physreg = ra_interval_get_physreg(tied_interval);1187} else {1188physreg = ra_interval_get_physreg(interval);1189}11901191assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags));11921193if (src->flags & IR3_REG_FIRST_KILL)1194ra_file_remove(file, interval);1195}11961197/* Insert a parallel copy instruction before the instruction with the parallel1198* copy entries we've built up.1199*/1200static void1201insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)1202{1203if (ctx->parallel_copies_count == 0)1204return;12051206struct ir3_instruction *pcopy =1207ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY,1208ctx->parallel_copies_count, ctx->parallel_copies_count);12091210for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {1211struct ra_parallel_copy *entry = &ctx->parallel_copies[i];1212struct ir3_register *reg =1213ir3_dst_create(pcopy, INVALID_REG,1214entry->interval->interval.reg->flags & ~IR3_REG_SSA);1215reg->size = entry->interval->interval.reg->size;1216reg->wrmask = entry->interval->interval.reg->wrmask;1217assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));1218}12191220for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {1221struct ra_parallel_copy *entry = &ctx->parallel_copies[i];1222struct ir3_register *reg =1223ir3_src_create(pcopy, INVALID_REG,1224entry->interval->interval.reg->flags & ~IR3_REG_SSA);1225reg->size = entry->interval->interval.reg->size;1226reg->wrmask = entry->interval->interval.reg->wrmask;1227assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));1228}12291230list_del(&pcopy->node);1231list_addtail(&pcopy->node, &instr->node);1232ctx->parallel_copies_count = 0;1233}12341235static void1236handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)1237{1238/* First, mark sources as going-to-be-killed while allocating the dest. */1239ra_foreach_src (src, instr) {1240mark_src_killed(ctx, src);1241}12421243/* Allocate the destination. */1244ra_foreach_dst (dst, instr) {1245allocate_dst(ctx, dst);1246}12471248/* Now handle sources. Go backward so that in case there are multiple1249* sources with the same def and that def is killed we only remove it at1250* the end.1251*/1252ra_foreach_src_rev (src, instr) {1253assign_src(ctx, instr, src);1254}12551256/* Now finally insert the destination into the map. 
*/1257ra_foreach_dst (dst, instr) {1258insert_dst(ctx, dst);1259}12601261insert_parallel_copy_instr(ctx, instr);1262}12631264static void1265handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)1266{1267struct ir3_register *dst = instr->dsts[0];1268struct ir3_register *src = instr->srcs[0];12691270if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {1271handle_normal_instr(ctx, instr);1272return;1273}12741275struct ra_interval *src_interval = &ctx->intervals[src->def->name];12761277physreg_t physreg = ra_interval_get_physreg(src_interval);1278assign_src(ctx, instr, src);12791280allocate_dst_fixed(1281ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset);1282insert_dst(ctx, dst);1283}12841285static void1286handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)1287{1288struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;1289unsigned dst_offset = instr->dsts[0]->merge_set_offset;12901291if (!dst_set || dst_set->regs_count == 1) {1292handle_normal_instr(ctx, instr);1293return;1294}12951296/* We need to check if any of the sources are contained in an interval1297* that is at least as large as the vector. In this case, we should put1298* the vector inside that larger interval. (There should be one1299* unambiguous place to put it, because values sharing the same merge set1300* should be allocated together.) This can happen in a case like:1301*1302* ssa_1 (wrmask=0xf) = ...1303* ssa_2 = split ssa_1 off:01304* ssa_3 = split ssa_1 off:11305* ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_31306* ... = (kill)ssa_11307* ... = (kill)ssa_41308*1309* ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it.1310*/1311physreg_t dst_fixed = (physreg_t)~0u;13121313for (unsigned i = 0; i < instr->srcs_count; i++) {1314if (!ra_reg_is_src(instr->srcs[i]))1315continue;13161317if (instr->srcs[i]->flags & IR3_REG_FIRST_KILL) {1318mark_src_killed(ctx, instr->srcs[i]);1319}13201321struct ir3_register *src = instr->srcs[i];1322struct ra_interval *interval = &ctx->intervals[src->def->name];13231324if (src->def->merge_set != dst_set || interval->is_killed)1325continue;1326while (interval->interval.parent != NULL) {1327interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);1328}1329if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) {1330dst_fixed = interval->physreg_start -1331interval->interval.reg->merge_set_offset + dst_offset;1332} else {1333/* For sources whose root interval is smaller than the1334* destination (i.e. the normal case), we will shuffle them1335* around after allocating the destination. 
Mark them killed so1336* that the destination can be allocated over them, even if they1337* aren't actually killed.1338*/1339ra_file_mark_killed(ra_get_file(ctx, src), interval);1340}1341}13421343if (dst_fixed != (physreg_t)~0u)1344allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed);1345else1346allocate_dst(ctx, instr->dsts[0]);13471348/* Remove the temporary is_killed we added */1349for (unsigned i = 0; i < instr->srcs_count; i++) {1350if (!ra_reg_is_src(instr->srcs[i]))1351continue;13521353struct ir3_register *src = instr->srcs[i];1354struct ra_interval *interval = &ctx->intervals[src->def->name];1355while (interval->interval.parent != NULL) {1356interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);1357}13581359/* Filter out cases where it actually should be killed */1360if (interval != &ctx->intervals[src->def->name] ||1361!(src->flags & IR3_REG_KILL))1362interval->is_killed = false;1363}13641365ra_foreach_src_rev (src, instr) {1366assign_src(ctx, instr, src);1367}13681369/* We need to do this before insert_dst(), so that children of the1370* destination which got marked as killed and then shuffled around to make1371* space for the destination have the correct pcopy destination that1372* matches what we assign the source of the collect to in assign_src().1373*1374* TODO: In this case we'll wind up copying the value in the pcopy and1375* then again in the collect. We could avoid one of those by updating the1376* pcopy destination to match up with the final location of the source1377* after the collect and making the collect a no-op. However this doesn't1378* seem to happen often.1379*/1380insert_parallel_copy_instr(ctx, instr);13811382/* Note: insert_dst will automatically shuffle around any intervals that1383* are a child of the collect by making them children of the collect.1384*/13851386insert_dst(ctx, instr->dsts[0]);1387}13881389/* Parallel copies before RA should only be at the end of the block, for1390* phi's. For these we only need to fill in the sources, and then we fill in1391* the destinations in the successor block.1392*/1393static void1394handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)1395{1396ra_foreach_src_rev (src, instr) {1397assign_src(ctx, instr, src);1398}1399}14001401/* Some inputs may need to be precolored. We need to handle those first, so1402* that other non-precolored inputs don't accidentally get allocated over1403* them. 
Inputs are the very first thing in the shader, so it shouldn't be a1404* problem to allocate them to a specific physreg.1405*/14061407static void1408handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)1409{1410if (instr->dsts[0]->num == INVALID_REG)1411return;14121413struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];1414physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]);1415allocate_dst_fixed(ctx, instr->dsts[0], physreg);1416insert_dst(ctx, instr->dsts[0]);1417interval->frozen = true;1418}14191420static void1421handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)1422{1423if (instr->dsts[0]->num != INVALID_REG)1424return;14251426allocate_dst(ctx, instr->dsts[0]);14271428struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);1429struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];1430ra_file_insert(file, interval);1431}14321433static void1434assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)1435{1436struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];1437struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);14381439if (instr->dsts[0]->num == INVALID_REG) {1440assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval));1441} else {1442interval->frozen = false;1443}14441445if (instr->dsts[0]->flags & IR3_REG_UNUSED)1446ra_file_remove(file, interval);14471448ra_foreach_src_rev (src, instr)1449assign_src(ctx, instr, src);1450}14511452/* chmask is a bit weird, because it has pre-colored sources due to the need1453* to pass some registers to the next stage. Fortunately there are only at1454* most two, and there should be no other live values by the time we get to1455* this instruction, so we only have to do the minimum and don't need any1456* fancy fallbacks.1457*1458* TODO: Add more complete handling of precolored sources, e.g. for function1459* argument handling. We'd need a way to mark sources as fixed so that they1460* don't get moved around when placing other sources in the fallback case, and1461* a duplication of much of the logic in get_reg(). This also opens another1462* can of worms, e.g. what if the precolored source is a split of a vector1463* which is still live -- this breaks our assumption that splits don't incur1464* any "extra" register requirements and we'd have to break it out of the1465* parent ra_interval.1466*/14671468static void1469handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src)1470{1471struct ra_file *file = ra_get_file(ctx, src);1472struct ra_interval *interval = &ctx->intervals[src->def->name];1473physreg_t physreg = ra_reg_get_physreg(src);14741475if (ra_interval_get_num(interval) == src->num)1476return;14771478/* Try evicting stuff in our way if it isn't free. This won't move1479* anything unless it overlaps with our precolored physreg, so we don't1480* have to worry about evicting other precolored sources.1481*/1482if (!get_reg_specified(file, src, physreg, true)) {1483unsigned eviction_count;1484if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true,1485false)) {1486unreachable("failed to evict for precolored source!");1487return;1488}1489}14901491ra_move_interval(ctx, file, interval, physreg);1492}14931494static void1495handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr)1496{1497/* Note: we purposely don't mark sources as killed, so that we can reuse1498* some of the get_reg() machinery as-if the source is a destination.1499* Marking it as killed would make e.g. 
get_reg_specified() wouldn't work1500* correctly.1501*/1502ra_foreach_src (src, instr) {1503assert(src->num != INVALID_REG);1504handle_precolored_source(ctx, src);1505}15061507ra_foreach_src (src, instr) {1508struct ra_file *file = ra_get_file(ctx, src);1509struct ra_interval *interval = &ctx->intervals[src->def->name];1510if (src->flags & IR3_REG_FIRST_KILL)1511ra_file_remove(file, interval);1512}15131514insert_parallel_copy_instr(ctx, instr);1515}15161517static physreg_t1518read_register(struct ra_ctx *ctx, struct ir3_block *block,1519struct ir3_register *def)1520{1521struct ra_block_state *state = &ctx->blocks[block->index];1522if (state->renames) {1523struct hash_entry *entry = _mesa_hash_table_search(state->renames, def);1524if (entry) {1525return (physreg_t)(uintptr_t)entry->data;1526}1527}15281529return ra_reg_get_physreg(def);1530}15311532static void1533handle_live_in(struct ra_ctx *ctx, struct ir3_register *def)1534{1535physreg_t physreg = ~0;1536for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {1537struct ir3_block *pred = ctx->block->predecessors[i];1538struct ra_block_state *pred_state = &ctx->blocks[pred->index];15391540if (!pred_state->visited ||1541(pred_state->logical_unreachable && !(def->flags & IR3_REG_SHARED)))1542continue;15431544physreg = read_register(ctx, pred, def);1545break;1546}15471548assert(physreg != (physreg_t)~0);15491550struct ra_interval *interval = &ctx->intervals[def->name];1551struct ra_file *file = ra_get_file(ctx, def);1552ra_interval_init(interval, def);1553interval->physreg_start = physreg;1554interval->physreg_end = physreg + reg_size(def);1555ra_file_insert(file, interval);1556}15571558static void1559handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)1560{1561/* Skip parallelcopy's which in the original program are only used as phi1562* arguments. Even though phi arguments are live out, they are only1563* assigned when the phi is.1564*/1565if (def->instr->opc == OPC_META_PARALLEL_COPY)1566return;15671568struct ra_block_state *state = &ctx->blocks[ctx->block->index];1569struct ra_interval *interval = &ctx->intervals[def->name];1570physreg_t physreg = ra_interval_get_physreg(interval);1571if (physreg != ra_reg_get_physreg(def)) {1572if (!state->renames)1573state->renames = _mesa_pointer_hash_table_create(ctx);1574_mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg);1575}1576}15771578static void1579handle_phi(struct ra_ctx *ctx, struct ir3_register *def)1580{1581struct ra_file *file = ra_get_file(ctx, def);1582struct ra_interval *interval = &ctx->intervals[def->name];15831584/* phis are always scalar, so they should already be the smallest possible1585* size. 
However they may be coalesced with other live-in values/phi1586* nodes, so check for that here.1587*/1588struct ir3_reg_interval *parent_ir3 =1589ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start);1590physreg_t physreg;1591if (parent_ir3) {1592struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3);1593physreg = ra_interval_get_physreg(parent) +1594(def->interval_start - parent_ir3->reg->interval_start);1595} else {1596physreg = get_reg(ctx, file, def, false);1597}15981599allocate_dst_fixed(ctx, def, physreg);16001601ra_file_insert(file, interval);1602}16031604static void1605assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)1606{1607struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);1608struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];1609assert(!interval->interval.parent);1610unsigned num = ra_interval_get_num(interval);1611assign_reg(phi, phi->dsts[0], num);16121613/* Assign the parallelcopy sources of this phi */1614for (unsigned i = 0; i < phi->srcs_count; i++) {1615if (phi->srcs[i]->def) {1616assign_reg(phi, phi->srcs[i], num);1617assign_reg(phi, phi->srcs[i]->def, num);1618}1619}16201621if (phi->dsts[0]->flags & IR3_REG_UNUSED)1622ra_file_remove(file, interval);1623}16241625/* When we split a live range, we sometimes need to emit fixup code at the end1626* of a block. For example, something like:1627*1628* a = ...1629* if (...) {1630* ...1631* a' = a1632* b = ... // a evicted to make room for b1633* ...1634* }1635* ... = a1636*1637* When we insert the copy to a' in insert_parallel_copy_instr(), this forces1638* to insert another copy "a = a'" at the end of the if. Normally this would1639* also entail adding a phi node, but since we're about to go out of SSA1640* anyway we just insert an extra move. Note, however, that "b" might be used1641* in a phi node at the end of the if and share registers with "a", so we1642* have to be careful to extend any preexisting parallelcopy instruction1643* instead of creating our own in order to guarantee that they properly get1644* swapped.1645*/16461647static void1648insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,1649struct ir3_register *reg)1650{1651struct ir3_instruction *old_pcopy = NULL;1652if (!list_is_empty(&block->instr_list)) {1653struct ir3_instruction *last =1654LIST_ENTRY(struct ir3_instruction, block->instr_list.prev, node);1655if (last->opc == OPC_META_PARALLEL_COPY)1656old_pcopy = last;1657}16581659unsigned old_pcopy_srcs = old_pcopy ? 
old_pcopy->srcs_count : 0;1660struct ir3_instruction *pcopy = ir3_instr_create(1661block, OPC_META_PARALLEL_COPY, old_pcopy_srcs + 1, old_pcopy_srcs + 1);16621663for (unsigned i = 0; i < old_pcopy_srcs; i++) {1664old_pcopy->dsts[i]->instr = pcopy;1665pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i];1666}16671668struct ir3_register *dst_reg =1669ir3_dst_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);1670dst_reg->wrmask = reg->wrmask;1671dst_reg->size = reg->size;1672assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags));16731674for (unsigned i = 0; i < old_pcopy_srcs; i++) {1675pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i];1676}16771678struct ir3_register *src_reg =1679ir3_src_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);1680src_reg->wrmask = reg->wrmask;1681src_reg->size = reg->size;1682assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags));16831684if (old_pcopy)1685list_del(&old_pcopy->node);1686}16871688static void1689insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)1690{1691physreg_t physreg = ra_interval_get_physreg(interval);16921693bool shared = interval->interval.reg->flags & IR3_REG_SHARED;1694struct ir3_block **predecessors =1695shared ? ctx->block->physical_predecessors : ctx->block->predecessors;1696unsigned predecessors_count = shared1697? ctx->block->physical_predecessors_count1698: ctx->block->predecessors_count;16991700for (unsigned i = 0; i < predecessors_count; i++) {1701struct ir3_block *pred = predecessors[i];1702struct ra_block_state *pred_state = &ctx->blocks[pred->index];17031704if (!pred_state->visited)1705continue;17061707physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);1708if (pred_reg != physreg) {1709insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);17101711/* This is a bit tricky, but when visiting the destination of a1712* physical-only edge, we have two predecessors (the if and the1713* header block) and both have multiple successors. We pick the1714* register for all live-ins from the normal edge, which should1715* guarantee that there's no need for shuffling things around in1716* the normal predecessor as long as there are no phi nodes, but1717* we still may need to insert fixup code in the physical1718* predecessor (i.e. the last block of the if) and that has1719* another successor (the block after the if) so we need to update1720* the renames state for when we process the other successor. This1721* crucially depends on the other successor getting processed1722* after this.1723*1724* For normal (non-physical) edges we disallow critical edges so1725* that hacks like this aren't necessary.1726*/1727if (!pred_state->renames)1728pred_state->renames = _mesa_pointer_hash_table_create(ctx);1729_mesa_hash_table_insert(pred_state->renames, interval->interval.reg,1730(void *)(uintptr_t)physreg);1731}1732}1733}17341735static void1736insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file)1737{1738BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index];1739rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,1740physreg_node) {1741/* Skip phi nodes. 
This needs to happen after phi nodes are allocated,1742* because we may have to move live-ins around to make space for phi1743* nodes, but we shouldn't be handling phi nodes here.1744*/1745if (BITSET_TEST(live_in, interval->interval.reg->name))1746insert_live_in_move(ctx, interval);1747}1748}17491750static void1751insert_entry_regs(struct ra_block_state *state, struct ra_file *file)1752{1753rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,1754physreg_node) {1755_mesa_hash_table_insert(state->entry_regs, interval->interval.reg,1756(void *)(uintptr_t)interval->physreg_start);1757}1758}17591760static void1761insert_live_in_moves(struct ra_ctx *ctx)1762{1763insert_file_live_in_moves(ctx, &ctx->full);1764insert_file_live_in_moves(ctx, &ctx->half);1765insert_file_live_in_moves(ctx, &ctx->shared);17661767/* If not all predecessors are visited, insert live-in regs so that1768* insert_live_out_moves() will work.1769*/1770bool all_preds_visited = true;1771for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {1772if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) {1773all_preds_visited = false;1774break;1775}1776}17771778if (!all_preds_visited) {1779struct ra_block_state *state = &ctx->blocks[ctx->block->index];1780state->entry_regs = _mesa_pointer_hash_table_create(ctx);17811782insert_entry_regs(state, &ctx->full);1783insert_entry_regs(state, &ctx->half);1784insert_entry_regs(state, &ctx->shared);1785}1786}17871788static void1789insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval)1790{1791for (unsigned i = 0; i < 2; i++) {1792if (!ctx->block->successors[i])1793continue;17941795struct ir3_block *succ = ctx->block->successors[i];1796struct ra_block_state *succ_state = &ctx->blocks[succ->index];17971798if (!succ_state->visited)1799continue;18001801struct hash_entry *entry = _mesa_hash_table_search(1802succ_state->entry_regs, interval->interval.reg);1803if (!entry)1804continue;18051806physreg_t new_reg = (physreg_t)(uintptr_t)entry->data;1807if (new_reg != interval->physreg_start) {1808insert_liveout_copy(ctx->block, new_reg, interval->physreg_start,1809interval->interval.reg);1810}1811}1812}18131814static void1815insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file)1816{1817rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,1818physreg_node) {1819insert_live_out_move(ctx, interval);1820}1821}18221823static void1824insert_live_out_moves(struct ra_ctx *ctx)1825{1826insert_file_live_out_moves(ctx, &ctx->full);1827insert_file_live_out_moves(ctx, &ctx->half);1828insert_file_live_out_moves(ctx, &ctx->shared);1829}18301831static void1832handle_block(struct ra_ctx *ctx, struct ir3_block *block)1833{1834ctx->block = block;18351836/* Reset the register files from the last block */1837ra_file_init(&ctx->full);1838ra_file_init(&ctx->half);1839ra_file_init(&ctx->shared);18401841bool unreachable = false;1842if (block != ir3_start_block(ctx->ir)) {1843unreachable = true;1844for (unsigned i = 0; i < block->predecessors_count; i++) {1845struct ra_block_state *pred_state =1846&ctx->blocks[block->predecessors[i]->index];1847if (!pred_state->logical_unreachable) {1848unreachable = false;1849break;1850}1851}1852}18531854ctx->blocks[block->index].logical_unreachable = unreachable;18551856/* Handle live-ins, phis, and input meta-instructions. These all appear1857* live at the beginning of the block, and interfere with each other1858* therefore need to be allocated "in parallel". 
static void
handle_block(struct ra_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;

   /* Reset the register files from the last block */
   ra_file_init(&ctx->full);
   ra_file_init(&ctx->half);
   ra_file_init(&ctx->shared);

   bool unreachable = false;
   if (block != ir3_start_block(ctx->ir)) {
      unreachable = true;
      for (unsigned i = 0; i < block->predecessors_count; i++) {
         struct ra_block_state *pred_state =
            &ctx->blocks[block->predecessors[i]->index];
         if (!pred_state->logical_unreachable) {
            unreachable = false;
            break;
         }
      }
   }

   ctx->blocks[block->index].logical_unreachable = unreachable;

   /* Handle live-ins, phis, and input meta-instructions. These all appear
    * live at the beginning of the block, interfere with each other, and
    * therefore need to be allocated "in parallel". This means that we have
    * to allocate all of them, inserting them into the file, and then delay
    * updating the IR until all of them are allocated.
    *
    * Handle precolored inputs first, because we need to make sure that other
    * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes
    * and inputs at the same time, because the first block doesn't have
    * predecessors. Therefore handle_live_in doesn't have to worry about
    * them.
    */

   foreach_instr (instr, &block->instr_list) {
      if (instr->opc == OPC_META_INPUT)
         handle_precolored_input(ctx, instr);
      else
         break;
   }

   unsigned name;
   BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
                       ctx->live->definitions_count) {
      struct ir3_register *reg = ctx->live->definitions[name];
      if (unreachable && !(reg->flags & IR3_REG_SHARED))
         continue;
      handle_live_in(ctx, reg);
   }

   foreach_instr (instr, &block->instr_list) {
      if (instr->opc == OPC_META_PHI)
         handle_phi(ctx, instr->dsts[0]);
      else if (instr->opc == OPC_META_INPUT ||
               instr->opc == OPC_META_TEX_PREFETCH)
         handle_input(ctx, instr);
      else
         break;
   }

   /* After this point, every live-in/phi/input has an interval assigned to
    * it. We delay actually assigning values until everything has been
    * allocated, so we can simply ignore any parallel copy entries created
    * when shuffling them around.
    */
   ctx->parallel_copies_count = 0;

   insert_live_in_moves(ctx);

   if (RA_DEBUG) {
      printf("after live-in block %u:\n", block->index);
      ra_ctx_dump(ctx);
   }

   /* Now we're done with processing live-ins, and can handle the body of the
    * block.
    */
   foreach_instr (instr, &block->instr_list) {
      if (RA_DEBUG) {
         printf("processing: ");
         ir3_print_instr(instr);
      }

      if (instr->opc == OPC_META_PHI)
         assign_phi(ctx, instr);
      else if (instr->opc == OPC_META_INPUT ||
               instr->opc == OPC_META_TEX_PREFETCH)
         assign_input(ctx, instr);
      else if (instr->opc == OPC_META_SPLIT)
         handle_split(ctx, instr);
      else if (instr->opc == OPC_META_COLLECT)
         handle_collect(ctx, instr);
      else if (instr->opc == OPC_META_PARALLEL_COPY)
         handle_pcopy(ctx, instr);
      else if (instr->opc == OPC_CHMASK)
         handle_chmask(ctx, instr);
      else
         handle_normal_instr(ctx, instr);

      if (RA_DEBUG)
         ra_ctx_dump(ctx);
   }

   insert_live_out_moves(ctx);

   BITSET_FOREACH_SET (name, ctx->live->live_out[block->index],
                       ctx->live->definitions_count) {
      struct ir3_register *reg = ctx->live->definitions[name];
      handle_live_out(ctx, reg);
   }

   ctx->blocks[block->index].visited = true;
}

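/* Pick how large the full register file should be for this shader. Pressure
 * is measured in half-reg units, so it is first rounded up to whole vec4s
 * (2 * 4 half-regs each). The loop then grows the target footprint for as
 * long as the extra registers neither flip the double-threadsize decision
 * nor reduce the achievable wave count, and the result is converted back to
 * half-reg units.
 *
 * Purely illustrative example (numbers invented, assuming the threadsize
 * decision stays the same): with a pressure of 100 half-regs,
 * reg_count = DIV_ROUND_UP(100, 8) = 13 vec4s; if the register-dependent
 * wave count would only drop beyond 15 vec4s, the loop stops at target = 16
 * and the function returns (16 - 1) * 2 * 4 = 120 half-regs.
 */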
static unsigned
calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure)
{
   /* Registers are allocated in units of vec4, so switch from units of
    * half-regs to vec4.
    */
   unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4);

   bool double_threadsize = ir3_should_double_threadsize(v, reg_count);

   unsigned target = reg_count;
   unsigned reg_independent_max_waves =
      ir3_get_reg_independent_max_waves(v, double_threadsize);
   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
      v->shader->compiler, reg_count, double_threadsize);
   unsigned target_waves =
      MIN2(reg_independent_max_waves, reg_dependent_max_waves);

   while (target <= RA_FULL_SIZE / (2 * 4) &&
          ir3_should_double_threadsize(v, target) == double_threadsize &&
          ir3_get_reg_dependent_max_waves(v->shader->compiler, target,
                                          double_threadsize) >= target_waves)
      target++;

   return (target - 1) * 2 * 4;
}

int
ir3_ra(struct ir3_shader_variant *v)
{
   ir3_calc_dominance(v->ir);

   ir3_create_parallel_copies(v->ir);

   struct ir3_liveness *live = ir3_calc_liveness(v);

   ir3_debug_print(v->ir, "AFTER: create_parallel_copies");

   ir3_merge_regs(live, v->ir);

   struct ir3_pressure max_pressure;
   ir3_calc_pressure(v, live, &max_pressure);
   d("max pressure:");
   d("\tfull: %u", max_pressure.full);
   d("\thalf: %u", max_pressure.half);
   d("\tshared: %u", max_pressure.shared);

   if (v->mergedregs) {
      max_pressure.full += max_pressure.half;
      max_pressure.half = 0;
   }

   if (max_pressure.full > RA_FULL_SIZE || max_pressure.half > RA_HALF_SIZE ||
       max_pressure.shared > RA_SHARED_SIZE) {
      d("max pressure exceeded!");
      return 1;
   }

   struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);

   ctx->ir = v->ir;
   ctx->merged_regs = v->mergedregs;
   ctx->compiler = v->shader->compiler;
   ctx->stage = v->type;
   ctx->live = live;
   ctx->intervals =
      rzalloc_array(ctx, struct ra_interval, live->definitions_count);
   ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count);

   ctx->full.size = calc_target_full_pressure(v, max_pressure.full);
   d("full size: %u", ctx->full.size);

   if (!v->mergedregs)
      ctx->half.size = RA_HALF_SIZE;

   ctx->shared.size = RA_SHARED_SIZE;

   foreach_block (block, &v->ir->block_list)
      handle_block(ctx, block);

   ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count);

   /* Strip array-ness and SSA-ness at the end, because various helpers still
    * need to work even on definitions that have already been assigned. For
    * example, we need to preserve array-ness so that array live-ins have the
    * right size.
    */
   foreach_block (block, &v->ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            instr->dsts[i]->flags &= ~IR3_REG_SSA;

            /* Parallel copies of array registers copy the whole register,
             * and we need some way to let the parallel copy code know
             * that this was an array whose size is determined by
             * reg->size. So keep the array flag on those.
             */
            if (!is_meta(instr))
               instr->dsts[i]->flags &= ~IR3_REG_ARRAY;
         }

         for (unsigned i = 0; i < instr->srcs_count; i++) {
            instr->srcs[i]->flags &= ~IR3_REG_SSA;

            if (!is_meta(instr))
               instr->srcs[i]->flags &= ~IR3_REG_ARRAY;
         }
      }
   }

   ir3_debug_print(v->ir, "AFTER: register allocation");

   ir3_lower_copies(v);

   ir3_debug_print(v->ir, "AFTER: ir3_lower_copies");

   ralloc_free(ctx);
   ralloc_free(live);
   return 0;
}