Path: blob/21.2-virgl/src/panfrost/bifrost/bi_ra.c
4564 views
/*1* Copyright (C) 2020 Collabora Ltd.2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,19* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE20* SOFTWARE.21*22* Authors (Collabora):23* Alyssa Rosenzweig <[email protected]>24*/2526#include "compiler.h"27#include "bi_builder.h"28#include "util/u_memory.h"2930struct lcra_state {31unsigned node_count;32uint64_t *affinity;3334/* Linear constraints imposed. Nested array sized upfront, organized as35* linear[node_left][node_right]. That is, calculate indices as:36*37* Each element is itself a bit field denoting whether (c_j - c_i) bias38* is present or not, including negative biases.39*40* Note for Bifrost, there are 4 components so the bias is in range41* [-3, 3] so encoded by 8-bit field. */4243uint8_t *linear;4445/* Before solving, forced registers; after solving, solutions. */46unsigned *solutions;4748/** Node which caused register allocation to fail */49unsigned spill_node;50};5152/* This module is an implementation of "Linearly Constrained53* Register Allocation". The paper is available in PDF form54* (https://people.collabora.com/~alyssa/LCRA.pdf) as well as Markdown+LaTeX55* (https://gitlab.freedesktop.org/alyssa/lcra/blob/master/LCRA.md)56*/5758static struct lcra_state *59lcra_alloc_equations(unsigned node_count)60{61struct lcra_state *l = calloc(1, sizeof(*l));6263l->node_count = node_count;6465l->linear = calloc(sizeof(l->linear[0]), node_count * node_count);66l->solutions = calloc(sizeof(l->solutions[0]), node_count);67l->affinity = calloc(sizeof(l->affinity[0]), node_count);6869memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count);7071return l;72}7374static void75lcra_free(struct lcra_state *l)76{77free(l->linear);78free(l->affinity);79free(l->solutions);80free(l);81}8283static void84lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j)85{86if (i == j)87return;8889uint8_t constraint_fw = 0;90uint8_t constraint_bw = 0;9192for (unsigned D = 0; D < 4; ++D) {93if (cmask_i & (cmask_j << D)) {94constraint_bw |= (1 << (3 + D));95constraint_fw |= (1 << (3 - D));96}9798if (cmask_i & (cmask_j >> D)) {99constraint_fw |= (1 << (3 + D));100constraint_bw |= (1 << (3 - D));101}102}103104l->linear[j * l->node_count + i] |= constraint_fw;105l->linear[i * l->node_count + j] |= constraint_bw;106}107108static bool109lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i)110{111uint8_t *row = &l->linear[i * l->node_count];112signed constant = solutions[i];113114for (unsigned j = 0; j < l->node_count; ++j) {115if (solutions[j] == ~0) continue;116117signed lhs = solutions[j] - constant;118119if (lhs < -3 || lhs > 3)120continue;121122if (row[j] & (1 << (lhs + 3)))123return false;124}125126return true;127}128129static bool130lcra_solve(struct lcra_state *l)131{132for (unsigned step = 0; step < l->node_count; ++step) {133if (l->solutions[step] != ~0) continue;134if (l->affinity[step] == 0) continue;135136bool succ = false;137138u_foreach_bit64(r, l->affinity[step]) {139l->solutions[step] = r;140141if (lcra_test_linear(l, l->solutions, step)) {142succ = true;143break;144}145}146147/* Out of registers - prepare to spill */148if (!succ) {149l->spill_node = step;150return false;151}152}153154return true;155}156157/* Register spilling is implemented with a cost-benefit system. Costs are set158* by the user. Benefits are calculated from the constraints. */159160static unsigned161lcra_count_constraints(struct lcra_state *l, unsigned i)162{163unsigned count = 0;164uint8_t *constraints = &l->linear[i * l->node_count];165166for (unsigned j = 0; j < l->node_count; ++j)167count += util_bitcount(constraints[j]);168169return count;170}171172/* Construct an affinity mask such that the vector with `count` elements does173* not intersect any of the registers in the bitset `clobber`. In other words,174* an allocated register r needs to satisfy for each i < count: a + i != b.175* Equivalently that's a != b - i, so we need a \ne { b - i : i < n }. For the176* entire clobber set B, we need a \ne union b \in B { b - i : i < n }, where177* that union is the desired clobber set. That may be written equivalently as178* the union over i < n of (B - i), where subtraction is defined elementwise179* and corresponds to a shift of the entire bitset.180*/181182static uint64_t183bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)184{185uint64_t clobbered = 0;186187for (unsigned i = 0; i < count; ++i)188clobbered |= (clobber >> i);189190/* Don't allocate past the end of the register file */191if (count > 1) {192unsigned excess = count - 1;193uint64_t mask = BITFIELD_MASK(excess);194clobbered |= mask << (64 - excess);195196if (split_file)197clobbered |= mask << (16 - excess);198}199200/* Don't allocate the middle if we split out the middle */201if (split_file)202clobbered |= BITFIELD64_MASK(32) << 16;203204/* We can use a register iff it's not clobberred */205return ~clobbered;206}207208static void209bi_mark_interference(bi_block *block, struct lcra_state *l, uint16_t *live, uint64_t preload_live, unsigned node_count, bool is_blend, bool split_file)210{211bi_foreach_instr_in_block_rev(block, ins) {212/* Mark all registers live after the instruction as213* interfering with the destination */214215bi_foreach_dest(ins, d) {216unsigned node = bi_get_node(ins->dest[d]);217218if (node >= node_count)219continue;220221/* Don't allocate to anything that's read later as a222* preloaded register. The affinity is the intersection223* of affinity masks for each write. Since writes have224* offsets, but the affinity is for the whole node, we225* need to offset the affinity opposite the write226* offset, so we shift right. */227unsigned count = bi_count_write_registers(ins, d);228unsigned offset = ins->dest[d].offset;229uint64_t affinity = bi_make_affinity(preload_live, count, split_file);230l->affinity[node] &= (affinity >> offset);231232for (unsigned i = 0; i < node_count; ++i) {233if (live[i]) {234lcra_add_node_interference(l, node,235bi_writemask(ins, d), i, live[i]);236}237}238}239240if (!is_blend && ins->op == BI_OPCODE_BLEND) {241/* Blend shaders might clobber r0-r15, r48. */242uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48);243244for (unsigned i = 0; i < node_count; ++i) {245if (live[i])246l->affinity[i] &= ~clobber;247}248}249250/* Update live_in */251preload_live = bi_postra_liveness_ins(preload_live, ins);252bi_liveness_ins_update(live, ins, node_count);253}254255block->reg_live_in = preload_live;256}257258static void259bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs)260{261unsigned node_count = bi_max_temp(ctx);262263bi_compute_liveness(ctx);264bi_postra_liveness(ctx);265266bi_foreach_block_rev(ctx, _blk) {267bi_block *blk = (bi_block *) _blk;268uint16_t *live = mem_dup(_blk->live_out, node_count * sizeof(uint16_t));269270bi_mark_interference(blk, l, live, blk->reg_live_out,271node_count, ctx->inputs->is_blend, !full_regs);272273free(live);274}275}276277static struct lcra_state *278bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)279{280unsigned node_count = bi_max_temp(ctx);281struct lcra_state *l = lcra_alloc_equations(node_count);282283/* Blend shaders are restricted to R0-R15. Other shaders at full284* occupancy also can access R48-R63. At half occupancy they can access285* the whole file. */286287uint64_t default_affinity =288ctx->inputs->is_blend ? BITFIELD64_MASK(16) :289full_regs ? BITFIELD64_MASK(64) :290(BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));291292bi_foreach_instr_global(ctx, ins) {293bi_foreach_dest(ins, d) {294unsigned dest = bi_get_node(ins->dest[d]);295296/* Blend shaders expect the src colour to be in r0-r3 */297if (ins->op == BI_OPCODE_BLEND &&298!ctx->inputs->is_blend) {299unsigned node = bi_get_node(ins->src[0]);300assert(node < node_count);301l->solutions[node] = 0;302}303304if (dest < node_count)305l->affinity[dest] = default_affinity;306}307308}309310bi_compute_interference(ctx, l, full_regs);311312*success = lcra_solve(l);313314return l;315}316317static bi_index318bi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index)319{320/* Offsets can only be applied when we register allocated an index, or321* alternatively for FAU's encoding */322323ASSERTED bool is_offset = (index.offset > 0) &&324(index.type != BI_INDEX_FAU);325unsigned node_count = bi_max_temp(ctx);326327/* Did we run RA for this index at all */328if (bi_get_node(index) >= node_count) {329assert(!is_offset);330return index;331}332333/* LCRA didn't bother solving this index (how lazy!) */334signed solution = l->solutions[bi_get_node(index)];335if (solution < 0) {336assert(!is_offset);337return index;338}339340/* todo: do we want to compose with the subword swizzle? */341bi_index new_index = bi_register(solution + index.offset);342new_index.swizzle = index.swizzle;343new_index.abs = index.abs;344new_index.neg = index.neg;345return new_index;346}347348static void349bi_install_registers(bi_context *ctx, struct lcra_state *l)350{351bi_foreach_instr_global(ctx, ins) {352bi_foreach_dest(ins, d)353ins->dest[d] = bi_reg_from_index(ctx, l, ins->dest[d]);354355bi_foreach_src(ins, s)356ins->src[s] = bi_reg_from_index(ctx, l, ins->src[s]);357}358}359360static void361bi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new)362{363bi_foreach_src(ins, i) {364if (bi_is_equiv(ins->src[i], old)) {365ins->src[i].type = new.type;366ins->src[i].reg = new.reg;367ins->src[i].value = new.value;368}369}370}371372/* If register allocation fails, find the best spill node */373374static signed375bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)376{377/* Pick a node satisfying bi_spill_register's preconditions */378BITSET_WORD *no_spill = calloc(sizeof(BITSET_WORD), BITSET_WORDS(l->node_count));379380bi_foreach_instr_global(ctx, ins) {381bi_foreach_dest(ins, d) {382unsigned node = bi_get_node(ins->dest[d]);383384if (node < l->node_count && ins->no_spill)385BITSET_SET(no_spill, node);386}387}388389unsigned best_benefit = 0.0;390signed best_node = -1;391392for (unsigned i = 0; i < l->node_count; ++i) {393if (BITSET_TEST(no_spill, i)) continue;394395/* Only spill nodes that interfere with the node failing396* register allocation. It's pointless to spill anything else */397if (!l->linear[(l->spill_node * l->node_count) + i]) continue;398399unsigned benefit = lcra_count_constraints(l, i);400401if (benefit > best_benefit) {402best_benefit = benefit;403best_node = i;404}405}406407free(no_spill);408return best_node;409}410411static unsigned412bi_count_read_index(bi_instr *I, bi_index index)413{414unsigned max = 0;415416bi_foreach_src(I, s) {417if (bi_is_equiv(I->src[s], index)) {418unsigned count = bi_count_read_registers(I, s);419max = MAX2(max, count + I->src[s].offset);420}421}422423return max;424}425426/* Once we've chosen a spill node, spill it and returns bytes spilled */427428static unsigned429bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)430{431bi_builder b = { .shader = ctx };432unsigned channels = 0;433434/* Spill after every store, fill before every load */435bi_foreach_instr_global_safe(ctx, I) {436bi_foreach_dest(I, d) {437if (!bi_is_equiv(I->dest[d], index)) continue;438439unsigned extra = I->dest[d].offset;440bi_index tmp = bi_temp(ctx);441442I->dest[d] = bi_replace_index(I->dest[d], tmp);443I->no_spill = true;444445unsigned count = bi_count_write_registers(I, d);446unsigned bits = count * 32;447448b.cursor = bi_after_instr(I);449bi_index loc = bi_imm_u32(offset + 4 * extra);450bi_store(&b, bits, tmp, loc, bi_zero(), BI_SEG_TL);451452ctx->spills++;453channels = MAX2(channels, extra + count);454}455456if (bi_has_arg(I, index)) {457b.cursor = bi_before_instr(I);458bi_index tmp = bi_temp(ctx);459460unsigned bits = bi_count_read_index(I, index) * 32;461bi_rewrite_index_src_single(I, index, tmp);462463bi_instr *ld = bi_load_to(&b, bits, tmp,464bi_imm_u32(offset), bi_zero(), BI_SEG_TL);465ld->no_spill = true;466ctx->fills++;467}468}469470return (channels * 4);471}472473void474bi_register_allocate(bi_context *ctx)475{476struct lcra_state *l = NULL;477bool success = false;478479unsigned iter_count = 1000; /* max iterations */480481/* Number of bytes of memory we've spilled into */482unsigned spill_count = ctx->info->tls_size;483484/* Try with reduced register pressure to improve thread count on v7 */485if (ctx->arch == 7) {486bi_invalidate_liveness(ctx);487l = bi_allocate_registers(ctx, &success, false);488489if (success) {490ctx->info->work_reg_count = 32;491} else if (!success) {492lcra_free(l);493l = NULL;494}495}496497/* Otherwise, use the register file and spill until we succeed */498while (!success && ((iter_count--) > 0)) {499bi_invalidate_liveness(ctx);500l = bi_allocate_registers(ctx, &success, true);501502if (success) {503ctx->info->work_reg_count = 64;504} else {505signed spill_node = bi_choose_spill_node(ctx, l);506lcra_free(l);507l = NULL;508509if (spill_node == -1)510unreachable("Failed to choose spill node\n");511512spill_count += bi_spill_register(ctx,513bi_node_to_index(spill_node, bi_max_temp(ctx)),514spill_count);515}516}517518assert(success);519520ctx->info->tls_size = spill_count;521bi_install_registers(ctx, l);522523lcra_free(l);524}525526527