Path: blob/21.2-virgl/src/broadcom/compiler/vir_register_allocate.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/ralloc.h"
#include "util/register_allocate.h"
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"

#define QPU_R(i) { .magic = false, .index = i }

#define ACC_INDEX  0
#define ACC_COUNT  6
#define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
#define PHYS_COUNT 64

static inline bool
qinst_writes_tmu(const struct v3d_device_info *devinfo,
                 struct qinst *inst)
{
        return (inst->dst.file == QFILE_MAGIC &&
                v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index)) ||
                inst->qpu.sig.wrtmuc;
}

static bool
is_end_of_tmu_sequence(const struct v3d_device_info *devinfo,
                       struct qinst *inst, struct qblock *block)
{
        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
            inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
                return true;
        }

        if (!inst->qpu.sig.ldtmu)
                return false;

        list_for_each_entry_from(struct qinst, scan_inst, inst->link.next,
                                 &block->instructions, link) {
                if (scan_inst->qpu.sig.ldtmu)
                        return false;

                if (scan_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                    scan_inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
                        return true;
                }

                if (qinst_writes_tmu(devinfo, scan_inst))
                        return true;
        }

        return true;
}

static bool
vir_is_mov_uniform(struct v3d_compile *c, int temp)
{
        struct qinst *def = c->defs[temp];

        return def && def->qpu.sig.ldunif;
}

static int
v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
                      uint32_t *temp_to_node)
{
        const float tmu_scale = 5;
        float block_scale = 1.0;
        float spill_costs[c->num_temps];
        bool in_tmu_operation = false;
        bool started_last_seg = false;

        for (unsigned i = 0; i < c->num_temps; i++)
                spill_costs[i] = 0.0;

        /* XXX: Scale the cost up when inside of a loop. */
        vir_for_each_block(block, c) {
                vir_for_each_inst(inst, block) {
                        /* We can't insert new thread switches after
                         * starting output writes.
                         */
                        bool no_spilling =
                                c->threads > 1 && started_last_seg;

                        /* Discourage spilling of TMU operations */
                        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                                if (inst->src[i].file != QFILE_TEMP)
                                        continue;

                                int temp = inst->src[i].index;
                                if (vir_is_mov_uniform(c, temp)) {
                                        spill_costs[temp] += block_scale;
                                } else if (!no_spilling) {
                                        float tmu_op_scale = in_tmu_operation ?
                                                3.0 : 1.0;
                                        spill_costs[temp] += (block_scale *
                                                              tmu_scale *
                                                              tmu_op_scale);
                                } else {
                                        BITSET_CLEAR(c->spillable, temp);
                                }
                        }

                        if (inst->dst.file == QFILE_TEMP) {
                                int temp = inst->dst.index;

                                if (vir_is_mov_uniform(c, temp)) {
                                        /* We just rematerialize the uniform
                                         * later.
                                         */
                                } else if (!no_spilling) {
                                        spill_costs[temp] += (block_scale *
                                                              tmu_scale);
                                } else {
                                        BITSET_CLEAR(c->spillable, temp);
                                }
                        }

                        /* Refuse to spill a ldvary's dst, because that means
                         * that ldvary's r5 would end up being used across a
                         * thrsw.
                         */
                        if (inst->qpu.sig.ldvary) {
                                assert(inst->dst.file == QFILE_TEMP);
                                BITSET_CLEAR(c->spillable, inst->dst.index);
                        }

                        if (inst->is_last_thrsw)
                                started_last_seg = true;

                        if (v3d_qpu_writes_vpm(&inst->qpu) ||
                            v3d_qpu_uses_tlb(&inst->qpu))
                                started_last_seg = true;

                        /* Track when we're in between a TMU setup and the
                         * final LDTMU or TMUWT from that TMU setup. We
                         * penalize spills during that time.
                         */
                        if (is_end_of_tmu_sequence(c->devinfo, inst, block))
                                in_tmu_operation = false;

                        if (qinst_writes_tmu(c->devinfo, inst))
                                in_tmu_operation = true;
                }
        }

        for (unsigned i = 0; i < c->num_temps; i++) {
                if (BITSET_TEST(c->spillable, i))
                        ra_set_node_spill_cost(g, temp_to_node[i], spill_costs[i]);
        }

        return ra_get_best_spill_node(g);
}

/* The spill offset for this thread takes a bit of setup, so do it once at
 * program start.
 */
void
v3d_setup_spill_base(struct v3d_compile *c)
{
        c->cursor = vir_before_block(vir_entry_block(c));

        int start_num_temps = c->num_temps;

        /* Each thread wants to be in a separate region of the scratch space
         * so that the QPUs aren't fighting over cache lines. We have the
         * driver keep a single global spill BO rather than
         * per-spilling-program BOs, so we need a uniform from the driver for
         * what the per-thread scale is.
         */
        struct qreg thread_offset =
                vir_UMUL(c,
                         vir_TIDX(c),
                         vir_uniform(c, QUNIFORM_SPILL_SIZE_PER_THREAD, 0));

        /* Each channel in a reg is 4 bytes, so scale them up by that. */
        struct qreg element_offset = vir_SHL(c, vir_EIDX(c),
                                             vir_uniform_ui(c, 2));

        c->spill_base = vir_ADD(c,
                                vir_ADD(c, thread_offset, element_offset),
                                vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));

        /* Make sure that we don't spill the spilling setup instructions. */
        for (int i = start_num_temps; i < c->num_temps; i++)
                BITSET_CLEAR(c->spillable, i);

        c->cursor = vir_after_block(c->cur_block);
}

static void
v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
{
        vir_ADD_dest(c, vir_reg(QFILE_MAGIC,
                                V3D_QPU_WADDR_TMUA),
                     c->spill_base,
                     vir_uniform_ui(c, spill_offset));
}


static void
v3d_emit_tmu_spill(struct v3d_compile *c, struct qinst *inst,
                   struct qinst *position, uint32_t spill_offset)
{
        c->cursor = vir_after_inst(position);
        inst->dst = vir_get_temp(c);
        vir_MOV_dest(c, vir_reg(QFILE_MAGIC,
                                V3D_QPU_WADDR_TMUD),
                     inst->dst);
        v3d_emit_spill_tmua(c, spill_offset);
        vir_emit_thrsw(c);
        vir_TMUWT(c);
        c->spills++;
        c->tmu_dirty_rcl = true;
}

static void
v3d_spill_reg(struct v3d_compile *c, int spill_temp)
{
        c->spill_count++;

        bool is_uniform = vir_is_mov_uniform(c, spill_temp);

        uint32_t spill_offset = 0;

        if (!is_uniform) {
                spill_offset = c->spill_size;
                c->spill_size += V3D_CHANNELS * sizeof(uint32_t);

                if (spill_offset == 0)
                        v3d_setup_spill_base(c);
        }

        struct qinst *last_thrsw = c->last_thrsw;
        assert(!last_thrsw || last_thrsw->is_last_thrsw);

        int start_num_temps = c->num_temps;

        int uniform_index = ~0;
        if (is_uniform) {
                struct qinst *orig_unif = c->defs[spill_temp];
                uniform_index = orig_unif->uniform;
        }

        /* We must disable the ldunif optimization if we are spilling uniforms */
        bool had_disable_ldunif_opt = c->disable_ldunif_opt;
        c->disable_ldunif_opt = true;

        struct qinst *start_of_tmu_sequence = NULL;
        struct qinst *postponed_spill = NULL;
        vir_for_each_block(block, c) {
                vir_for_each_inst_safe(inst, block) {
                        /* Track when we're in between a TMU setup and the final
                         * LDTMU or TMUWT from that TMU setup. We can't spill/fill any
                         * temps during that time, because that involves inserting a
                         * new TMU setup/LDTMU sequence, so we postpone the spill or
                         * move the fill up to not intrude in the middle of the TMU
                         * sequence.
                         */
                        if (is_end_of_tmu_sequence(c->devinfo, inst, block)) {
                                if (postponed_spill) {
                                        v3d_emit_tmu_spill(c, postponed_spill,
                                                           inst, spill_offset);
                                }

                                start_of_tmu_sequence = NULL;
                                postponed_spill = NULL;
                        }

                        if (!start_of_tmu_sequence &&
                            qinst_writes_tmu(c->devinfo, inst)) {
                                start_of_tmu_sequence = inst;
                        }

                        /* fills */
                        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                                if (inst->src[i].file != QFILE_TEMP ||
                                    inst->src[i].index != spill_temp) {
                                        continue;
                                }

                                c->cursor = vir_before_inst(inst);

                                if (is_uniform) {
                                        struct qreg unif =
                                                vir_uniform(c,
                                                            c->uniform_contents[uniform_index],
                                                            c->uniform_data[uniform_index]);
                                        inst->src[i] = unif;
                                } else {
                                        /* If we have a postponed spill, we don't need
                                         * a fill as the temp would not have been
                                         * spilled yet.
                                         */
                                        if (postponed_spill)
                                                continue;
                                        if (start_of_tmu_sequence)
                                                c->cursor = vir_before_inst(start_of_tmu_sequence);

                                        v3d_emit_spill_tmua(c, spill_offset);
                                        vir_emit_thrsw(c);
                                        inst->src[i] = vir_LDTMU(c);
                                        c->fills++;
                                }
                        }

                        /* spills */
                        if (inst->dst.file == QFILE_TEMP &&
                            inst->dst.index == spill_temp) {
                                if (is_uniform) {
                                        c->cursor.link = NULL;
                                        vir_remove_instruction(c, inst);
                                } else {
                                        if (start_of_tmu_sequence)
                                                postponed_spill = inst;
                                        else
                                                v3d_emit_tmu_spill(c, inst, inst,
                                                                   spill_offset);
                                }
                        }

                        /* If we didn't have a last-thrsw inserted by nir_to_vir and
                         * we've been inserting thrsws, then insert a new last_thrsw
                         * right before we start the vpm/tlb sequence for the last
                         * thread segment.
                         */
                        if (!is_uniform && !last_thrsw && c->last_thrsw &&
                            (v3d_qpu_writes_vpm(&inst->qpu) ||
                             v3d_qpu_uses_tlb(&inst->qpu))) {
                                c->cursor = vir_before_inst(inst);
                                vir_emit_thrsw(c);

                                last_thrsw = c->last_thrsw;
                                last_thrsw->is_last_thrsw = true;
                        }
                }
        }

        /* Make sure c->last_thrsw is the actual last thrsw, not just one we
         * inserted in our most recent unspill.
         */
        if (last_thrsw)
                c->last_thrsw = last_thrsw;

        /* Don't allow spilling of our spilling instructions. There's no way
         * they can help get things colored.
         */
        for (int i = start_num_temps; i < c->num_temps; i++)
                BITSET_CLEAR(c->spillable, i);

        c->disable_ldunif_opt = had_disable_ldunif_opt;
}

struct node_to_temp_map {
        uint32_t temp;
        uint32_t priority;
};

struct v3d_ra_select_callback_data {
        uint32_t next_acc;
        uint32_t next_phys;
        struct node_to_temp_map *map;
};

/* Choosing accumulators improves chances of merging QPU instructions
 * due to these merges requiring that at most 2 rf registers are used
 * by the add and mul instructions.
 */
static bool
v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
                   BITSET_WORD *regs,
                   int priority)
{
        /* Favor accumulators if we have less than this number of physical
         * registers. Accumulators have more restrictions (like being
         * invalidated through thrsw), so running out of physical registers
         * even if we have accumulators available can lead to register
         * allocation failures.
         */
        static const int available_rf_threshold = 5;
        int available_rf = 0;
        for (int i = 0; i < PHYS_COUNT; i++) {
                if (BITSET_TEST(regs, PHYS_INDEX + i))
                        available_rf++;
                if (available_rf >= available_rf_threshold)
                        break;
        }
        if (available_rf < available_rf_threshold)
                return true;

        /* Favor accumulators for short-lived temps (our priority represents
         * liveness), to prevent long-lived temps from grabbing accumulators
         * and preventing follow-up instructions from using them, potentially
         * leading to large portions of the shader being unable to use
         * accumulators and therefore merge instructions successfully.
         */
        static const int priority_threshold = 20;
        if (priority <= priority_threshold)
                return true;

        return false;
}

static bool
v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
                    BITSET_WORD *regs,
                    unsigned int *out)
{
        /* Round-robin through our accumulators to give post-RA instruction
         * selection more options.
         */
        for (int i = 0; i < ACC_COUNT; i++) {
                int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
                int acc = ACC_INDEX + acc_off;

                if (BITSET_TEST(regs, acc)) {
                        v3d_ra->next_acc = acc_off + 1;
                        *out = acc;
                        return true;
                }
        }

        return false;
}

static bool
v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
                 BITSET_WORD *regs,
                 unsigned int *out)
{
        for (int i = 0; i < PHYS_COUNT; i++) {
                int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
                int phys = PHYS_INDEX + phys_off;

                if (BITSET_TEST(regs, phys)) {
                        v3d_ra->next_phys = phys_off + 1;
                        *out = phys;
                        return true;
                }
        }

        return false;
}

static unsigned int
v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
{
        struct v3d_ra_select_callback_data *v3d_ra = data;
        int r5 = ACC_INDEX + 5;

        /* Choose r5 for our ldunifs if possible (nobody else can load to that
         * reg, and it keeps the QPU cond field free from being occupied by
         * ldunifrf).
         */
        if (BITSET_TEST(regs, r5))
                return r5;

        unsigned int reg;
        if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->map[n].priority) &&
            v3d_ra_select_accum(v3d_ra, regs, &reg)) {
                return reg;
        }

        if (v3d_ra_select_rf(v3d_ra, regs, &reg))
                return reg;

        /* If we ran out of physical registers try to assign an accumulator
         * if we didn't favor that option earlier.
         */
        if (v3d_ra_select_accum(v3d_ra, regs, &reg))
                return reg;

        unreachable("RA must pass us at least one possible reg.");
}

bool
vir_init_reg_sets(struct v3d_compiler *compiler)
{
        /* Allocate up to 3 regfile classes, for the ways the physical
         * register file can be divided up for fragment shader threading.
         */
        int max_thread_index = (compiler->devinfo->ver >= 40 ?
                                2 : 3);

        compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
                                          false);
        if (!compiler->regs)
                return false;

        for (int threads = 0; threads < max_thread_index; threads++) {
                compiler->reg_class_any[threads] =
                        ra_alloc_contig_reg_class(compiler->regs, 1);
                compiler->reg_class_r5[threads] =
                        ra_alloc_contig_reg_class(compiler->regs, 1);
                compiler->reg_class_phys_or_acc[threads] =
                        ra_alloc_contig_reg_class(compiler->regs, 1);
                compiler->reg_class_phys[threads] =
                        ra_alloc_contig_reg_class(compiler->regs, 1);

                for (int i = PHYS_INDEX;
                     i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
                        ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
                        ra_class_add_reg(compiler->reg_class_phys[threads], i);
                        ra_class_add_reg(compiler->reg_class_any[threads], i);
                }

                for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
                        ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
                        ra_class_add_reg(compiler->reg_class_any[threads], i);
                }
                /* r5 can only store a single 32-bit value, so not much can
                 * use it.
                 */
                ra_class_add_reg(compiler->reg_class_r5[threads],
                                 ACC_INDEX + 5);
                ra_class_add_reg(compiler->reg_class_any[threads],
                                 ACC_INDEX + 5);
        }

        ra_set_finalize(compiler->regs, NULL);

        return true;
}

static int
node_to_temp_priority(const void *in_a, const void *in_b)
{
        const struct node_to_temp_map *a = in_a;
        const struct node_to_temp_map *b = in_b;

        return a->priority - b->priority;
}

/**
 * Computes the number of registers to spill in a batch after a register
 * allocation failure.
 */
static uint32_t
get_spill_batch_size(struct v3d_compile *c)
{
        /* Allow up to 10 spills in batches of 1 in any case to avoid any
         * chance of over-spilling if the program requires few spills to
         * compile.
         */
        if (c->spill_count < 10)
                return 1;

        /* If we have to spill more than that we assume performance is not
         * going to be great and we shift focus to batching spills to cut
         * down compile time at the expense of over-spilling.
         */
        return 20;
}

/* Don't emit spills using the TMU until we've dropped thread count first. We
 * may also disable spilling when certain optimizations that are known to
 * increase register pressure are active so we favor recompiling with
 * optimizations disabled instead of spilling.
 */
static inline bool
tmu_spilling_allowed(struct v3d_compile *c, int thread_index)
{
        return thread_index == 0 && c->tmu_spilling_allowed;
}

#define CLASS_BIT_PHYS (1 << 0)
#define CLASS_BIT_ACC  (1 << 1)
#define CLASS_BIT_R5   (1 << 4)
#define CLASS_BITS_ANY (CLASS_BIT_PHYS | \
                        CLASS_BIT_ACC | \
                        CLASS_BIT_R5)

/**
 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
 *
 * The return value should be freed by the caller.
 */
struct qpu_reg *
v3d_register_allocate(struct v3d_compile *c, bool *spilled)
{
        uint32_t UNUSED start_num_temps = c->num_temps;
        struct node_to_temp_map map[c->num_temps];
        uint32_t temp_to_node[c->num_temps];
        uint8_t class_bits[c->num_temps];
        int acc_nodes[ACC_COUNT];
        struct v3d_ra_select_callback_data callback_data = {
                .next_acc = 0,
                /* Start at RF3, to try to keep the TLB writes from using
                 * RF0-2.
                 */
                .next_phys = 3,
                .map = map,
        };

        *spilled = false;

        vir_calculate_live_intervals(c);

        /* Convert 1, 2, 4 threads to 0, 1, 2 index.
         *
         * V3D 4.x has double the physical register space, so 64 physical regs
         * are available at both 1x and 2x threading, and 4x has 32.
         */
        int thread_index = ffs(c->threads) - 1;
        if (c->devinfo->ver >= 40) {
                if (thread_index >= 1)
                        thread_index--;
        }

        struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
                                                         c->num_temps +
                                                         ARRAY_SIZE(acc_nodes));
        ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data);

        /* Make some fixed nodes for the accumulators, which we will need to
         * interfere with when ops have implied r3/r4 writes or for the thread
         * switches. We could represent these as classes for the nodes to
         * live in, but the classes take up a lot of memory to set up, so we
         * don't want to make too many.
         */
        for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) {
                acc_nodes[i] = c->num_temps + i;
                ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i);
        }

        for (uint32_t i = 0; i < c->num_temps; i++) {
                map[i].temp = i;
                map[i].priority = c->temp_end[i] - c->temp_start[i];
        }
        qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
        for (uint32_t i = 0; i < c->num_temps; i++) {
                temp_to_node[map[i].temp] = i;
        }

        /* Figure out our register classes and preallocated registers. We
         * start with any temp being able to be in any file, then instructions
         * incrementally remove bits that the temp definitely can't be in.
         */
        memset(class_bits, CLASS_BITS_ANY, sizeof(class_bits));

        int ip = 0;
        vir_for_each_inst_inorder(inst, c) {
                /* If the instruction writes r3/r4 (and optionally moves its
                 * result to a temp), nothing else can be stored in r3/r4 across
                 * it.
                 */
                if (vir_writes_r3(c->devinfo, inst)) {
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip &&
                                    c->temp_end[i] > ip) {
                                        ra_add_node_interference(g,
                                                                 temp_to_node[i],
                                                                 acc_nodes[3]);
                                }
                        }
                }
                if (vir_writes_r4(c->devinfo, inst)) {
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip &&
                                    c->temp_end[i] > ip) {
                                        ra_add_node_interference(g,
                                                                 temp_to_node[i],
                                                                 acc_nodes[4]);
                                }
                        }
                }

                if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
                        switch (inst->qpu.alu.add.op) {
                        case V3D_QPU_A_LDVPMV_IN:
                        case V3D_QPU_A_LDVPMV_OUT:
                        case V3D_QPU_A_LDVPMD_IN:
                        case V3D_QPU_A_LDVPMD_OUT:
                        case V3D_QPU_A_LDVPMP:
                        case V3D_QPU_A_LDVPMG_IN:
                        case V3D_QPU_A_LDVPMG_OUT:
                                /* LDVPMs only store to temps (the MA flag
                                 * decides whether the LDVPM is in or out)
                                 */
                                assert(inst->dst.file == QFILE_TEMP);
                                class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
                                break;

                        case V3D_QPU_A_RECIP:
                        case V3D_QPU_A_RSQRT:
                        case V3D_QPU_A_EXP:
                        case V3D_QPU_A_LOG:
                        case V3D_QPU_A_SIN:
                        case V3D_QPU_A_RSQRT2:
                                /* The SFU instructions write directly to the
                                 * phys regfile.
                                 */
                                assert(inst->dst.file == QFILE_TEMP);
                                class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
                                break;

                        default:
                                break;
                        }
                }

                if (inst->src[0].file == QFILE_REG) {
                        switch (inst->src[0].index) {
                        case 0:
                        case 1:
                        case 2:
                        case 3:
                                /* Payload setup instructions: Force allocate
                                 * the dst to the given register (so the MOV
                                 * will disappear).
                                 */
                                assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
                                assert(inst->dst.file == QFILE_TEMP);
                                ra_set_node_reg(g,
                                                temp_to_node[inst->dst.index],
                                                PHYS_INDEX +
                                                inst->src[0].index);
                                break;
                        }
                }

                if (inst->dst.file == QFILE_TEMP) {
                        /* Only a ldunif gets to write to R5, which only has a
                         * single 32-bit channel of storage.
                         */
                        if (!inst->qpu.sig.ldunif) {
                                class_bits[inst->dst.index] &= ~CLASS_BIT_R5;
                        } else {
                                /* Until V3D 4.x, we could only load a uniform
                                 * to r5, so we'll need to spill if uniform
                                 * loads interfere with each other.
                                 */
                                if (c->devinfo->ver < 40) {
                                        class_bits[inst->dst.index] &=
                                                CLASS_BIT_R5;
                                }
                        }
                }

                if (inst->qpu.sig.thrsw) {
                        /* All accumulators are invalidated across a thread
                         * switch.
                         */
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip && c->temp_end[i] > ip)
                                        class_bits[i] &= CLASS_BIT_PHYS;
                        }
                }

                ip++;
        }

        for (uint32_t i = 0; i < c->num_temps; i++) {
                if (class_bits[i] == CLASS_BIT_PHYS) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_phys[thread_index]);
                } else if (class_bits[i] == (CLASS_BIT_R5)) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_r5[thread_index]);
                } else if (class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_ACC)) {
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_phys_or_acc[thread_index]);
                } else {
                        assert(class_bits[i] == CLASS_BITS_ANY);
                        ra_set_node_class(g, temp_to_node[i],
                                          c->compiler->reg_class_any[thread_index]);
                }
        }

        for (uint32_t i = 0; i < c->num_temps; i++) {
                for (uint32_t j = i + 1; j < c->num_temps; j++) {
                        if (!(c->temp_start[i] >= c->temp_end[j] ||
                              c->temp_start[j] >= c->temp_end[i])) {
                                ra_add_node_interference(g,
                                                         temp_to_node[i],
                                                         temp_to_node[j]);
                        }
                }
        }

        /* Debug code to force a bit of register spilling, for running across
         * conformance tests to make sure that spilling works.
         */
        int force_register_spills = 0;
        if (c->spill_size <
            V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
                int node = v3d_choose_spill_node(c, g, temp_to_node);
                if (node != -1) {
                        v3d_spill_reg(c, map[node].temp);
                        ralloc_free(g);
                        *spilled = true;
                        return NULL;
                }
        }

        bool ok = ra_allocate(g);
        if (!ok) {
                const uint32_t spill_batch_size = get_spill_batch_size(c);

                for (uint32_t i = 0; i < spill_batch_size; i++) {
                        int node = v3d_choose_spill_node(c, g, temp_to_node);
                        if (node == -1)
                                break;

                        /* TMU spills inject thrsw signals that invalidate
                         * accumulators, so we can't batch them.
                         */
                        bool is_uniform = vir_is_mov_uniform(c, map[node].temp);
                        if (i > 0 && !is_uniform)
                                break;

                        if (is_uniform || tmu_spilling_allowed(c, thread_index)) {
                                v3d_spill_reg(c, map[node].temp);

                                /* Ask the outer loop to call back in. */
                                *spilled = true;

                                /* See comment above about batching TMU spills.
                                 */
                                if (!is_uniform) {
                                        assert(i == 0);
                                        break;
                                }
                        } else {
                                break;
                        }
                }

                ralloc_free(g);
                return NULL;
        }

        /* Ensure that we are not accessing temp_to_node out of bounds. We
         * should never trigger this assertion because `c->num_temps` only
         * grows when we spill, in which case we return early and don't get
         * here.
         */
        assert(start_num_temps == c->num_temps);
        struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                sizeof(*temp_registers));

        for (uint32_t i = 0; i < c->num_temps; i++) {
                int ra_reg = ra_get_node_reg(g, temp_to_node[i]);
                if (ra_reg < PHYS_INDEX) {
                        temp_registers[i].magic = true;
                        temp_registers[i].index = (V3D_QPU_WADDR_R0 +
                                                   ra_reg - ACC_INDEX);
                } else {
                        temp_registers[i].magic = false;
                        temp_registers[i].index = ra_reg - PHYS_INDEX;
                }
        }

        ralloc_free(g);

        return temp_registers;
}