Path: blob/21.2-virgl/src/gallium/drivers/lima/ir/pp/regalloc.c
4574 views
/*1* Copyright (c) 2017 Lima Project2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sub license,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the11* next paragraph) shall be included in all copies or substantial portions12* of the Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER20* DEALINGS IN THE SOFTWARE.21*22*/2324#include "util/ralloc.h"25#include "util/register_allocate.h"26#include "util/u_debug.h"2728#include "ppir.h"29#include "lima_context.h"3031#define PPIR_REG_COUNT (6 * 4)3233enum ppir_ra_reg_class {34ppir_ra_reg_class_vec1,35ppir_ra_reg_class_vec2,36ppir_ra_reg_class_vec3,37ppir_ra_reg_class_vec4,3839/* 4 reg class for load/store instr regs:40* load/store instr has no swizzle field, so the (virtual) register41* must be allocated at the beginning of a (physical) register,42*/43ppir_ra_reg_class_head_vec1,44ppir_ra_reg_class_head_vec2,45ppir_ra_reg_class_head_vec3,46ppir_ra_reg_class_head_vec4,4748ppir_ra_reg_class_num,49};5051struct ra_regs *ppir_regalloc_init(void *mem_ctx)52{53struct ra_regs *ret = ra_alloc_reg_set(mem_ctx, PPIR_REG_COUNT, false);54if (!ret)55return NULL;5657/* Classes for contiguous 1-4 channel groups anywhere within a register. */58struct ra_class *classes[ppir_ra_reg_class_num];59for (int i = 0; i < ppir_ra_reg_class_head_vec1; i++) {60classes[i] = ra_alloc_contig_reg_class(ret, i + 1);6162for (int j = 0; j < PPIR_REG_COUNT; j += 4) {63for (int swiz = 0; swiz < (4 - i); swiz++)64ra_class_add_reg(classes[i], j + swiz);65}66}6768/* Classes for contiguous 1-4 channels with a start channel of .x */69for (int i = ppir_ra_reg_class_head_vec1; i < ppir_ra_reg_class_num; i++) {70classes[i] = ra_alloc_contig_reg_class(ret, i - ppir_ra_reg_class_head_vec1 + 1);7172for (int j = 0; j < PPIR_REG_COUNT; j += 4)73ra_class_add_reg(classes[i], j);74}7576ra_set_finalize(ret, NULL);77return ret;78}7980static void ppir_regalloc_update_reglist_ssa(ppir_compiler *comp)81{82list_for_each_entry(ppir_block, block, &comp->block_list, list) {83list_for_each_entry(ppir_node, node, &block->node_list, list) {84if (node->is_end)85continue;8687if (!node->instr || node->op == ppir_op_const)88continue;8990ppir_dest *dest = ppir_node_get_dest(node);91if (dest) {92ppir_reg *reg = NULL;9394if (dest->type == ppir_target_ssa) {95reg = &dest->ssa;96list_addtail(®->list, &comp->reg_list);97comp->reg_num++;98}99}100}101}102}103104static void ppir_regalloc_print_result(ppir_compiler *comp)105{106printf("======ppir regalloc result======\n");107list_for_each_entry(ppir_block, block, &comp->block_list, list) {108list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {109printf("%03d:", instr->index);110for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) {111ppir_node *node = instr->slots[i];112if (!node)113continue;114115printf(" (%d|", node->index);116117ppir_dest *dest = ppir_node_get_dest(node);118if (dest)119printf("%d", ppir_target_get_dest_reg_index(dest));120121printf("|");122123for (int i = 0; i < ppir_node_get_src_num(node); i++) {124if (i)125printf(" ");126printf("%d", ppir_target_get_src_reg_index(ppir_node_get_src(node, i)));127}128129printf(")");130}131printf("\n");132}133}134printf("--------------------------\n");135}136137static bool create_new_instr_after(ppir_block *block, ppir_instr *ref,138ppir_node *node)139{140ppir_instr *newinstr = ppir_instr_create(block);141if (unlikely(!newinstr))142return false;143144list_del(&newinstr->list);145list_add(&newinstr->list, &ref->list);146147if (!ppir_instr_insert_node(newinstr, node))148return false;149150list_for_each_entry_from(ppir_instr, instr, ref, &block->instr_list, list) {151instr->seq++;152}153newinstr->seq = ref->seq+1;154newinstr->scheduled = true;155return true;156}157158static bool create_new_instr_before(ppir_block *block, ppir_instr *ref,159ppir_node *node)160{161ppir_instr *newinstr = ppir_instr_create(block);162if (unlikely(!newinstr))163return false;164165list_del(&newinstr->list);166list_addtail(&newinstr->list, &ref->list);167168if (!ppir_instr_insert_node(newinstr, node))169return false;170171list_for_each_entry_from(ppir_instr, instr, ref, &block->instr_list, list) {172instr->seq++;173}174newinstr->seq = ref->seq-1;175newinstr->scheduled = true;176return true;177}178179static bool ppir_update_spilled_src(ppir_compiler *comp, ppir_block *block,180ppir_node *node, ppir_src *src,181ppir_node **fill_node)182{183/* nodes might have multiple references to the same value.184* avoid creating unnecessary loads for the same fill by185* saving the node resulting from the temporary load */186if (*fill_node)187goto update_src;188189int num_components = src->reg->num_components;190191/* alloc new node to load value */192ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0);193if (!load_node)194return false;195list_addtail(&load_node->list, &node->list);196comp->num_fills++;197198ppir_load_node *load = ppir_node_to_load(load_node);199200load->index = -comp->prog->state.stack_size; /* index sizes are negative */201load->num_components = num_components;202203ppir_dest *ld_dest = &load->dest;204ld_dest->type = ppir_target_pipeline;205ld_dest->pipeline = ppir_pipeline_reg_uniform;206ld_dest->write_mask = u_bit_consecutive(0, num_components);207208/* If the uniform slot is empty, we can insert the load_temp209* there and use it directly. Exceptionally, if the node is in the210* varying or texld slot, this doesn't work. */211if (!node->instr->slots[PPIR_INSTR_SLOT_UNIFORM] &&212node->instr_pos != PPIR_INSTR_SLOT_VARYING &&213node->instr_pos != PPIR_INSTR_SLOT_TEXLD) {214ppir_node_target_assign(src, load_node);215*fill_node = load_node;216return ppir_instr_insert_node(node->instr, load_node);217}218219/* Uniform slot was taken, so fall back to a new instruction with a mov */220if (!create_new_instr_before(block, node->instr, load_node))221return false;222223/* Create move node */224ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0);225if (unlikely(!move_node))226return false;227list_addtail(&move_node->list, &node->list);228229ppir_alu_node *move_alu = ppir_node_to_alu(move_node);230231move_alu->num_src = 1;232move_alu->src->type = ppir_target_pipeline;233move_alu->src->pipeline = ppir_pipeline_reg_uniform;234for (int i = 0; i < 4; i++)235move_alu->src->swizzle[i] = i;236237ppir_dest *alu_dest = &move_alu->dest;238alu_dest->type = ppir_target_ssa;239alu_dest->ssa.num_components = num_components;240alu_dest->ssa.spilled = true;241alu_dest->write_mask = u_bit_consecutive(0, num_components);242243list_addtail(&alu_dest->ssa.list, &comp->reg_list);244comp->reg_num++;245246if (!ppir_instr_insert_node(load_node->instr, move_node))247return false;248249/* insert the new node as predecessor */250ppir_node_foreach_pred_safe(node, dep) {251ppir_node *pred = dep->pred;252ppir_node_remove_dep(dep);253ppir_node_add_dep(load_node, pred, ppir_dep_src);254}255ppir_node_add_dep(node, move_node, ppir_dep_src);256ppir_node_add_dep(move_node, load_node, ppir_dep_src);257258*fill_node = move_node;259260update_src:261/* switch node src to use the fill node dest */262ppir_node_target_assign(src, *fill_node);263264return true;265}266267static bool ppir_update_spilled_dest_load(ppir_compiler *comp, ppir_block *block,268ppir_node *node)269{270ppir_dest *dest = ppir_node_get_dest(node);271assert(dest != NULL);272assert(dest->type == ppir_target_register);273ppir_reg *reg = dest->reg;274int num_components = reg->num_components;275276/* alloc new node to load value */277ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0);278if (!load_node)279return NULL;280list_addtail(&load_node->list, &node->list);281comp->num_fills++;282283ppir_load_node *load = ppir_node_to_load(load_node);284285load->index = -comp->prog->state.stack_size; /* index sizes are negative */286load->num_components = num_components;287288load->dest.type = ppir_target_pipeline;289load->dest.pipeline = ppir_pipeline_reg_uniform;290load->dest.write_mask = u_bit_consecutive(0, num_components);291292/* New instruction is needed since we're updating a dest register293* and we can't write to the uniform pipeline reg */294if (!create_new_instr_before(block, node->instr, load_node))295return false;296297/* Create move node */298ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0);299if (unlikely(!move_node))300return false;301list_addtail(&move_node->list, &node->list);302303ppir_alu_node *move_alu = ppir_node_to_alu(move_node);304305move_alu->num_src = 1;306move_alu->src->type = ppir_target_pipeline;307move_alu->src->pipeline = ppir_pipeline_reg_uniform;308for (int i = 0; i < 4; i++)309move_alu->src->swizzle[i] = i;310311move_alu->dest.type = ppir_target_register;312move_alu->dest.reg = reg;313move_alu->dest.write_mask = u_bit_consecutive(0, num_components);314315if (!ppir_instr_insert_node(load_node->instr, move_node))316return false;317318ppir_node_foreach_pred_safe(node, dep) {319ppir_node *pred = dep->pred;320ppir_node_remove_dep(dep);321ppir_node_add_dep(load_node, pred, ppir_dep_src);322}323ppir_node_add_dep(node, move_node, ppir_dep_src);324ppir_node_add_dep(move_node, load_node, ppir_dep_src);325326return true;327}328329static bool ppir_update_spilled_dest(ppir_compiler *comp, ppir_block *block,330ppir_node *node)331{332ppir_dest *dest = ppir_node_get_dest(node);333assert(dest != NULL);334ppir_reg *reg = ppir_dest_get_reg(dest);335336/* alloc new node to store value */337ppir_node *store_node = ppir_node_create(block, ppir_op_store_temp, -1, 0);338if (!store_node)339return false;340list_addtail(&store_node->list, &node->list);341comp->num_spills++;342343ppir_store_node *store = ppir_node_to_store(store_node);344345store->index = -comp->prog->state.stack_size; /* index sizes are negative */346347ppir_node_target_assign(&store->src, node);348store->num_components = reg->num_components;349350/* insert the new node as successor */351ppir_node_foreach_succ_safe(node, dep) {352ppir_node *succ = dep->succ;353ppir_node_remove_dep(dep);354ppir_node_add_dep(succ, store_node, ppir_dep_src);355}356ppir_node_add_dep(store_node, node, ppir_dep_src);357358/* If the store temp slot is empty, we can insert the store_temp359* there and use it directly. Exceptionally, if the node is in the360* combine slot, this doesn't work. */361if (!node->instr->slots[PPIR_INSTR_SLOT_STORE_TEMP] &&362node->instr_pos != PPIR_INSTR_SLOT_ALU_COMBINE)363return ppir_instr_insert_node(node->instr, store_node);364365/* Not possible to merge store, so fall back to a new instruction */366return create_new_instr_after(block, node->instr, store_node);367}368369static bool ppir_regalloc_spill_reg(ppir_compiler *comp, ppir_reg *chosen)370{371list_for_each_entry(ppir_block, block, &comp->block_list, list) {372list_for_each_entry(ppir_node, node, &block->node_list, list) {373374ppir_dest *dest = ppir_node_get_dest(node);375if (dest && ppir_dest_get_reg(dest) == chosen) {376/* If dest is a register, it might be updating only some its377* components, so need to load the existing value first */378if (dest->type == ppir_target_register) {379if (!ppir_update_spilled_dest_load(comp, block, node))380return false;381}382if (!ppir_update_spilled_dest(comp, block, node))383return false;384}385386ppir_node *fill_node = NULL;387/* nodes might have multiple references to the same value.388* avoid creating unnecessary loads for the same fill by389* saving the node resulting from the temporary load */390for (int i = 0; i < ppir_node_get_src_num(node); i++) {391ppir_src *src = ppir_node_get_src(node, i);392ppir_reg *reg = ppir_src_get_reg(src);393if (reg == chosen) {394if (!ppir_update_spilled_src(comp, block, node, src, &fill_node))395return false;396}397}398}399}400401return true;402}403404static ppir_reg *ppir_regalloc_choose_spill_node(ppir_compiler *comp,405struct ra_graph *g)406{407float spill_costs[comp->reg_num];408/* experimentally determined, it seems to be worth scaling cost of409* regs in instructions that have used uniform/store_temp slots,410* but not too much as to offset the num_components base cost. */411const float slot_scale = 1.1f;412413list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {414if (reg->spilled) {415/* not considered for spilling */416spill_costs[reg->regalloc_index] = 0.0f;417continue;418}419420/* It is beneficial to spill registers with higher component number,421* so increase the cost of spilling registers with few components */422float spill_cost = 4.0f / (float)reg->num_components;423spill_costs[reg->regalloc_index] = spill_cost;424}425426list_for_each_entry(ppir_block, block, &comp->block_list, list) {427list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {428if (instr->slots[PPIR_INSTR_SLOT_UNIFORM]) {429for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) {430ppir_node *node = instr->slots[i];431if (!node)432continue;433for (int j = 0; j < ppir_node_get_src_num(node); j++) {434ppir_src *src = ppir_node_get_src(node, j);435if (!src)436continue;437ppir_reg *reg = ppir_src_get_reg(src);438if (!reg)439continue;440441spill_costs[reg->regalloc_index] *= slot_scale;442}443}444}445if (instr->slots[PPIR_INSTR_SLOT_STORE_TEMP]) {446for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) {447ppir_node *node = instr->slots[i];448if (!node)449continue;450ppir_dest *dest = ppir_node_get_dest(node);451if (!dest)452continue;453ppir_reg *reg = ppir_dest_get_reg(dest);454if (!reg)455continue;456457spill_costs[reg->regalloc_index] *= slot_scale;458}459}460}461}462463for (int i = 0; i < comp->reg_num; i++)464ra_set_node_spill_cost(g, i, spill_costs[i]);465466int r = ra_get_best_spill_node(g);467if (r == -1)468return NULL;469470ppir_reg *chosen = NULL;471int i = 0;472list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {473if (i++ == r) {474chosen = reg;475break;476}477}478assert(chosen);479chosen->spilled = true;480chosen->is_head = true; /* store_temp unable to do swizzle */481482return chosen;483}484485static void ppir_regalloc_reset_liveness_info(ppir_compiler *comp)486{487int idx = 0;488489list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {490reg->regalloc_index = idx++;491}492493list_for_each_entry(ppir_block, block, &comp->block_list, list) {494list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {495496if (instr->live_mask)497ralloc_free(instr->live_mask);498instr->live_mask = rzalloc_array(comp, uint8_t,499reg_mask_size(comp->reg_num));500501if (instr->live_set)502ralloc_free(instr->live_set);503instr->live_set = rzalloc_array(comp, BITSET_WORD, comp->reg_num);504505if (instr->live_internal)506ralloc_free(instr->live_internal);507instr->live_internal = rzalloc_array(comp, BITSET_WORD, comp->reg_num);508}509}510}511512static void ppir_all_interference(ppir_compiler *comp, struct ra_graph *g,513BITSET_WORD *liveness)514{515int i, j;516BITSET_FOREACH_SET(i, liveness, comp->reg_num) {517BITSET_FOREACH_SET(j, liveness, comp->reg_num) {518ra_add_node_interference(g, i, j);519}520BITSET_CLEAR(liveness, i);521}522}523524int lima_ppir_force_spilling = 0;525526static bool ppir_regalloc_prog_try(ppir_compiler *comp, bool *spilled)527{528ppir_regalloc_reset_liveness_info(comp);529530struct ra_graph *g = ra_alloc_interference_graph(531comp->ra, comp->reg_num);532533int n = 0;534list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {535int c = ppir_ra_reg_class_vec1 + (reg->num_components - 1);536if (reg->is_head)537c += 4;538ra_set_node_class(g, n++, ra_get_class_from_index(comp->ra, c));539}540541ppir_liveness_analysis(comp);542543list_for_each_entry(ppir_block, block, &comp->block_list, list) {544list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {545int i;546BITSET_FOREACH_SET(i, instr->live_internal, comp->reg_num) {547BITSET_SET(instr->live_set, i);548}549ppir_all_interference(comp, g, instr->live_set);550}551}552553*spilled = false;554bool ok = ra_allocate(g);555if (!ok || (comp->force_spilling-- > 0)) {556ppir_reg *chosen = ppir_regalloc_choose_spill_node(comp, g);557if (chosen) {558/* stack_size will be used to assemble the frame reg in lima_draw.559* It is also be used in the spilling code, as negative indices560* starting from -1, to create stack addresses. */561comp->prog->state.stack_size++;562if (!ppir_regalloc_spill_reg(comp, chosen))563goto err_out;564/* Ask the outer loop to call back in. */565*spilled = true;566567ppir_debug("spilled register %d/%d, num_components: %d\n",568chosen->regalloc_index, comp->reg_num,569chosen->num_components);570goto err_out;571}572573ppir_error("regalloc fail\n");574goto err_out;575}576577n = 0;578list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {579reg->index = ra_get_node_reg(g, n++);580}581582ralloc_free(g);583584if (lima_debug & LIMA_DEBUG_PP)585ppir_regalloc_print_result(comp);586587return true;588589err_out:590ralloc_free(g);591return false;592}593594bool ppir_regalloc_prog(ppir_compiler *comp)595{596bool spilled = false;597comp->prog->state.stack_size = 0;598599/* Set from an environment variable to force spilling600* for debugging purposes, see lima_screen.c */601comp->force_spilling = lima_ppir_force_spilling;602603ppir_regalloc_update_reglist_ssa(comp);604605/* No registers? Probably shader consists of discard instruction */606if (list_is_empty(&comp->reg_list))607return true;608609/* this will most likely succeed in the first610* try, except for very complicated shaders */611while (!ppir_regalloc_prog_try(comp, &spilled))612if (!spilled)613return false;614615return true;616}617618619