Path: blob/21.2-virgl/src/gallium/drivers/vc4/vc4_register_allocate.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/ralloc.h"
#include "util/register_allocate.h"
#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

#define QPU_R(file, index) { QPU_MUX_##file, index }

static const struct qpu_reg vc4_regs[] = {
        { QPU_MUX_R0, 0},
        { QPU_MUX_R1, 0},
        { QPU_MUX_R2, 0},
        { QPU_MUX_R3, 0},
        { QPU_MUX_R4, 0},
        QPU_R(A, 0),
        QPU_R(B, 0),
        QPU_R(A, 1),
        QPU_R(B, 1),
        QPU_R(A, 2),
        QPU_R(B, 2),
        QPU_R(A, 3),
        QPU_R(B, 3),
        QPU_R(A, 4),
        QPU_R(B, 4),
        QPU_R(A, 5),
        QPU_R(B, 5),
        QPU_R(A, 6),
        QPU_R(B, 6),
        QPU_R(A, 7),
        QPU_R(B, 7),
        QPU_R(A, 8),
        QPU_R(B, 8),
        QPU_R(A, 9),
        QPU_R(B, 9),
        QPU_R(A, 10),
        QPU_R(B, 10),
        QPU_R(A, 11),
        QPU_R(B, 11),
        QPU_R(A, 12),
        QPU_R(B, 12),
        QPU_R(A, 13),
        QPU_R(B, 13),
        QPU_R(A, 14),
        QPU_R(B, 14),
        QPU_R(A, 15),
        QPU_R(B, 15),
        QPU_R(A, 16),
        QPU_R(B, 16),
        QPU_R(A, 17),
        QPU_R(B, 17),
        QPU_R(A, 18),
        QPU_R(B, 18),
        QPU_R(A, 19),
        QPU_R(B, 19),
        QPU_R(A, 20),
        QPU_R(B, 20),
        QPU_R(A, 21),
        QPU_R(B, 21),
        QPU_R(A, 22),
        QPU_R(B, 22),
        QPU_R(A, 23),
        QPU_R(B, 23),
        QPU_R(A, 24),
        QPU_R(B, 24),
        QPU_R(A, 25),
        QPU_R(B, 25),
        QPU_R(A, 26),
        QPU_R(B, 26),
        QPU_R(A, 27),
        QPU_R(B, 27),
        QPU_R(A, 28),
        QPU_R(B, 28),
        QPU_R(A, 29),
        QPU_R(B, 29),
        QPU_R(A, 30),
        QPU_R(B, 30),
        QPU_R(A, 31),
        QPU_R(B, 31),
};
#define ACC_INDEX     0
#define ACC_COUNT     5
#define AB_INDEX      (ACC_INDEX + ACC_COUNT)
#define AB_COUNT      64
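
/* Layout note on vc4_regs[]: the five accumulators (r0-r4) come first, then
 * the two physical regfiles interleaved, so vc4_regs[AB_INDEX + 2 * n] is
 * ra<n> and vc4_regs[AB_INDEX + 2 * n + 1] is rb<n>.  The
 * "(i - AB_INDEX) & 1" test in vc4_alloc_reg_set() below relies on this
 * interleaving to pick out the A-file entries.
 */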

static void
vc4_alloc_reg_set(struct vc4_context *vc4)
{
        assert(vc4_regs[AB_INDEX].addr == 0);
        assert(vc4_regs[AB_INDEX + 1].addr == 0);
        STATIC_ASSERT(ARRAY_SIZE(vc4_regs) == AB_INDEX + 64);

        if (vc4->regs)
                return;

        vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs), false);

        /* The physical regfiles split us into two classes, with [0] being the
         * whole space and [1] being the bottom half (for threaded fragment
         * shaders).
         */
        for (int i = 0; i < 2; i++) {
                vc4->reg_class_any[i] = ra_alloc_contig_reg_class(vc4->regs, 1);
                vc4->reg_class_a_or_b[i] = ra_alloc_contig_reg_class(vc4->regs, 1);
                vc4->reg_class_a_or_b_or_acc[i] = ra_alloc_contig_reg_class(vc4->regs, 1);
                vc4->reg_class_r4_or_a[i] = ra_alloc_contig_reg_class(vc4->regs, 1);
                vc4->reg_class_a[i] = ra_alloc_contig_reg_class(vc4->regs, 1);
        }
        vc4->reg_class_r0_r3 = ra_alloc_contig_reg_class(vc4->regs, 1);

        /* r0-r3 */
        for (uint32_t i = ACC_INDEX; i < ACC_INDEX + 4; i++) {
                ra_class_add_reg(vc4->reg_class_r0_r3, i);
                ra_class_add_reg(vc4->reg_class_a_or_b_or_acc[0], i);
                ra_class_add_reg(vc4->reg_class_a_or_b_or_acc[1], i);
        }

        /* R4 gets a special class because it can't be written as a general
         * purpose register. (it's TMU_NOSWAP as a write address).
         */
        for (int i = 0; i < 2; i++) {
                ra_class_add_reg(vc4->reg_class_r4_or_a[i], ACC_INDEX + 4);
                ra_class_add_reg(vc4->reg_class_any[i], ACC_INDEX + 4);
        }

        /* A/B */
        for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i++) {
                /* Reserve ra14/rb14 for spilling fixup_raddr_conflict() in
                 * vc4_qpu_emit.c
                 */
                if (vc4_regs[i].addr == 14)
                        continue;

                ra_class_add_reg(vc4->reg_class_any[0], i);
                ra_class_add_reg(vc4->reg_class_a_or_b[0], i);
                ra_class_add_reg(vc4->reg_class_a_or_b_or_acc[0], i);

                if (vc4_regs[i].addr < 16) {
                        ra_class_add_reg(vc4->reg_class_any[1], i);
                        ra_class_add_reg(vc4->reg_class_a_or_b[1], i);
                        ra_class_add_reg(vc4->reg_class_a_or_b_or_acc[1], i);
                }

                /* A only */
                if (((i - AB_INDEX) & 1) == 0) {
                        ra_class_add_reg(vc4->reg_class_a[0], i);
                        ra_class_add_reg(vc4->reg_class_r4_or_a[0], i);

                        if (vc4_regs[i].addr < 16) {
                                ra_class_add_reg(vc4->reg_class_a[1], i);
                                ra_class_add_reg(vc4->reg_class_r4_or_a[1], i);
                        }
                }
        }

        ra_set_finalize(vc4->regs, NULL);
}

struct node_to_temp_map {
        uint32_t temp;
        uint32_t priority;
};

static int
node_to_temp_priority(const void *in_a, const void *in_b)
{
        const struct node_to_temp_map *a = in_a;
        const struct node_to_temp_map *b = in_b;

        return a->priority - b->priority;
}
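
/* Every temp starts out with all of the class bits below set, and the
 * instruction walk in vc4_register_allocate() incrementally clears the bits
 * for files the temp can't live in; the surviving combination is then mapped
 * onto one of the classes precomputed in vc4_alloc_reg_set().  Note that
 * (1 << 3) is unused.
 */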
#define CLASS_BIT_A                     (1 << 0)
#define CLASS_BIT_B                     (1 << 1)
#define CLASS_BIT_R4                    (1 << 2)
#define CLASS_BIT_R0_R3                 (1 << 4)

struct vc4_ra_select_callback_data {
        uint32_t next_acc;
        uint32_t next_ab;
};

static unsigned int
vc4_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
{
        struct vc4_ra_select_callback_data *vc4_ra = data;

        /* If r4 is available, always choose it -- few other things can go
         * there, and choosing anything else means inserting a mov.
         */
        if (BITSET_TEST(regs, ACC_INDEX + 4))
                return ACC_INDEX + 4;

        /* Choose an accumulator if possible (no delay between write and
         * read), but round-robin through them to give post-RA instruction
         * selection more options.
         */
        for (int i = 0; i < ACC_COUNT; i++) {
                int acc_off = (vc4_ra->next_acc + i) % ACC_COUNT;
                int acc = ACC_INDEX + acc_off;

                if (BITSET_TEST(regs, acc)) {
                        vc4_ra->next_acc = acc_off + 1;
                        return acc;
                }
        }

        for (int i = 0; i < AB_COUNT; i++) {
                int ab_off = (vc4_ra->next_ab + i) % AB_COUNT;
                int ab = AB_INDEX + ab_off;

                if (BITSET_TEST(regs, ab)) {
                        vc4_ra->next_ab = ab_off + 1;
                        return ab;
                }
        }

        unreachable("RA must pass us at least one possible reg.");
}

/**
 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
 *
 * The return value should be freed by the caller.
 */
struct qpu_reg *
vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct node_to_temp_map map[c->num_temps];
        uint32_t temp_to_node[c->num_temps];
        uint8_t class_bits[c->num_temps];
        struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                sizeof(*temp_registers));
        struct vc4_ra_select_callback_data callback_data = {
                .next_acc = 0,
                .next_ab = 0,
        };

        /* If things aren't ever written (undefined values), just read from
         * r0.
         */
        for (uint32_t i = 0; i < c->num_temps; i++)
                temp_registers[i] = qpu_rn(0);

        vc4_alloc_reg_set(vc4);

        struct ra_graph *g = ra_alloc_interference_graph(vc4->regs,
                                                         c->num_temps);

        /* Compute the live ranges so we can figure out interference. */
        qir_calculate_live_intervals(c);

        ra_set_select_reg_callback(g, vc4_ra_select_callback, &callback_data);

        for (uint32_t i = 0; i < c->num_temps; i++) {
                map[i].temp = i;
                map[i].priority = c->temp_end[i] - c->temp_start[i];
        }
        qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
        for (uint32_t i = 0; i < c->num_temps; i++) {
                temp_to_node[map[i].temp] = i;
        }

        /* Figure out our register classes and preallocated registers. We
         * start with any temp being able to be in any file, then instructions
         * incrementally remove bits that the temp definitely can't be in.
         */
        memset(class_bits,
               CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3,
               sizeof(class_bits));

        int ip = 0;
        qir_for_each_inst_inorder(inst, c) {
                if (qir_writes_r4(inst)) {
                        /* This instruction writes r4 (and optionally moves
                         * its result to a temp), so nothing else can be
                         * stored in r4 across it.
                         */
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip && c->temp_end[i] > ip)
                                        class_bits[i] &= ~CLASS_BIT_R4;
                        }

                        /* If we're doing a conditional write of something
                         * writing R4 (math, tex results), then make sure that
                         * we store in a temp so that we actually
                         * conditionally move the result.
                         */
                        if (inst->cond != QPU_COND_ALWAYS)
                                class_bits[inst->dst.index] &= ~CLASS_BIT_R4;
                } else {
                        /* R4 can't be written as a general purpose
                         * register. (it's TMU_NOSWAP as a write address).
                         */
                        if (inst->dst.file == QFILE_TEMP)
                                class_bits[inst->dst.index] &= ~CLASS_BIT_R4;
                }

                switch (inst->op) {
                case QOP_FRAG_Z:
                        ra_set_node_reg(g, temp_to_node[inst->dst.index],
                                        AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2 + 1);
                        break;

                case QOP_FRAG_W:
                        ra_set_node_reg(g, temp_to_node[inst->dst.index],
                                        AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2);
                        break;

                case QOP_ROT_MUL:
                        assert(inst->src[0].file == QFILE_TEMP);
                        class_bits[inst->src[0].index] &= CLASS_BIT_R0_R3;
                        break;

                case QOP_THRSW:
                        /* All accumulators are invalidated across a thread
                         * switch.
                         */
                        for (int i = 0; i < c->num_temps; i++) {
                                if (c->temp_start[i] < ip && c->temp_end[i] > ip)
                                        class_bits[i] &= ~(CLASS_BIT_R0_R3 |
                                                           CLASS_BIT_R4);
                        }
                        break;

                default:
                        break;
                }

                if (inst->dst.pack && !qir_is_mul(inst)) {
                        /* The non-MUL pack flags require an A-file dst
                         * register.
                         */
                        class_bits[inst->dst.index] &= CLASS_BIT_A;
                }

                /* Apply restrictions for src unpacks.  The integer unpacks
                 * can only be done from regfile A, while float unpacks can be
                 * either A or R4.
                 */
                for (int i = 0; i < qir_get_nsrc(inst); i++) {
                        if (inst->src[i].file == QFILE_TEMP &&
                            inst->src[i].pack) {
                                if (qir_is_float_input(inst)) {
                                        class_bits[inst->src[i].index] &=
                                                CLASS_BIT_A | CLASS_BIT_R4;
                                } else {
                                        class_bits[inst->src[i].index] &=
                                                CLASS_BIT_A;
                                }
                        }
                }

                ip++;
        }
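
        /* Map each temp's surviving class bits onto one of the register
         * classes built in vc4_alloc_reg_set().  Combinations not matched
         * below are only expected in threaded fragment shaders, which fail
         * compilation instead of aborting.
         */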
        for (uint32_t i = 0; i < c->num_temps; i++) {
                int node = temp_to_node[i];

                switch (class_bits[i]) {
                case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3:
                        ra_set_node_class(g, node,
                                          vc4->reg_class_any[c->fs_threaded]);
                        break;
                case CLASS_BIT_A | CLASS_BIT_B:
                        ra_set_node_class(g, node,
                                          vc4->reg_class_a_or_b[c->fs_threaded]);
                        break;
                case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R0_R3:
                        ra_set_node_class(g, node,
                                          vc4->reg_class_a_or_b_or_acc[c->fs_threaded]);
                        break;
                case CLASS_BIT_A | CLASS_BIT_R4:
                        ra_set_node_class(g, node,
                                          vc4->reg_class_r4_or_a[c->fs_threaded]);
                        break;
                case CLASS_BIT_A:
                        ra_set_node_class(g, node,
                                          vc4->reg_class_a[c->fs_threaded]);
                        break;
                case CLASS_BIT_R0_R3:
                        ra_set_node_class(g, node, vc4->reg_class_r0_r3);
                        break;

                default:
                        /* DDX/DDY used across thread switches might get us
                         * here.
                         */
                        if (c->fs_threaded) {
                                c->failed = true;
                                free(temp_registers);
                                return NULL;
                        }

                        fprintf(stderr, "temp %d: bad class bits: 0x%x\n",
                                i, class_bits[i]);
                        abort();
                        break;
                }
        }

        /* Two temps interfere whenever their live ranges overlap. */
        for (uint32_t i = 0; i < c->num_temps; i++) {
                for (uint32_t j = i + 1; j < c->num_temps; j++) {
                        if (!(c->temp_start[i] >= c->temp_end[j] ||
                              c->temp_start[j] >= c->temp_end[i])) {
                                ra_add_node_interference(g,
                                                         temp_to_node[i],
                                                         temp_to_node[j]);
                        }
                }
        }

        bool ok = ra_allocate(g);
        if (!ok) {
                if (!c->fs_threaded) {
                        fprintf(stderr, "Failed to register allocate:\n");
                        qir_dump(c);
                }

                c->failed = true;
                free(temp_registers);
                return NULL;
        }

        for (uint32_t i = 0; i < c->num_temps; i++) {
                temp_registers[i] = vc4_regs[ra_get_node_reg(g, temp_to_node[i])];

                /* If the value's never used, just write to the NOP register
                 * for clarity in debug output.
                 */
                if (c->temp_start[i] == c->temp_end[i])
                        temp_registers[i] = qpu_ra(QPU_W_NOP);
        }

        ralloc_free(g);

        return temp_registers;
}
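
/* Usage sketch (illustrative, not part of the original file): a caller such
 * as the QPU emit pass is expected to consume the returned table roughly as
 * follows, freeing it when done:
 *
 *     struct qpu_reg *temps = vc4_register_allocate(vc4, c);
 *     if (!temps)
 *             return;   (c->failed has been set)
 *     ... map each QFILE_TEMP index i to temps[i] while emitting ...
 *     free(temps);
 */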