Path: blob/21.2-virgl/src/panfrost/midgard/midgard_ra.c
/*
 * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]>
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"
#include "midgard_ops.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "midgard_quirks.h"

struct phys_reg {
        /* Physical register: 0-31 */
        unsigned reg;

        /* Byte offset into the physical register: 0-15 */
        unsigned offset;

        /* log2(bytes per component) for fast mul/div */
        unsigned shift;
};

/* Shift up by reg_offset and horizontally by dst_offset. */

static void
offset_swizzle(unsigned *swizzle, unsigned reg_offset, unsigned srcshift, unsigned dstshift, unsigned dst_offset)
{
        unsigned out[MIR_VEC_COMPONENTS];

        signed reg_comp = reg_offset >> srcshift;
        signed dst_comp = dst_offset >> dstshift;

        unsigned max_component = (16 >> srcshift) - 1;

        assert(reg_comp << srcshift == reg_offset);
        assert(dst_comp << dstshift == dst_offset);

        for (signed c = 0; c < MIR_VEC_COMPONENTS; ++c) {
                signed comp = MAX2(c - dst_comp, 0);
                out[c] = MIN2(swizzle[comp] + reg_comp, max_component);
        }

        memcpy(swizzle, out, sizeof(out));
}
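
/* For example (purely illustrative): a 32-bit source allocated 8 bytes into
 * its register (reg_offset = 8, srcshift = 2) with dst_offset = 0 turns an
 * identity .xyzw swizzle into .zwww -- each component is shifted up by two
 * 32-bit lanes and clamped to the last component of the register. */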

/* Helper to return the default phys_reg for a given register */

static struct phys_reg
default_phys_reg(int reg, unsigned shift)
{
        struct phys_reg r = {
                .reg = reg,
                .offset = 0,
                .shift = shift
        };

        return r;
}

/* Determine which physical register, swizzle, and mask a virtual
 * register corresponds to */

static struct phys_reg
index_to_reg(compiler_context *ctx, struct lcra_state *l, unsigned reg, unsigned shift)
{
        /* Check for special cases */
        if (reg == ~0)
                return default_phys_reg(REGISTER_UNUSED, shift);
        else if (reg >= SSA_FIXED_MINIMUM)
                return default_phys_reg(SSA_REG_FROM_FIXED(reg), shift);
        else if (!l)
                return default_phys_reg(REGISTER_UNUSED, shift);

        struct phys_reg r = {
                .reg = l->solutions[reg] / 16,
                .offset = l->solutions[reg] & 0xF,
                .shift = shift
        };

        /* Report that we actually use this register, and return it */

        if (r.reg < 16)
                ctx->info->work_reg_count = MAX2(ctx->info->work_reg_count, r.reg + 1);

        return r;
}

static void
set_class(unsigned *classes, unsigned node, unsigned class)
{
        if (node < SSA_FIXED_MINIMUM && class != classes[node]) {
                assert(classes[node] == REG_CLASS_WORK);
                classes[node] = class;
        }
}

/* Special register classes impose special constraints on who can read their
 * values, so check that */

static bool ASSERTED
check_read_class(unsigned *classes, unsigned tag, unsigned node)
{
        /* Non-nodes are implicitly ok */
        if (node >= SSA_FIXED_MINIMUM)
                return true;

        switch (classes[node]) {
        case REG_CLASS_LDST:
                return (tag == TAG_LOAD_STORE_4);
        case REG_CLASS_TEXR:
                return (tag == TAG_TEXTURE_4);
        case REG_CLASS_TEXW:
                return (tag != TAG_LOAD_STORE_4);
        case REG_CLASS_WORK:
                return IS_ALU(tag);
        default:
                unreachable("Invalid class");
        }
}

static bool ASSERTED
check_write_class(unsigned *classes, unsigned tag, unsigned node)
{
        /* Non-nodes are implicitly ok */
        if (node >= SSA_FIXED_MINIMUM)
                return true;

        switch (classes[node]) {
        case REG_CLASS_TEXR:
                return true;
        case REG_CLASS_TEXW:
                return (tag == TAG_TEXTURE_4);
        case REG_CLASS_LDST:
        case REG_CLASS_WORK:
                return IS_ALU(tag) || (tag == TAG_LOAD_STORE_4);
        default:
                unreachable("Invalid class");
        }
}

/* Prepass before RA to ensure special class restrictions are met. The idea is
 * to create a bit field of types of instructions that read a particular index.
 * Later, we'll add moves as appropriate and rewrite to specialize by type. */

static void
mark_node_class (unsigned *bitfield, unsigned node)
{
        if (node < SSA_FIXED_MINIMUM)
                BITSET_SET(bitfield, node);
}

void
mir_lower_special_reads(compiler_context *ctx)
{
        size_t sz = BITSET_WORDS(ctx->temp_count) * sizeof(BITSET_WORD);

        /* Bitfields for the various types of registers we could have. aluw can
         * be written by either ALU or load/store */

        unsigned *alur = calloc(sz, 1);
        unsigned *aluw = calloc(sz, 1);
        unsigned *brar = calloc(sz, 1);
        unsigned *ldst = calloc(sz, 1);
        unsigned *texr = calloc(sz, 1);
        unsigned *texw = calloc(sz, 1);

        /* Pass #1 is analysis, a linear scan to fill out the bitfields */

        mir_foreach_instr_global(ctx, ins) {
                switch (ins->type) {
                case TAG_ALU_4:
                        mark_node_class(aluw, ins->dest);
                        mark_node_class(alur, ins->src[0]);
                        mark_node_class(alur, ins->src[1]);
                        mark_node_class(alur, ins->src[2]);

                        if (ins->compact_branch && ins->writeout)
                                mark_node_class(brar, ins->src[0]);

                        break;

                case TAG_LOAD_STORE_4:
                        mark_node_class(aluw, ins->dest);
                        mark_node_class(ldst, ins->src[0]);
                        mark_node_class(ldst, ins->src[1]);
                        mark_node_class(ldst, ins->src[2]);
                        mark_node_class(ldst, ins->src[3]);
                        break;

                case TAG_TEXTURE_4:
                        mark_node_class(texr, ins->src[0]);
                        mark_node_class(texr, ins->src[1]);
                        mark_node_class(texr, ins->src[2]);
                        mark_node_class(texw, ins->dest);
                        break;

                default:
                        break;
                }
        }

        /* Pass #2 is lowering now that we've analyzed all the classes.
         * Conceptually, if an index is only marked for a single type of use,
         * there is nothing to lower. If it is marked for different uses, we
         * split up based on the number of types of uses. To do so, we divide
         * into N distinct classes of use (where N>1 by definition), emit N-1
         * moves from the index to copies of the index, and finally rewrite N-1
         * of the types of uses to use the corresponding move */
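
        /* As a purely illustrative example: if a temporary is read by both ALU
         * and load/store instructions, those uses collide, so we emit
         *
         *    mov copy, temp
         *
         * and rewrite the load/store uses to read the copy, while ALU uses
         * keep reading the original temp; each index then lives in a single
         * register class. */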

        unsigned spill_idx = ctx->temp_count;

        for (unsigned i = 0; i < ctx->temp_count; ++i) {
                bool is_alur = BITSET_TEST(alur, i);
                bool is_aluw = BITSET_TEST(aluw, i);
                bool is_brar = BITSET_TEST(brar, i);
                bool is_ldst = BITSET_TEST(ldst, i);
                bool is_texr = BITSET_TEST(texr, i);
                bool is_texw = BITSET_TEST(texw, i);

                /* Analyse to check how many distinct uses there are. ALU ops
                 * (alur) can read the results of the texture pipeline (texw)
                 * but not ldst or texr. Load/store ops (ldst) cannot read
                 * anything but load/store inputs. Texture pipeline cannot read
                 * anything but texture inputs. TODO: Simplify. */

                bool collision =
                        (is_alur && (is_ldst || is_texr)) ||
                        (is_ldst && (is_alur || is_texr || is_texw)) ||
                        (is_texr && (is_alur || is_ldst || is_texw)) ||
                        (is_texw && (is_aluw || is_ldst || is_texr)) ||
                        (is_brar && is_texw);

                if (!collision)
                        continue;

                /* Use the index as-is as the work copy. Emit copies for
                 * special uses */

                unsigned classes[] = { TAG_LOAD_STORE_4, TAG_TEXTURE_4, TAG_TEXTURE_4, TAG_ALU_4 };
                bool collisions[] = { is_ldst, is_texr, is_texw && is_aluw, is_brar };

                for (unsigned j = 0; j < ARRAY_SIZE(collisions); ++j) {
                        if (!collisions[j]) continue;

                        /* When the hazard is from reading, we move and rewrite
                         * sources (typical case). When it's from writing, we
                         * flip the move and rewrite destinations (obscure,
                         * only from control flow -- impossible in SSA) */

                        bool hazard_write = (j == 2);

                        unsigned idx = spill_idx++;

                        /* Insert move before each read/write, depending on the
                         * hazard we're trying to account for */

                        mir_foreach_instr_global_safe(ctx, pre_use) {
                                if (pre_use->type != classes[j])
                                        continue;

                                if (hazard_write) {
                                        if (pre_use->dest != i)
                                                continue;

                                        midgard_instruction m = v_mov(idx, i);
                                        m.dest_type = pre_use->dest_type;
                                        m.src_types[1] = m.dest_type;
                                        m.mask = pre_use->mask;

                                        midgard_instruction *use = mir_next_op(pre_use);
                                        assert(use);
                                        mir_insert_instruction_before(ctx, use, m);
                                        mir_rewrite_index_dst_single(pre_use, i, idx);
                                } else {
                                        if (!mir_has_arg(pre_use, i))
                                                continue;

                                        idx = spill_idx++;

                                        midgard_instruction m = v_mov(i, idx);
                                        m.mask = mir_from_bytemask(mir_round_bytemask_up(
                                                                mir_bytemask_of_read_components(pre_use, i), 32), 32);
                                        mir_insert_instruction_before(ctx, pre_use, m);
                                        mir_rewrite_index_src_single(pre_use, i, idx);
                                }
                        }
                }
        }

        free(alur);
        free(aluw);
        free(brar);
        free(ldst);
        free(texr);
        free(texw);
}

static void
mir_compute_interference(
                compiler_context *ctx,
                struct lcra_state *l)
{
        /* First, we need liveness information to be computed per block */
        mir_compute_liveness(ctx);

        /* We need to force r1.w live throughout a blend shader */

        if (ctx->inputs->is_blend) {
                unsigned r1w = ~0;

                mir_foreach_block(ctx, _block) {
                        midgard_block *block = (midgard_block *) _block;
                        mir_foreach_instr_in_block_rev(block, ins) {
                                if (ins->writeout)
                                        r1w = ins->dest;
                        }

                        if (r1w != ~0)
                                break;
                }

                mir_foreach_instr_global(ctx, ins) {
                        if (ins->dest < ctx->temp_count)
                                lcra_add_node_interference(l, ins->dest, mir_bytemask(ins), r1w, 0xF);
                }
        }

        /* Now that every block has live_in/live_out computed, we can determine
         * interference by walking each block linearly. Take live_out at the
         * end of each block and walk the block backwards. */
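
        /* Note that interference is recorded per byte: mir_bytemask(ins) gives
         * the bytes written by the instruction and live[i] the bytes of node i
         * still live, so values touching disjoint bytes of a register need not
         * conflict. */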

        mir_foreach_block(ctx, _blk) {
                midgard_block *blk = (midgard_block *) _blk;
                uint16_t *live = mem_dup(_blk->live_out, ctx->temp_count * sizeof(uint16_t));

                mir_foreach_instr_in_block_rev(blk, ins) {
                        /* Mark all registers live after the instruction as
                         * interfering with the destination */

                        unsigned dest = ins->dest;

                        if (dest < ctx->temp_count) {
                                for (unsigned i = 0; i < ctx->temp_count; ++i) {
                                        if (live[i]) {
                                                unsigned mask = mir_bytemask(ins);
                                                lcra_add_node_interference(l, dest, mask, i, live[i]);
                                        }
                                }
                        }

                        /* Add blend shader interference: blend shaders might
                         * clobber r0-r3. */
                        if (ins->compact_branch && ins->writeout) {
                                for (unsigned i = 0; i < ctx->temp_count; ++i) {
                                        if (!live[i])
                                                continue;

                                        for (unsigned j = 0; j < 4; j++) {
                                                lcra_add_node_interference(l, ctx->temp_count + j,
                                                                0xFFFF,
                                                                i, live[i]);
                                        }
                                }
                        }

                        /* Update live_in */
                        mir_liveness_ins_update(live, ins, ctx->temp_count);
                }

                free(live);
        }
}

static bool
mir_is_64(midgard_instruction *ins)
{
        if (nir_alu_type_get_type_size(ins->dest_type) == 64)
                return true;

        mir_foreach_src(ins, v) {
                if (nir_alu_type_get_type_size(ins->src_types[v]) == 64)
                        return true;
        }

        return false;
}

/* This routine performs the actual register allocation. It should be succeeded
 * by install_registers */

static struct lcra_state *
allocate_registers(compiler_context *ctx, bool *spilled)
{
        /* The number of vec4 work registers available depends on the number of
         * register-mapped uniforms and the shader stage. By ABI we limit blend
         * shaders to 8 registers, should be lower XXX */
        int rmu = ctx->info->push.count / 4;
        int work_count = ctx->inputs->is_blend ? 8 : 16 - MAX2(rmu - 8, 0);

        /* No register allocation to do with no SSA */

        if (!ctx->temp_count)
                return NULL;

        /* Initialize LCRA. Allocate extra node at the end for r1-r3 for
         * interference */

        struct lcra_state *l = lcra_alloc_equations(ctx->temp_count + 4, 5);
        unsigned node_r1 = ctx->temp_count + 1;

        /* Starts of classes, in bytes */
        l->class_start[REG_CLASS_WORK] = 16 * 0;
        l->class_start[REG_CLASS_LDST] = 16 * 26;
        l->class_start[REG_CLASS_TEXR] = 16 * 28;
        l->class_start[REG_CLASS_TEXW] = 16 * 28;

        l->class_size[REG_CLASS_WORK] = 16 * work_count;
        l->class_size[REG_CLASS_LDST] = 16 * 2;
        l->class_size[REG_CLASS_TEXR] = 16 * 2;
        l->class_size[REG_CLASS_TEXW] = 16 * 2;

        lcra_set_disjoint_class(l, REG_CLASS_TEXR, REG_CLASS_TEXW);

        /* To save space on T*20, we don't have real texture registers.
         * Instead, tex inputs reuse the load/store pipeline registers, and
         * tex outputs use work r0/r1. Note we still use TEXR/TEXW classes,
         * noting that this handles interferences and sizes correctly. */

        if (ctx->quirks & MIDGARD_INTERPIPE_REG_ALIASING) {
                l->class_start[REG_CLASS_TEXR] = l->class_start[REG_CLASS_LDST];
                l->class_start[REG_CLASS_TEXW] = l->class_start[REG_CLASS_WORK];
        }
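
        /* In vec4 units, the defaults above lay out work registers at
         * r0..r(work_count - 1), load/store registers at r26-r27 and texture
         * registers at r28-r29 (each class_start is a byte offset of
         * 16 * register); with MIDGARD_INTERPIPE_REG_ALIASING the texture
         * classes alias the load/store and work files instead. */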

        unsigned *found_class = calloc(sizeof(unsigned), ctx->temp_count);
        unsigned *min_alignment = calloc(sizeof(unsigned), ctx->temp_count);
        unsigned *min_bound = calloc(sizeof(unsigned), ctx->temp_count);

        mir_foreach_instr_global(ctx, ins) {
                /* Swizzles of 32-bit sources on 64-bit instructions need to be
                 * aligned to either bottom (xy) or top (zw). More general
                 * swizzle lowering should happen prior to scheduling (TODO),
                 * but once we get RA we shouldn't disrupt this further. Align
                 * sources of 64-bit instructions. */

                if (ins->type == TAG_ALU_4 && mir_is_64(ins)) {
                        mir_foreach_src(ins, v) {
                                unsigned s = ins->src[v];

                                if (s < ctx->temp_count)
                                        min_alignment[s] = 3;
                        }
                }

                if (ins->type == TAG_LOAD_STORE_4 && OP_HAS_ADDRESS(ins->op)) {
                        mir_foreach_src(ins, v) {
                                unsigned s = ins->src[v];
                                unsigned size = nir_alu_type_get_type_size(ins->src_types[v]);

                                if (s < ctx->temp_count)
                                        min_alignment[s] = (size == 64) ? 3 : 2;
                        }
                }

                if (ins->dest >= SSA_FIXED_MINIMUM) continue;

                unsigned size = nir_alu_type_get_type_size(ins->dest_type);

                if (ins->is_pack)
                        size = 32;

                /* 0 for x, 1 for xy, 2 for xyz, 3 for xyzw */
                int comps1 = util_logbase2(ins->mask);

                int bytes = (comps1 + 1) * (size / 8);

                /* Use the largest class if there's ambiguity, this
                 * handles partial writes */

                int dest = ins->dest;
                found_class[dest] = MAX2(found_class[dest], bytes);

                min_alignment[dest] =
                        (size == 16) ? 1 : /* (1 << 1) = 2-byte */
                        (size == 32) ? 2 : /* (1 << 2) = 4-byte */
                        (size == 64) ? 3 : /* (1 << 3) = 8-byte */
                        3; /* 8-bit todo */

                /* We can't cross xy/zw boundaries. TODO: vec8 can */
                if (size == 16)
                        min_bound[dest] = 8;

                mir_foreach_src(ins, s) {
                        unsigned src_size = nir_alu_type_get_type_size(ins->src_types[s]);
                        if (src_size == 16 && ins->src[s] < SSA_FIXED_MINIMUM)
                                min_bound[ins->src[s]] = MAX2(min_bound[ins->src[s]], 8);
                }

                /* We don't have a swizzle for the conditional and we don't
                 * want to muck with the conditional itself, so just force
                 * alignment for now */

                if (ins->type == TAG_ALU_4 && OP_IS_CSEL_V(ins->op)) {
                        min_alignment[dest] = 4; /* 1 << 4 = 16-byte = vec4 */

                        /* LCRA assumes bound >= alignment */
                        min_bound[dest] = 16;
                }

                /* Since ld/st swizzles and masks are 32-bit only, we need them
                 * aligned to enable final packing */
                if (ins->type == TAG_LOAD_STORE_4)
                        min_alignment[dest] = MAX2(min_alignment[dest], 2);
        }

        for (unsigned i = 0; i < ctx->temp_count; ++i) {
                lcra_set_alignment(l, i, min_alignment[i] ? min_alignment[i] : 2,
                                min_bound[i] ? min_bound[i] : 16);
                lcra_restrict_range(l, i, found_class[i]);
        }

        free(found_class);
        free(min_alignment);
        free(min_bound);
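
        /* Worked example (illustrative): a 16-bit destination written with
         * mask 0x3 (a vec2) gets bytes = 2 * 2 = 4, an alignment of
         * 1 << 1 = 2 bytes and a bound of 8 bytes, so it cannot straddle the
         * xy/zw halves of its register. */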

        /* Next, we'll determine semantic class. We default to zero (work).
         * But, if we're used with a special operation, that will force us to a
         * particular class. Each node must be assigned to exactly one class; a
         * prepass before RA should have lowered what-would-have-been
         * multiclass nodes into a series of moves to break it up into multiple
         * nodes (TODO) */

        mir_foreach_instr_global(ctx, ins) {
                /* Check if this operation imposes any classes */

                if (ins->type == TAG_LOAD_STORE_4) {
                        set_class(l->class, ins->src[0], REG_CLASS_LDST);
                        set_class(l->class, ins->src[1], REG_CLASS_LDST);
                        set_class(l->class, ins->src[2], REG_CLASS_LDST);
                        set_class(l->class, ins->src[3], REG_CLASS_LDST);

                        if (OP_IS_VEC4_ONLY(ins->op)) {
                                lcra_restrict_range(l, ins->dest, 16);
                                lcra_restrict_range(l, ins->src[0], 16);
                                lcra_restrict_range(l, ins->src[1], 16);
                                lcra_restrict_range(l, ins->src[2], 16);
                                lcra_restrict_range(l, ins->src[3], 16);
                        }
                } else if (ins->type == TAG_TEXTURE_4) {
                        set_class(l->class, ins->dest, REG_CLASS_TEXW);
                        set_class(l->class, ins->src[0], REG_CLASS_TEXR);
                        set_class(l->class, ins->src[1], REG_CLASS_TEXR);
                        set_class(l->class, ins->src[2], REG_CLASS_TEXR);
                        set_class(l->class, ins->src[3], REG_CLASS_TEXR);
                }
        }

        /* Check that the semantics of the class are respected */
        mir_foreach_instr_global(ctx, ins) {
                assert(check_write_class(l->class, ins->type, ins->dest));
                assert(check_read_class(l->class, ins->type, ins->src[0]));
                assert(check_read_class(l->class, ins->type, ins->src[1]));
                assert(check_read_class(l->class, ins->type, ins->src[2]));
                assert(check_read_class(l->class, ins->type, ins->src[3]));
        }

        /* Mark writeout to r0, depth to r1.x, stencil to r1.y,
         * render target to r1.z, unknown to r1.w */
        mir_foreach_instr_global(ctx, ins) {
                if (!(ins->compact_branch && ins->writeout)) continue;

                if (ins->src[0] < ctx->temp_count)
                        l->solutions[ins->src[0]] = 0;

                if (ins->src[2] < ctx->temp_count)
                        l->solutions[ins->src[2]] = (16 * 1) + COMPONENT_X * 4;

                if (ins->src[3] < ctx->temp_count)
                        l->solutions[ins->src[3]] = (16 * 1) + COMPONENT_Y * 4;

                if (ins->src[1] < ctx->temp_count)
                        l->solutions[ins->src[1]] = (16 * 1) + COMPONENT_Z * 4;

                if (ins->dest < ctx->temp_count)
                        l->solutions[ins->dest] = (16 * 1) + COMPONENT_W * 4;
        }
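
        /* In bytes, those solutions put the colour source at 0 (r0) and the
         * depth/stencil/render-target sources at 16/20/24 (r1.x/r1.y/r1.z),
         * with the branch destination at 28 (r1.w), matching the comment
         * above. */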

        /* Destinations of instructions in a writeout block cannot be assigned
         * to r1 unless they are actually used as r1 from the writeout itself,
         * since the writes to r1 are special. A code sequence like:
         *
         *      sadd.fmov r1.x, [...]
         *      vadd.fadd r0, r1, r2
         *      [writeout branch]
         *
         * will misbehave since the r1.x write will be interpreted as a
         * gl_FragDepth write so it won't show up correctly when r1 is read in
         * the following segment. We model this as interference.
         */

        for (unsigned i = 0; i < 4; ++i)
                l->solutions[ctx->temp_count + i] = (16 * i);

        mir_foreach_block(ctx, _blk) {
                midgard_block *blk = (midgard_block *) _blk;

                mir_foreach_bundle_in_block(blk, v) {
                        /* We need at least a writeout and nonwriteout instruction */
                        if (v->instruction_count < 2)
                                continue;

                        /* Branches always come at the end */
                        midgard_instruction *br = v->instructions[v->instruction_count - 1];

                        if (!br->writeout)
                                continue;

                        for (signed i = v->instruction_count - 2; i >= 0; --i) {
                                midgard_instruction *ins = v->instructions[i];

                                if (ins->dest >= ctx->temp_count)
                                        continue;

                                bool used_as_r1 = (br->dest == ins->dest);

                                mir_foreach_src(br, s)
                                        used_as_r1 |= (s > 0) && (br->src[s] == ins->dest);

                                if (!used_as_r1)
                                        lcra_add_node_interference(l, ins->dest, mir_bytemask(ins), node_r1, 0xFFFF);
                        }
                }
        }

        /* Precolour blend input to r0. Note writeout is necessarily at the end
         * and blend shaders are single-RT only so there is only a single
         * writeout block, so this cannot conflict with the writeout r0 (there
         * is no need to have an intermediate move) */

        if (ctx->blend_input != ~0) {
                assert(ctx->blend_input < ctx->temp_count);
                l->solutions[ctx->blend_input] = 0;
        }

        /* Same for the dual-source blend input/output, except here we use r2,
         * which is also set in the fragment shader. */

        if (ctx->blend_src1 != ~0) {
                assert(ctx->blend_src1 < ctx->temp_count);
                l->solutions[ctx->blend_src1] = (16 * 2);
                ctx->info->work_reg_count = MAX2(ctx->info->work_reg_count, 3);
        }

        mir_compute_interference(ctx, l);

        *spilled = !lcra_solve(l);
        return l;
}

/* Once registers have been decided via register allocation
 * (allocate_registers), we need to rewrite the MIR to use registers instead of
 * indices */
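
/* As a rough example, a 32-bit source whose LCRA solution is 20 decodes to r1
 * with byte offset 4, so the source index is rewritten to the fixed register
 * r1 and (for a destination at offset 0) offset_swizzle() bumps each swizzle
 * component up by one 32-bit lane: identity .xyzw becomes .yzww, clamped at
 * the end of the register. */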

static void
install_registers_instr(
                compiler_context *ctx,
                struct lcra_state *l,
                midgard_instruction *ins)
{
        unsigned src_shift[MIR_SRC_COUNT];

        for (unsigned i = 0; i < MIR_SRC_COUNT; ++i) {
                src_shift[i] =
                        util_logbase2(nir_alu_type_get_type_size(ins->src_types[i]) / 8);
        }

        unsigned dest_shift =
                util_logbase2(nir_alu_type_get_type_size(ins->dest_type) / 8);

        switch (ins->type) {
        case TAG_ALU_4:
        case TAG_ALU_8:
        case TAG_ALU_12:
        case TAG_ALU_16: {
                if (ins->compact_branch)
                        return;

                struct phys_reg src1 = index_to_reg(ctx, l, ins->src[0], src_shift[0]);
                struct phys_reg src2 = index_to_reg(ctx, l, ins->src[1], src_shift[1]);
                struct phys_reg dest = index_to_reg(ctx, l, ins->dest, dest_shift);

                mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset);

                unsigned dest_offset =
                        GET_CHANNEL_COUNT(alu_opcode_props[ins->op].props) ? 0 :
                        dest.offset;

                offset_swizzle(ins->swizzle[0], src1.offset, src1.shift, dest.shift, dest_offset);
                if (!ins->has_inline_constant)
                        offset_swizzle(ins->swizzle[1], src2.offset, src2.shift, dest.shift, dest_offset);
                if (ins->src[0] != ~0)
                        ins->src[0] = SSA_FIXED_REGISTER(src1.reg);
                if (ins->src[1] != ~0)
                        ins->src[1] = SSA_FIXED_REGISTER(src2.reg);
                if (ins->dest != ~0)
                        ins->dest = SSA_FIXED_REGISTER(dest.reg);
                break;
        }

        case TAG_LOAD_STORE_4: {
                /* Which physical register we read off depends on
                 * whether we are loading or storing -- think about the
                 * logical dataflow */

                bool encodes_src = OP_IS_STORE(ins->op);

                if (encodes_src) {
                        struct phys_reg src = index_to_reg(ctx, l, ins->src[0], src_shift[0]);
                        assert(src.reg == 26 || src.reg == 27);

                        ins->src[0] = SSA_FIXED_REGISTER(src.reg);
                        offset_swizzle(ins->swizzle[0], src.offset, src.shift, 0, 0);
                } else {
                        struct phys_reg dst = index_to_reg(ctx, l, ins->dest, dest_shift);

                        ins->dest = SSA_FIXED_REGISTER(dst.reg);
                        offset_swizzle(ins->swizzle[0], 0, 2, 2, dst.offset);
                        mir_set_bytemask(ins, mir_bytemask(ins) << dst.offset);
                }

                /* We also follow up by actual arguments */

                for (int i = 1; i <= 3; i++) {
                        unsigned src_index = ins->src[i];
                        if (src_index != ~0) {
                                struct phys_reg src = index_to_reg(ctx, l, src_index, src_shift[i]);
                                unsigned component = src.offset >> src.shift;
                                assert(component << src.shift == src.offset);
                                ins->src[i] = SSA_FIXED_REGISTER(src.reg);
                                ins->swizzle[i][0] += component;
                        }
                }

                break;
        }
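
        /* Note that stored data always comes from one of the two load/store
         * pipeline registers (r26/r27 here), per the assert above, while the
         * remaining ld/st arguments are scalar: only swizzle[i][0] selects
         * their 32-bit component. */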

        case TAG_TEXTURE_4: {
                if (ins->op == midgard_tex_op_barrier)
                        break;

                /* Grab RA results */
                struct phys_reg dest = index_to_reg(ctx, l, ins->dest, dest_shift);
                struct phys_reg coord = index_to_reg(ctx, l, ins->src[1], src_shift[1]);
                struct phys_reg lod = index_to_reg(ctx, l, ins->src[2], src_shift[2]);
                struct phys_reg offset = index_to_reg(ctx, l, ins->src[3], src_shift[3]);

                /* First, install the texture coordinate */
                if (ins->src[1] != ~0)
                        ins->src[1] = SSA_FIXED_REGISTER(coord.reg);
                offset_swizzle(ins->swizzle[1], coord.offset, coord.shift, dest.shift, 0);

                /* Next, install the destination */
                if (ins->dest != ~0)
                        ins->dest = SSA_FIXED_REGISTER(dest.reg);
                offset_swizzle(ins->swizzle[0], 0, 2, dest.shift,
                                dest_shift == 1 ? dest.offset % 8 :
                                dest.offset);
                mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset);

                /* If there is a register LOD/bias, use it */
                if (ins->src[2] != ~0) {
                        assert(!(lod.offset & 3));
                        ins->src[2] = SSA_FIXED_REGISTER(lod.reg);
                        ins->swizzle[2][0] = lod.offset / 4;
                }

                /* If there is an offset register, install it */
                if (ins->src[3] != ~0) {
                        ins->src[3] = SSA_FIXED_REGISTER(offset.reg);
                        ins->swizzle[3][0] = offset.offset / 4;
                }

                break;
        }

        default:
                break;
        }
}

static void
install_registers(compiler_context *ctx, struct lcra_state *l)
{
        mir_foreach_instr_global(ctx, ins)
                install_registers_instr(ctx, l, ins);
}

/* If register allocation fails, find the best spill node */

static signed
mir_choose_spill_node(
                compiler_context *ctx,
                struct lcra_state *l)
{
        /* We can't spill a previously spilled value or an unspill */

        mir_foreach_instr_global(ctx, ins) {
                if (ins->no_spill & (1 << l->spill_class)) {
                        lcra_set_node_spill_cost(l, ins->dest, -1);

                        if (l->spill_class != REG_CLASS_WORK) {
                                mir_foreach_src(ins, s)
                                        lcra_set_node_spill_cost(l, ins->src[s], -1);
                        }
                }
        }

        return lcra_get_best_spill_node(l);
}
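
/* no_spill is a bitmask indexed by register class, so a node may be
 * unspillable for the work class yet remain a candidate when spilling a
 * special class; a spill cost of -1 excludes the node from LCRA's choice. */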

/* Once we've chosen a spill node, spill it */

static void
mir_spill_register(
                compiler_context *ctx,
                unsigned spill_node,
                unsigned spill_class,
                unsigned *spill_count)
{
        if (spill_class == REG_CLASS_WORK && ctx->inputs->is_blend)
                unreachable("Blend shader spilling is currently unimplemented");

        unsigned spill_index = ctx->temp_count;

        /* We have a spill node, so check the class. Work registers
         * legitimately spill to TLS, but special registers just spill to work
         * registers */

        bool is_special = spill_class != REG_CLASS_WORK;
        bool is_special_w = spill_class == REG_CLASS_TEXW;

        /* Allocate TLS slot (maybe) */
        unsigned spill_slot = !is_special ? (*spill_count)++ : 0;

        /* For special reads, figure out how many bytes we need */
        unsigned read_bytemask = 0;

        /* If multiple instructions write to this destination, we'll have to
         * fill from TLS before writing */
        unsigned write_count = 0;

        mir_foreach_instr_global_safe(ctx, ins) {
                read_bytemask |= mir_bytemask_of_read_components(ins, spill_node);
                if (ins->dest == spill_node)
                        ++write_count;
        }

        /* For TLS, replace all stores to the spilled node. For
         * special reads, just keep as-is; the class will be demoted
         * implicitly. For special writes, spill to a work register */

        if (!is_special || is_special_w) {
                if (is_special_w)
                        spill_slot = spill_index++;

                mir_foreach_block(ctx, _block) {
                        midgard_block *block = (midgard_block *) _block;
                        mir_foreach_instr_in_block_safe(block, ins) {
                                if (ins->dest != spill_node) continue;

                                /* Note: it's important to match the mask of the spill
                                 * with the mask of the instruction whose destination
                                 * we're spilling, or otherwise we'll read invalid
                                 * components and can fail RA in a subsequent iteration
                                 */

                                if (is_special_w) {
                                        midgard_instruction st = v_mov(spill_node, spill_slot);
                                        st.no_spill |= (1 << spill_class);
                                        st.mask = ins->mask;
                                        st.dest_type = st.src_types[1] = ins->dest_type;

                                        /* Hint: don't rewrite this node */
                                        st.hint = true;

                                        mir_insert_instruction_after_scheduled(ctx, block, ins, st);
                                } else {
                                        unsigned dest = spill_index++;

                                        if (write_count > 1 && mir_bytemask(ins) != 0xF) {
                                                midgard_instruction read =
                                                        v_load_store_scratch(dest, spill_slot, false, 0xF);
                                                mir_insert_instruction_before_scheduled(ctx, block, ins, read);
                                        }

                                        ins->dest = dest;
                                        ins->no_spill |= (1 << spill_class);

                                        bool move = false;

                                        /* In the same bundle, reads of the destination
                                         * of the spilt instruction need to be direct */
                                        midgard_instruction *it = ins;
                                        while ((it = list_first_entry(&it->link, midgard_instruction, link))
                                               && (it->bundle_id == ins->bundle_id)) {

                                                if (!mir_has_arg(it, spill_node)) continue;

                                                mir_rewrite_index_src_single(it, spill_node, dest);

                                                /* The spilt instruction will write to
                                                 * a work register for `it` to read but
                                                 * the spill needs an LD/ST register */
                                                move = true;
                                        }

                                        if (move)
                                                dest = spill_index++;

                                        midgard_instruction st =
                                                v_load_store_scratch(dest, spill_slot, true, ins->mask);
                                        mir_insert_instruction_after_scheduled(ctx, block, ins, st);

                                        if (move) {
                                                midgard_instruction mv = v_mov(ins->dest, dest);
                                                mv.no_spill |= (1 << spill_class);

                                                mir_insert_instruction_after_scheduled(ctx, block, ins, mv);
                                        }
                                }

                                if (!is_special)
                                        ctx->spills++;
                        }
                }
        }
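
        /* Schematically (mnemonics illustrative), a TLS spill of a definition
         *
         *    vadd.fadd spilled, [...]
         *
         * becomes
         *
         *    vadd.fadd new_temp, [...]
         *    st_scratch new_temp, [spill slot]
         *
         * with later uses filled back from TLS below. */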

        /* Insert a load from TLS before the first consecutive
         * use of the node, rewriting to use spilled indices to
         * break up the live range. Or, for special, insert a
         * move. Ironically the latter *increases* register
         * pressure, but the two uses of the spilling mechanism
         * are somewhat orthogonal. (special spilling is to use
         * work registers to back special registers; TLS
         * spilling is to use memory to back work registers) */

        mir_foreach_block(ctx, _block) {
                midgard_block *block = (midgard_block *) _block;
                mir_foreach_instr_in_block(block, ins) {
                        /* We can't rewrite the moves used to spill in the
                         * first place. These moves are hinted. */
                        if (ins->hint) continue;

                        /* If we don't use the spilled value, nothing to do */
                        if (!mir_has_arg(ins, spill_node)) continue;

                        unsigned index = 0;

                        if (!is_special_w) {
                                index = ++spill_index;

                                midgard_instruction *before = ins;
                                midgard_instruction st;

                                if (is_special) {
                                        /* Move */
                                        st = v_mov(spill_node, index);
                                        st.no_spill |= (1 << spill_class);
                                } else {
                                        /* TLS load */
                                        st = v_load_store_scratch(index, spill_slot, false, 0xF);
                                }

                                /* Mask the load based on the component count
                                 * actually needed to prevent RA loops */

                                st.mask = mir_from_bytemask(mir_round_bytemask_up(
                                                        read_bytemask, 32), 32);

                                mir_insert_instruction_before_scheduled(ctx, block, before, st);
                        } else {
                                /* Special writes already have their move spilled in */
                                index = spill_slot;
                        }

                        /* Rewrite to use */
                        mir_rewrite_index_src_single(ins, spill_node, index);

                        if (!is_special)
                                ctx->fills++;
                }
        }

        /* Reset hints */

        mir_foreach_instr_global(ctx, ins) {
                ins->hint = false;
        }
}
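
/* Register-mapped uniforms are assigned downwards from r23, so the
 * push-constant word index for a demoted register is (23 - reg) * 4 below;
 * each demoted read becomes an explicit UBO load into a fresh temporary,
 * freeing the corresponding work register for the allocator. */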

static void
mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff)
{
        unsigned uniforms = ctx->info->push.count / 4;
        unsigned old_work_count = 16 - MAX2(uniforms - 8, 0);
        unsigned work_count = 16 - MAX2((new_cutoff - 8), 0);

        unsigned min_demote = SSA_FIXED_REGISTER(old_work_count);
        unsigned max_demote = SSA_FIXED_REGISTER(work_count);

        mir_foreach_block(ctx, _block) {
                midgard_block *block = (midgard_block *) _block;
                mir_foreach_instr_in_block(block, ins) {
                        mir_foreach_src(ins, i) {
                                if (ins->src[i] < min_demote || ins->src[i] >= max_demote)
                                        continue;

                                midgard_instruction *before = ins;

                                unsigned temp = make_compiler_temp(ctx);
                                unsigned idx = (23 - SSA_REG_FROM_FIXED(ins->src[i])) * 4;
                                assert(idx < ctx->info->push.count);

                                ctx->ubo_mask |= BITSET_BIT(ctx->info->push.words[idx].ubo);

                                midgard_instruction ld = {
                                        .type = TAG_LOAD_STORE_4,
                                        .mask = 0xF,
                                        .dest = temp,
                                        .dest_type = ins->src_types[i],
                                        .src = { ~0, ~0, ~0, ~0 },
                                        .swizzle = SWIZZLE_IDENTITY_4,
                                        .op = midgard_op_ld_ubo_128,
                                        .load_store = {
                                                .index_reg = REGISTER_LDST_ZERO,
                                        },
                                        .constants.u32[0] = ctx->info->push.words[idx].offset
                                };

                                midgard_pack_ubo_index_imm(&ld.load_store,
                                                ctx->info->push.words[idx].ubo);

                                mir_insert_instruction_before_scheduled(ctx, block, before, ld);

                                mir_rewrite_index_src_single(ins, ins->src[i], temp);
                        }
                }
        }

        ctx->info->push.count = MIN2(ctx->info->push.count, new_cutoff * 4);
}

/* Run register allocation in a loop, spilling until we succeed */

void
mir_ra(compiler_context *ctx)
{
        struct lcra_state *l = NULL;
        bool spilled = false;
        int iter_count = 1000; /* max iterations */

        /* Number of 128-bit slots in memory we've spilled into */
        unsigned spill_count = DIV_ROUND_UP(ctx->info->tls_size, 16);

        mir_create_pipeline_registers(ctx);

        do {
                if (spilled) {
                        signed spill_node = mir_choose_spill_node(ctx, l);
                        unsigned uniforms = ctx->info->push.count / 4;

                        /* It's a lot cheaper to demote uniforms to get more
                         * work registers than to spill to TLS. */
                        if (l->spill_class == REG_CLASS_WORK && uniforms > 8) {
                                mir_demote_uniforms(ctx, MAX2(uniforms - 4, 8));
                        } else if (spill_node == -1) {
                                fprintf(stderr, "ERROR: Failed to choose spill node\n");
                                lcra_free(l);
                                return;
                        } else {
                                mir_spill_register(ctx, spill_node, l->spill_class, &spill_count);
                        }
                }

                mir_squeeze_index(ctx);
                mir_invalidate_liveness(ctx);

                if (l) {
                        lcra_free(l);
                        l = NULL;
                }

                l = allocate_registers(ctx, &spilled);
        } while(spilled && ((iter_count--) > 0));

        if (iter_count <= 0) {
                fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");
                assert(0);
        }

        /* Report spilling information. spill_count is in 128-bit slots (vec4 x
         * fp32), but tls_size is in bytes, so multiply by 16 */

        ctx->info->tls_size = spill_count * 16;

        install_registers(ctx, l);

        lcra_free(l);
}