Path: blob/21.2-virgl/src/panfrost/midgard/mir_promote_uniforms.c
/*
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *    Alyssa Rosenzweig <[email protected]>
 */

#include "compiler.h"
#include "util/u_math.h"
#include "util/u_memory.h"

/* This pass promotes reads from UBOs to register-mapped uniforms. This saves
 * both instructions and work register pressure, but it reduces the work
 * registers available, requiring a balance.
 *
 * We use a heuristic to determine the ideal count, implemented by
 * mir_work_heuristic, which returns the ideal number of work registers.
 */

static bool
mir_is_ubo(midgard_instruction *ins)
{
        return (ins->type == TAG_LOAD_STORE_4) &&
                (OP_IS_UBO_READ(ins->op));
}

static bool
mir_is_direct_aligned_ubo(midgard_instruction *ins)
{
        return mir_is_ubo(ins) &&
                !(ins->constants.u32[0] & 0xF) &&
                (ins->src[1] == ~0) &&
                (ins->src[2] == ~0);
}

/* Represents use data for a single UBO */

#define MAX_UBO_QWORDS (65536 / 16)

struct mir_ubo_block {
        BITSET_DECLARE(uses, MAX_UBO_QWORDS);
        BITSET_DECLARE(pushed, MAX_UBO_QWORDS);
};

struct mir_ubo_analysis {
        /* Per block analysis */
        unsigned nr_blocks;
        struct mir_ubo_block *blocks;
};

static struct mir_ubo_analysis
mir_analyze_ranges(compiler_context *ctx)
{
        struct mir_ubo_analysis res = {
                .nr_blocks = ctx->nir->info.num_ubos + 1,
        };

        res.blocks = calloc(res.nr_blocks, sizeof(struct mir_ubo_block));

        mir_foreach_instr_global(ctx, ins) {
                if (!mir_is_direct_aligned_ubo(ins)) continue;

                unsigned ubo = midgard_unpack_ubo_index_imm(ins->load_store);
                unsigned offset = ins->constants.u32[0] / 16;

                assert(ubo < res.nr_blocks);

                if (offset < MAX_UBO_QWORDS)
                        BITSET_SET(res.blocks[ubo].uses, offset);
        }

        return res;
}
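/* For illustration: each mir_ubo_block above tracks one bit per 16-byte vec4
 * slot of a UBO, so a direct, aligned read of bytes 32..47 of UBO 1 sets bit 2
 * of blocks[1].uses. A hypothetical helper (not used by this pass) to query
 * the analysis could look like:
 *
 *    static inline bool
 *    mir_ubo_vec4_used(struct mir_ubo_analysis *a, unsigned ubo,
 *                      unsigned byte_offset)
 *    {
 *            return BITSET_TEST(a->blocks[ubo].uses, byte_offset / 16);
 *    }
 */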
/* Select UBO words to push. A sophisticated implementation would consider the
 * number of uses and perhaps the control flow to estimate benefit. This is not
 * sophisticated. Select from the last UBO first to prioritize sysvals.
 */

static void
mir_pick_ubo(struct panfrost_ubo_push *push, struct mir_ubo_analysis *analysis,
             unsigned max_qwords)
{
        unsigned max_words = MIN2(PAN_MAX_PUSH, max_qwords * 4);

        for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) {
                struct mir_ubo_block *block = &analysis->blocks[ubo];

                unsigned vec4;
                BITSET_FOREACH_SET(vec4, block->uses, MAX_UBO_QWORDS) {
                        /* Don't push more than possible */
                        if (push->count > max_words - 4)
                                return;

                        for (unsigned offs = 0; offs < 4; ++offs) {
                                struct panfrost_ubo_word word = {
                                        .ubo = ubo,
                                        .offset = (vec4 * 16) + (offs * 4)
                                };

                                push->words[push->count++] = word;
                        }

                        /* Mark it as pushed so we can rewrite */
                        BITSET_SET(block->pushed, vec4);
                }
        }
}

#if 0
static void
mir_dump_ubo_analysis(struct mir_ubo_analysis *res)
{
        printf("%u blocks\n", res->nr_blocks);

        for (unsigned i = 0; i < res->nr_blocks; ++i) {
                BITSET_WORD *uses = res->blocks[i].uses;
                BITSET_WORD *push = res->blocks[i].pushed;

                unsigned last = BITSET_LAST_BIT_SIZED(uses, BITSET_WORDS(MAX_UBO_QWORDS));

                printf("\t");

                for (unsigned j = 0; j < last; ++j) {
                        bool used = BITSET_TEST(uses, j);
                        bool pushed = BITSET_TEST(push, j);
                        assert(used || !pushed);

                        putchar(pushed ? '*' : used ? '-' : '_');
                }

                printf("\n");
        }
}
#endif

static unsigned
mir_promoteable_uniform_count(struct mir_ubo_analysis *analysis)
{
        unsigned count = 0;

        for (unsigned i = 0; i < analysis->nr_blocks; ++i) {
                BITSET_WORD *uses = analysis->blocks[i].uses;

                for (unsigned w = 0; w < BITSET_WORDS(MAX_UBO_QWORDS); ++w)
                        count += util_bitcount(uses[w]);
        }

        return count;
}

static unsigned
mir_count_live(uint16_t *live, unsigned temp_count)
{
        unsigned count = 0;

        for (unsigned i = 0; i < temp_count; ++i)
                count += util_bitcount(live[i]);

        return count;
}

static unsigned
mir_estimate_pressure(compiler_context *ctx)
{
        mir_invalidate_liveness(ctx);
        mir_compute_liveness(ctx);

        unsigned max_live = 0;

        mir_foreach_block(ctx, _block) {
                midgard_block *block = (midgard_block *) _block;
                uint16_t *live = mem_dup(block->base.live_out, ctx->temp_count * sizeof(uint16_t));

                mir_foreach_instr_in_block_rev(block, ins) {
                        unsigned count = mir_count_live(live, ctx->temp_count);
                        max_live = MAX2(max_live, count);
                        mir_liveness_ins_update(live, ins, ctx->temp_count);
                }

                free(live);
        }

        return DIV_ROUND_UP(max_live, 16);
}

static unsigned
mir_work_heuristic(compiler_context *ctx, struct mir_ubo_analysis *analysis)
{
        unsigned uniform_count = mir_promoteable_uniform_count(analysis);

        /* If there are 8 or fewer uniforms, it doesn't matter what we do, so
         * allow as many work registers as needed */

        if (uniform_count <= 8)
                return 16;

        /* Otherwise, estimate the register pressure */

        unsigned pressure = mir_estimate_pressure(ctx);

        /* Prioritize not spilling above all else. The relation between the
         * pressure estimate and the actual register pressure is a little
         * murkier than we might like (due to scheduling, pipeline registers,
         * failure to pack vector registers, load/store registers, texture
         * registers...), hence why this is a heuristic parameter */

        if (pressure > 6)
                return 16;

        /* If there's no chance of spilling, prioritize UBOs and thread count */

        return 8;
}
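/* A sketch of the arithmetic implied by the heuristic: work registers and
 * pushed uniforms share a budget of 24 vec4 slots (promoted_count = 24 -
 * work_count in midgard_promote_uniforms below), so the two possible outcomes
 * are:
 *
 *    work_count = 16  ->  promoted_count = 8  pushed vec4s
 *    work_count = 8   ->  promoted_count = 16 pushed vec4s
 *
 * The vec4 pushed at word address 4*i of the push table is later read back
 * from the register-mapped uniform SSA_FIXED_REGISTER(23 - i). */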
/* Bitset of indices that will be used as a special register -- inputs to a
 * non-ALU op. We precompute this set so that testing is efficient, otherwise
 * we end up with O(mn) behaviour for n instructions and m uniform reads */

static BITSET_WORD *
mir_special_indices(compiler_context *ctx)
{
        mir_compute_temp_count(ctx);
        BITSET_WORD *bset = calloc(BITSET_WORDS(ctx->temp_count), sizeof(BITSET_WORD));

        mir_foreach_instr_global(ctx, ins) {
                /* Look for special instructions */
                bool is_ldst = ins->type == TAG_LOAD_STORE_4;
                bool is_tex = ins->type == TAG_TEXTURE_4;
                bool is_writeout = ins->compact_branch && ins->writeout;

                if (!(is_ldst || is_tex || is_writeout))
                        continue;

                /* Anything read by a special instruction is itself special */
                mir_foreach_src(ins, i) {
                        unsigned idx = ins->src[i];

                        if (idx < ctx->temp_count)
                                BITSET_SET(bset, idx);
                }
        }

        return bset;
}

void
midgard_promote_uniforms(compiler_context *ctx)
{
        if (ctx->inputs->no_ubo_to_push) {
                /* If nothing is pushed, all UBOs need to be uploaded
                 * conventionally */
                ctx->ubo_mask = ~0;
                return;
        }

        struct mir_ubo_analysis analysis = mir_analyze_ranges(ctx);

        unsigned work_count = mir_work_heuristic(ctx, &analysis);
        unsigned promoted_count = 24 - work_count;

        /* Ensure we are 16 byte aligned to avoid underallocations */
        mir_pick_ubo(&ctx->info->push, &analysis, promoted_count);
        ctx->info->push.count = ALIGN_POT(ctx->info->push.count, 4);

        /* First, figure out special indices a priori so we don't recompute a lot */
        BITSET_WORD *special = mir_special_indices(ctx);

        ctx->ubo_mask = 0;

        mir_foreach_instr_global_safe(ctx, ins) {
                if (!mir_is_ubo(ins)) continue;

                unsigned ubo = midgard_unpack_ubo_index_imm(ins->load_store);
                unsigned qword = ins->constants.u32[0] / 16;

                if (!mir_is_direct_aligned_ubo(ins)) {
                        if (ins->src[1] == ~0)
                                ctx->ubo_mask |= BITSET_BIT(ubo);
                        else
                                ctx->ubo_mask = ~0;

                        continue;
                }

                /* Check if we decided to push this */
                assert(ubo < analysis.nr_blocks);
                if (!BITSET_TEST(analysis.blocks[ubo].pushed, qword)) {
                        ctx->ubo_mask |= BITSET_BIT(ubo);
                        continue;
                }

                /* Find where we pushed to, TODO: unaligned pushes to pack */
                unsigned base = pan_lookup_pushed_ubo(&ctx->info->push, ubo, qword * 16);
                assert((base & 0x3) == 0);

                unsigned address = base / 4;
                unsigned uniform_reg = 23 - address;

                /* Should've been taken into account when pushing */
                assert(address < promoted_count);
                unsigned promoted = SSA_FIXED_REGISTER(uniform_reg);

                /* We do need the move for safety for a non-SSA dest, or if
                 * we're being fed into a special class */

                bool needs_move = ins->dest & PAN_IS_REG || ins->dest == ctx->blend_src1;

                if (ins->dest < ctx->temp_count)
                        needs_move |= BITSET_TEST(special, ins->dest);

                if (needs_move) {
                        unsigned type_size = nir_alu_type_get_type_size(ins->dest_type);
                        midgard_instruction mov = v_mov(promoted, ins->dest);
                        mov.dest_type = nir_type_uint | type_size;
                        mov.src_types[1] = mov.dest_type;

                        uint16_t rounded = mir_round_bytemask_up(mir_bytemask(ins), type_size);
                        mir_set_bytemask(&mov, rounded);
                        mir_insert_instruction_before(ctx, ins, mov);
                } else {
                        mir_rewrite_index_src(ctx, ins->dest, promoted);
                }

                mir_remove_instruction(ins);
        }

        free(special);
        free(analysis.blocks);
}
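#if 0
/* Debugging aid in the spirit of mir_dump_ubo_analysis above: dump the final
 * push table after picking. Illustrative only (hence #if 0); the register
 * naming assumes the 23 - (word / 4) mapping used when rewriting loads. */
static void
mir_dump_push_words(struct panfrost_ubo_push *push)
{
        printf("%u pushed words\n", push->count);

        for (unsigned i = 0; i < push->count; ++i) {
                printf("\tword %u: r%u.%c <- ubo %u, offset %u\n",
                       i, 23 - (i / 4), "xyzw"[i % 4],
                       (unsigned) push->words[i].ubo,
                       (unsigned) push->words[i].offset);
        }
}
#endif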