Path: blob/21.2-virgl/src/freedreno/ir3/ir3_lower_parallelcopy.c
/*
 * Copyright (C) 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_ra.h"
#include "ir3_shader.h"

struct copy_src {
   unsigned flags;
   union {
      uint32_t imm;
      physreg_t reg;
      unsigned const_num;
   };
};

struct copy_entry {
   physreg_t dst;
   unsigned flags;
   bool done;

   struct copy_src src;
};

static unsigned
copy_entry_size(const struct copy_entry *entry)
{
   return (entry->flags & IR3_REG_HALF) ? 1 : 2;
}

static struct copy_src
get_copy_src(const struct ir3_register *reg, unsigned offset)
{
   if (reg->flags & IR3_REG_IMMED) {
      return (struct copy_src){
         .flags = IR3_REG_IMMED,
         .imm = reg->uim_val,
      };
   } else if (reg->flags & IR3_REG_CONST) {
      return (struct copy_src){
         .flags = IR3_REG_CONST,
         .const_num = reg->num,
      };
   } else {
      return (struct copy_src){
         .flags = 0,
         .reg = ra_reg_get_physreg(reg) + offset,
      };
   }
}

static void
do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num,
       unsigned src2_num, unsigned flags)
{
   struct ir3_instruction *xor
      = ir3_instr_create(instr->block, OPC_XOR_B, 1, 2);
   ir3_dst_create(xor, dst_num, flags);
   ir3_src_create(xor, src1_num, flags);
   ir3_src_create(xor, src2_num, flags);

   ir3_instr_move_before(xor, instr);
}

static void
do_swap(struct ir3_compiler *compiler, struct ir3_instruction *instr,
        const struct copy_entry *entry)
{
   assert(!entry->src.flags);

   if (entry->flags & IR3_REG_HALF) {
      /* We currently make sure to never emit parallel copies where the
       * source/destination is a half-reg above the range accessible to half
       * registers. However, when a full-reg source overlaps a half-reg
       * destination or vice versa, it can be very, very complicated to come
       * up with a series of "legal" swaps and copies to resolve the
       * parallel copy. So here we provide a fallback to implement the
       * "illegal" swap instead. This may also be useful for implementing
       * "spilling" half-regs to the inaccessible space.
       */
      if (entry->src.reg >= RA_HALF_SIZE) {
         /* Choose a temporary that doesn't overlap src or dst */
         physreg_t tmp = entry->dst < 2 ? 2 : 0;
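         /* Illustrative sketch of the three swaps below (physreg numbers
          * chosen for illustration only): with src = RA_HALF_SIZE + 9 and
          * dst = 4, we get tmp = 0 and emit:
          *
          *   1. full-swap {RA_HALF_SIZE+8, RA_HALF_SIZE+9} <-> {0, 1}
          *   2. half-swap physreg 1 (= tmp + (src & 1)) <-> physreg 4
          *   3. full-swap {RA_HALF_SIZE+8, RA_HALF_SIZE+9} <-> {0, 1}
          *
          * Only the halves at src and dst end up exchanged; src's neighbor
          * and the old contents of tmp are restored by the final swap.
          */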
         /* Swap src and the temporary */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->src.reg & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });

         /* Do the original swap with src replaced with tmp */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = tmp + (entry->src.reg & 1)},
                    .dst = entry->dst,
                    .flags = entry->flags,
                 });

         /* Swap src and the temporary back */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->src.reg & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });
         return;
      }

      /* If dst is not addressable, we only need to swap the arguments and
       * let the case above handle it.
       */
      if (entry->dst >= RA_HALF_SIZE) {
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst},
                    .dst = entry->src.reg,
                    .flags = entry->flags,
                 });
         return;
      }
   }

   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

   /* a5xx+ is known to support swz, which enables us to swap two registers
    * in-place. If unsupported we emulate it using the xor trick.
    */
   if (compiler->gpu_id < 500) {
      /* Shared regs only exist since a5xx, so we don't have to provide a
       * fallback path for them.
       */
      assert(!(entry->flags & IR3_REG_SHARED));
      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
      do_xor(instr, src_num, src_num, dst_num, entry->flags);
      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
   } else {
      /* Use a macro for shared regs because any shared reg writes need to
       * be wrapped in a getone block to work correctly. Writing shared regs
       * with multiple threads active does not work, even if they all return
       * the same value.
       */
      unsigned opc =
         (entry->flags & IR3_REG_SHARED) ? OPC_SWZ_SHARED_MACRO : OPC_SWZ;
      struct ir3_instruction *swz = ir3_instr_create(instr->block, opc, 2, 2);
      ir3_dst_create(swz, dst_num, entry->flags);
      ir3_dst_create(swz, src_num, entry->flags);
      ir3_src_create(swz, src_num, entry->flags);
      ir3_src_create(swz, dst_num, entry->flags);
      swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      swz->repeat = 1;
      ir3_instr_move_before(swz, instr);
   }
}

static void
do_copy(struct ir3_compiler *compiler, struct ir3_instruction *instr,
        const struct copy_entry *entry)
{
   if (entry->flags & IR3_REG_HALF) {
      /* See do_swap() for why this is here. */
      if (entry->dst >= RA_HALF_SIZE) {
         /* TODO: is there a hw instruction we can use for this case? */
         physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;
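         /* Same trick as the unaddressable-src case in do_swap(): swap the
          * full reg containing dst down into addressable space, copy into
          * it there, then swap it back up. The tmp selection above avoids
          * clobbering src when src is a register in physregs {0, 1}.
          * Illustrative sketch (physreg numbers are examples only): with
          * dst = RA_HALF_SIZE + 6 and src = 10, tmp = 0 and we emit:
          *
          *   1. full-swap {RA_HALF_SIZE+6, RA_HALF_SIZE+7} <-> {0, 1}
          *   2. half-copy physreg 10 -> physreg 0 (= tmp + (dst & 1))
          *   3. full-swap back, carrying the copied value up to dst
          *
          * dst's neighboring half is preserved across the round trip.
          */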
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });

         do_copy(compiler, instr,
                 &(struct copy_entry){
                    .src = entry->src,
                    .dst = tmp + (entry->dst & 1),
                    .flags = entry->flags,
                 });

         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });
         return;
      }

      if (!entry->src.flags && entry->src.reg >= RA_HALF_SIZE) {
         unsigned src_num = ra_physreg_to_num(entry->src.reg & ~1u,
                                              entry->flags & ~IR3_REG_HALF);
         unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

         if (entry->src.reg % 2 == 0) {
            /* cov.u32u16 dst, src */
            struct ir3_instruction *cov =
               ir3_instr_create(instr->block, OPC_MOV, 1, 1);
            ir3_dst_create(cov, dst_num, entry->flags);
            ir3_src_create(cov, src_num, entry->flags & ~IR3_REG_HALF);
            cov->cat1.dst_type = TYPE_U16;
            cov->cat1.src_type = TYPE_U32;
            ir3_instr_move_before(cov, instr);
         } else {
            /* shr.b dst, src, h(16) */
            struct ir3_instruction *shr =
               ir3_instr_create(instr->block, OPC_SHR_B, 1, 2);
            ir3_dst_create(shr, dst_num, entry->flags);
            ir3_src_create(shr, src_num, entry->flags & ~IR3_REG_HALF);
            ir3_src_create(shr, 0, entry->flags | IR3_REG_IMMED)->uim_val = 16;
            ir3_instr_move_before(shr, instr);
         }
         return;
      }
   }

   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

   /* Similar to the swap case, we have to use a macro for shared regs. */
   unsigned opc =
      (entry->flags & IR3_REG_SHARED) ? OPC_READ_FIRST_MACRO : OPC_MOV;
   struct ir3_instruction *mov = ir3_instr_create(instr->block, opc, 1, 1);
   ir3_dst_create(mov, dst_num, entry->flags);
   ir3_src_create(mov, src_num, entry->flags | entry->src.flags);
   mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   if (entry->src.flags & IR3_REG_IMMED)
      mov->srcs[0]->uim_val = entry->src.imm;
   else if (entry->src.flags & IR3_REG_CONST)
      mov->srcs[0]->num = entry->src.const_num;
   ir3_instr_move_before(mov, instr);
}

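/* The resolver below treats a parallel copy as a graph whose nodes are
 * physregs and whose edges are the pending copy entries. For example
 * (register names purely illustrative):
 *
 *    {r0 <- r1, r1 <- r2}  is a path: emitting "mov r0, r1" unblocks r1,
 *                          after which "mov r1, r2" can be emitted too.
 *    {r0 <- r1, r1 <- r0}  is a cycle: no ordering of plain movs works,
 *                          so it has to be broken with a swap.
 *
 * copy_ctx is the bookkeeping for this: how many pending copies still read
 * each physreg, and which pending copy (if any) writes it.
 */
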
struct copy_ctx {
   /* For each physreg, the number of pending copy entries that use it as a
    * source. Once this drops to zero, then the physreg is unblocked and can
    * be moved to.
    */
   unsigned physreg_use_count[RA_MAX_FILE_SIZE];

   /* For each physreg, the pending copy_entry that uses it as a dest. */
   struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE];

   struct copy_entry entries[RA_MAX_FILE_SIZE];
   unsigned entry_count;
};

static bool
entry_blocked(struct copy_entry *entry, struct copy_ctx *ctx)
{
   for (unsigned i = 0; i < copy_entry_size(entry); i++) {
      if (ctx->physreg_use_count[entry->dst + i] != 0)
         return true;
   }

   return false;
}

static void
split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry)
{
   assert(!entry->done);
   assert(!(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST)));
   assert(copy_entry_size(entry) == 2);
   struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++];

   new_entry->dst = entry->dst + 1;
   new_entry->src.flags = entry->src.flags;
   new_entry->src.reg = entry->src.reg + 1;
   new_entry->done = false;
   entry->flags |= IR3_REG_HALF;
   new_entry->flags = entry->flags;
   ctx->physreg_dst[entry->dst + 1] = new_entry;
}

static void
_handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr,
               struct copy_ctx *ctx)
{
   /* Set up the bookkeeping */
   memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct copy_entry *entry = &ctx->entries[i];
      for (unsigned j = 0; j < copy_entry_size(entry); j++) {
         if (!entry->src.flags)
            ctx->physreg_use_count[entry->src.reg + j]++;

         /* Copies should not have overlapping destinations. */
         assert(!ctx->physreg_dst[entry->dst + j]);
         ctx->physreg_dst[entry->dst + j] = entry;
      }
   }

   bool progress = true;
   while (progress) {
      progress = false;

      /* Step 1: resolve paths in the transfer graph. This means finding
       * copies whose destinations aren't blocked by something else and then
       * emitting them, continuing this process until every copy is blocked
       * and there are only cycles left.
       *
       * TODO: We should note that src is also available in dst to unblock
       * cycles that src is involved in.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct copy_entry *entry = &ctx->entries[i];
         if (!entry->done && !entry_blocked(entry, ctx)) {
            entry->done = true;
            progress = true;
            do_copy(compiler, instr, entry);
            for (unsigned j = 0; j < copy_entry_size(entry); j++) {
               if (!entry->src.flags)
                  ctx->physreg_use_count[entry->src.reg + j]--;
               ctx->physreg_dst[entry->dst + j] = NULL;
            }
         }
      }

      if (progress)
         continue;

      /* Step 2: find partially blocked copies and split them. In the
       * mergedregs case, we can have 32-bit copies which are only blocked
       * on one 16-bit half, and splitting them helps get things moving.
       *
       * We can skip splitting copies if the source isn't a register,
       * however, because it does not unblock anything and therefore doesn't
       * contribute to making forward progress with step 1. These copies
       * should still be resolved eventually in step 1 because they can't be
       * part of a cycle.
       */
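      /* E.g. (illustrative physregs, mergedregs): a full copy {2,3} <- {4,5}
       * blocked only because another copy still reads physreg 2 is split
       * into the half copies 2 <- 4 and 3 <- 5; the latter is unblocked, so
       * step 1 can emit it on the next iteration and release physreg 5.
       */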
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct copy_entry *entry = &ctx->entries[i];
         if (entry->done || entry->flags & IR3_REG_HALF)
            continue;

         if ((ctx->physreg_use_count[entry->dst] == 0 ||
              ctx->physreg_use_count[entry->dst + 1] == 0) &&
             !(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
            split_32bit_copy(ctx, entry);
            progress = true;
         }
      }
   }

   /* Step 3: resolve cycles through swapping.
    *
    * At this point, the transfer graph should consist of only cycles.
    * The reason is that, given any physreg n_1 that's the source of a
    * remaining entry, it has a destination n_2, which (because every
    * copy is blocked) is the source of some other copy whose destination
    * is n_3, and so we can follow the chain until we get a cycle. If we
    * reached some node other than n_1:
    *
    *    n_1 -> n_2 -> ... -> n_i
    *            ^             |
    *            |-------------|
    *
    * then n_2 would be the destination of 2 copies, which is illegal
    * (checked above in an assert). So n_1 must be part of a cycle:
    *
    *    n_1 -> n_2 -> ... -> n_i
    *    ^                     |
    *    |---------------------|
    *
    * and this must be the only cycle n_1 is involved in, because any other
    * path starting from n_1 would also have to end in n_1, resulting in
    * a node somewhere along the way being the destination of 2 copies
    * when the 2 paths merge.
    *
    * The way we resolve the cycle is through picking a copy (n_1, n_2)
    * and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
    * out of the cycle:
    *
    *    n_1 -> ... -> n_i
    *    ^              |
    *    |--------------|
    *
    * and we can keep repeating this until the cycle is empty.
    */
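   /* Worked example (register names illustrative): for the cycle
    * {r0 <- r1, r1 <- r2, r2 <- r0}, the loop below first emits
    * swap(r0, r1) and rewrites the remaining copy "r2 <- r0" into
    * "r2 <- r1", leaving the smaller cycle {r1 <- r2, r2 <- r1}. One more
    * swap reduces that to the trivial copy "r2 <- r2", which is skipped.
    */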
   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct copy_entry *entry = &ctx->entries[i];
      if (entry->done)
         continue;

      assert(!entry->src.flags);

      /* catch trivial copies */
      if (entry->dst == entry->src.reg) {
         entry->done = true;
         continue;
      }

      do_swap(compiler, instr, entry);

      /* Split any blocking copies whose sources are only partially
       * contained within our destination.
       */
      if (entry->flags & IR3_REG_HALF) {
         for (unsigned j = 0; j < ctx->entry_count; j++) {
            struct copy_entry *blocking = &ctx->entries[j];

            if (blocking->done)
               continue;

            if (blocking->src.reg <= entry->dst &&
                blocking->src.reg + 1 >= entry->dst &&
                !(blocking->flags & IR3_REG_HALF)) {
               split_32bit_copy(ctx, blocking);
            }
         }
      }

      /* Update sources of blocking copies.
       *
       * Note: at this point, every blocking copy's source should be
       * contained within our destination.
       */
      for (unsigned j = 0; j < ctx->entry_count; j++) {
         struct copy_entry *blocking = &ctx->entries[j];
         if (blocking->src.reg >= entry->dst &&
             blocking->src.reg < entry->dst + copy_entry_size(entry)) {
            blocking->src.reg =
               entry->src.reg + (blocking->src.reg - entry->dst);
         }
      }
   }
}

static void
handle_copies(struct ir3_shader_variant *v, struct ir3_instruction *instr,
              struct copy_entry *entries, unsigned entry_count)
{
   struct copy_ctx ctx;

   /* handle shared copies first */
   ctx.entry_count = 0;
   for (unsigned i = 0; i < entry_count; i++) {
      if (entries[i].flags & IR3_REG_SHARED)
         ctx.entries[ctx.entry_count++] = entries[i];
   }
   _handle_copies(v->shader->compiler, instr, &ctx);

   if (v->mergedregs) {
      /* Half regs and full regs are in the same file, so handle everything
       * at once.
       */
      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (!(entries[i].flags & IR3_REG_SHARED))
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->shader->compiler, instr, &ctx);
   } else {
      /* There may be both half copies and full copies, so we have to split
       * them up, since the two register files don't interfere.
       */
      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (entries[i].flags & IR3_REG_HALF)
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->shader->compiler, instr, &ctx);

      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (!(entries[i].flags & (IR3_REG_HALF | IR3_REG_SHARED)))
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->shader->compiler, instr, &ctx);
   }
}

void
ir3_lower_copies(struct ir3_shader_variant *v)
{
   DECLARE_ARRAY(struct copy_entry, copies);
   copies_count = copies_sz = 0;
   copies = NULL;

   foreach_block (block, &v->ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->opc == OPC_META_PARALLEL_COPY) {
            copies_count = 0;
            for (unsigned i = 0; i < instr->dsts_count; i++) {
               struct ir3_register *dst = instr->dsts[i];
               struct ir3_register *src = instr->srcs[i];
               unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
               unsigned dst_physreg = ra_reg_get_physreg(dst);
               for (unsigned j = 0; j < reg_elems(dst); j++) {
                  array_insert(
                     NULL, copies,
                     (struct copy_entry){
                        .dst = dst_physreg + j * reg_elem_size(dst),
                        .src = get_copy_src(src, j * reg_elem_size(dst)),
                        .flags = flags,
                     });
               }
            }
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_COLLECT) {
            copies_count = 0;
            struct ir3_register *dst = instr->dsts[0];
            unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
            for (unsigned i = 0; i < instr->srcs_count; i++) {
               struct ir3_register *src = instr->srcs[i];
               array_insert(NULL, copies,
                            (struct copy_entry){
                               .dst = ra_num_to_physreg(dst->num + i, flags),
                               .src = get_copy_src(src, 0),
                               .flags = flags,
                            });
            }
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_SPLIT) {
            copies_count = 0;
            struct ir3_register *dst = instr->dsts[0];
            struct ir3_register *src = instr->srcs[0];
            unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
            array_insert(NULL, copies,
                         (struct copy_entry){
                            .dst = ra_reg_get_physreg(dst),
                            .src = get_copy_src(
                               src, instr->split.off * reg_elem_size(dst)),
                            .flags = flags,
                         });
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_PHI) {
            list_del(&instr->node);
         }
      }
   }

   if (copies)
      ralloc_free(copies);
}