Path: blob/21.2-virgl/src/freedreno/ir3/ir3_postsched.c
/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...)                                                            \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         printf("PSCHED: " fmt "\n", ##__VA_ARGS__);                           \
      }                                                                        \
   } while (0)

#define di(instr, fmt, ...)                                                    \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         printf("PSCHED: " fmt ": ", ##__VA_ARGS__);                           \
         ir3_print_instr(instr);                                               \
      }                                                                        \
   } while (0)

/*
 * Post RA Instruction Scheduling
 */

struct ir3_postsched_ctx {
   struct ir3 *ir;

   struct ir3_shader_variant *v;

   void *mem_ctx;
   struct ir3_block *block; /* the current block */
   struct dag *dag;

   struct list_head unscheduled_list; /* unscheduled instructions */

   int sfu_delay;
   int tex_delay;
};

struct ir3_postsched_node {
   struct dag_node dag; /* must be first for util_dynarray_foreach */
   struct ir3_instruction *instr;
   bool partially_evaluated_path;

   bool has_tex_src, has_sfu_src;

   unsigned delay;
   unsigned max_delay;
};

#define foreach_sched_node(__n, __list)                                        \
   list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link)

static bool
has_tex_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_tex_src;
}

static bool
has_sfu_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_sfu_src;
}

static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   debug_assert(ctx->block == instr->block);

   /* remove from unscheduled_list:
    */
   list_delinit(&instr->node);

   di(instr, "schedule");

   list_addtail(&instr->node, &instr->block->instr_list);

   struct ir3_postsched_node *n = instr->data;
   dag_prune_head(ctx->dag, &n->dag);

   if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
      return;

   if (is_sfu(instr)) {
      ctx->sfu_delay = 8;
   } else if (has_sfu_src(instr)) {
      ctx->sfu_delay = 0;
   } else if (ctx->sfu_delay > 0) {
      ctx->sfu_delay--;
   }

   if (is_tex_or_prefetch(instr)) {
      ctx->tex_delay = 10;
   } else if (has_tex_src(instr)) {
      ctx->tex_delay = 0;
   } else if (ctx->tex_delay > 0) {
      ctx->tex_delay--;
   }
}
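/* Added note (editorial, not from the original source): the sfu_delay /
 * tex_delay counters updated above are a cheap proxy for how much of a
 * producer's latency has already elapsed.  A hypothetical sequence:
 *
 *    rcp r0.x, r1.x      ; SFU result       -> sfu_delay = 8
 *    add r2.x, ...       ; unrelated        -> sfu_delay = 7
 *    mul r2.y, ...       ; unrelated        -> sfu_delay = 6
 *    (ss)mad ..., r0.x   ; consumes SFU     -> sfu_delay = 0
 *
 * The more unrelated instructions land between producer and consumer,
 * the cheaper the eventual (ss) sync becomes; once a consumer has been
 * scheduled the counter resets, since the sync has already been paid.
 */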
static void
dump_state(struct ir3_postsched_ctx *ctx)
{
   if (!SCHED_DEBUG)
      return;

   foreach_sched_node (n, &ctx->dag->heads) {
      di(n->instr, "maxdel=%3d ", n->max_delay);

      util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
         struct ir3_postsched_node *child =
            (struct ir3_postsched_node *)edge->child;

         di(child->instr, " -> (%d parents) ", child->dag.parent_count);
      }
   }
}

/* Determine if this is an instruction that we'd prefer not to schedule
 * yet, in order to avoid an (ss) sync.  This is limited by the sfu_delay /
 * tex_delay counters, ie. the more cycles it has been since the last SFU
 * or tex fetch, the less costly a sync would be.
 */
static bool
would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   if (ctx->sfu_delay) {
      if (has_sfu_src(instr))
         return true;
   }

   if (ctx->tex_delay) {
      if (has_tex_src(instr))
         return true;
   }

   return false;
}
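/* Added summary (editorial): choose_instr() below scans the DAG heads
 * repeatedly, in decreasing priority order: meta instructions, inputs,
 * kills, expensive sfu/tex instructions, then progressively weaker
 * "ready" criteria (soft-ready without forcing a sync, soft-ready,
 * hard-ready), finally falling back to whichever leader has the largest
 * max_delay, ie. the longest remaining critical path.
 */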
/* find instruction to schedule: */
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_node *chosen = NULL;

   dump_state(ctx);

   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_meta(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (meta)");
      return chosen->instr;
   }

   /* Try to schedule inputs with a higher priority, if possible, as
    * the last bary.f unlocks varying storage to unblock more VS
    * warps.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_input(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (input)");
      return chosen->instr;
   }

   /* Next prioritize discards: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!is_kill_or_demote(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (kill, hard ready)");
      return chosen->instr;
   }

   /* Next prioritize expensive instructions: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!(is_sfu(n->instr) || is_tex(n->instr)))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
      return chosen->instr;
   }

   /*
    * Sometimes it is better to take a nop, rather than scheduling an
    * instruction that would require an (ss) shortly after another
    * SFU..  ie. if the last SFU was just one or two instructions ago,
    * and we could choose between taking a nop and then scheduling
    * something else, vs scheduling the immediately available instruction
    * that would require (ss), we are better off with the nop.
    */
   for (unsigned delay = 0; delay < 4; delay++) {
      foreach_sched_node (n, &ctx->dag->heads) {
         if (would_sync(ctx, n->instr))
            continue;

         unsigned d = ir3_delay_calc_postra(ctx->block, n->instr, true,
                                            ctx->v->mergedregs);

         if (d > delay)
            continue;

         if (!chosen || (chosen->max_delay < n->max_delay))
            chosen = n;
      }

      if (chosen) {
         di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
         return chosen->instr;
      }
   }

   /* Next try to find a ready leader w/ soft delay (ie. including extra
    * delay for things like tex fetch which can be synchronized w/ sync
    * bit, but we probably do want to schedule some other instructions
    * while we wait):
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (soft ready)");
      return chosen->instr;
   }

   /* Next try to find a ready leader that can be scheduled without nop's,
    * which in the case of things that need (sy)/(ss) could result in
    * stalls..  but we've already decided there is not a better option.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (hard ready)");
      return chosen->instr;
   }

   /* Otherwise choose leader with maximum cost:
    *
    * TODO should we try to balance cost and delays?  I guess it is
    * a balance between now-nop's and future-nop's?
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (leader)");
      return chosen->instr;
   }

   return NULL;
}
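/* Added summary (editorial): the dependency table below is walked twice
 * by calculate_deps() -- once forward and once in reverse.  As I read
 * it, the forward pass adds read-after-write and write-after-write
 * edges, while the reverse pass (same code, opposite iteration order)
 * catches write-after-read hazards, so every pair of conflicting
 * register accesses ends up ordered in the DAG.
 */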
struct ir3_postsched_deps_state {
   struct ir3_postsched_ctx *ctx;

   enum { F, R } direction;

   bool merged;

   /* Track the mapping between sched node (instruction) that last
    * wrote a given register (in whichever direction we are iterating
    * the block)
    *
    * Note, this table is twice as big as the # of regs, to deal with
    * half-precision regs.  The approach differs depending on whether
    * the half and full precision register files are "merged" (conflicting,
    * ie. a6xx+), in which case we consider each full precision dep
    * as two half-precision dependencies, vs the older separate (non-
    * conflicting) files, in which case the first half of the table is
    * used for full precision and the 2nd half for half-precision.
    */
   struct ir3_postsched_node *regs[2 * 256];
};

/* bounds checking read/write accessors, since OoB access to stuff on
 * the stack is gonna cause a bad day.
 */
#define dep_reg(state, idx)                                                    \
   *({                                                                         \
      assert((idx) < ARRAY_SIZE((state)->regs));                               \
      &(state)->regs[(idx)];                                                   \
   })

static void
add_dep(struct ir3_postsched_deps_state *state,
        struct ir3_postsched_node *before, struct ir3_postsched_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->direction == F) {
      dag_add_edge(&before->dag, &after->dag, NULL);
   } else {
      dag_add_edge(&after->dag, &before->dag, NULL);
   }
}

static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
                   struct ir3_postsched_node *node, unsigned num, int src_n)
{
   struct ir3_postsched_node *dep = dep_reg(state, num);

   if (src_n >= 0 && dep && state->direction == F) {
      unsigned d = ir3_delayslots(dep->instr, node->instr, src_n, true);
      node->delay = MAX2(node->delay, d);
      if (is_tex_or_prefetch(dep->instr))
         node->has_tex_src = true;
      if (is_sfu(dep->instr))
         node->has_sfu_src = true;
   }

   add_dep(state, dep, node);
   if (src_n < 0) {
      dep_reg(state, num) = node;
   }
}
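/* Added illustration (editorial, hypothetical register numbers): in
 * merged mode (a6xx+) a full register num occupies table slots 2*num
 * and 2*num+1, while a half register num maps to slot num directly.
 * So a write to full r0.y (num=1) marks slots 2 and 3, and therefore
 * conflicts with reads of half hr0.z (num=2) and hr0.w (num=3),
 * matching the hw aliasing of the merged register file.  In the older
 * non-merged case the two files cannot conflict, so half regs simply
 * live in the second half of the table.
 */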
/* This is where we handle full vs half-precision, and potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 *
 * If src_n is non-negative, this adds a dependency on a source register,
 * and src_n is the index passed into ir3_delayslots() for calculating
 * the delay (it corresponds to node->instr->regs[src_n]).  If src_n is
 * negative, then this is for a destination register.
 */
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
            struct ir3_postsched_node *node, const struct ir3_register *reg,
            unsigned num, int src_n)
{
   if (state->merged) {
      /* Make sure that special registers like a0.x that are written as
       * half-registers don't alias random full registers by pretending that
       * they're full registers:
       */
      if ((reg->flags & IR3_REG_HALF) && !is_reg_special(reg)) {
         /* single conflict in half-reg space: */
         add_single_reg_dep(state, node, num, src_n);
      } else {
         /* two conflicts in half-reg space: */
         add_single_reg_dep(state, node, 2 * num + 0, src_n);
         add_single_reg_dep(state, node, 2 * num + 1, src_n);
      }
   } else {
      if (reg->flags & IR3_REG_HALF)
         num += ARRAY_SIZE(state->regs) / 2;
      add_single_reg_dep(state, node, num, src_n);
   }
}

static void
calculate_deps(struct ir3_postsched_deps_state *state,
               struct ir3_postsched_node *node)
{
   /* Add dependencies on instructions that previously (or next,
    * in the reverse direction) wrote any of our src registers:
    */
   foreach_src_n (reg, i, node->instr) {
      if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
         continue;

      if (reg->flags & IR3_REG_RELATIV) {
         /* mark entire array as read: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, i);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, i);
         }
      }
   }

   /* And then afterwards update the state for what this instruction
    * wrote:
    */
   foreach_dst (reg, node->instr) {
      if (reg->wrmask == 0)
         continue;
      if (reg->flags & IR3_REG_RELATIV) {
         /* mark the entire array as written: */
         for (unsigned i = 0; i < reg->size; i++) {
            add_reg_dep(state, node, reg, reg->array.base + i, -1);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, -1);
         }
      }
   }
}

static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = F,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = R,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr_rev (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_postsched_node *n =
      rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

   dag_init_node(ctx->dag, &n->dag);

   n->instr = instr;
   instr->data = n;
}

static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      struct ir3_postsched_node *child =
         (struct ir3_postsched_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}
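/* Added note (editorial): sched_dag_max_delay_cb above is run bottom-up
 * over the DAG (see sched_dag_init() below), so max_delay accumulates
 * into an estimate of the critical-path length from each node to the
 * end of the block.  choose_instr() uses it as the main tiebreak,
 * scheduling the longest remaining chain first.
 */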
static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
   ctx->mem_ctx = ralloc_context(NULL);

   ctx->dag = dag_create(ctx->mem_ctx);

   foreach_instr (instr, &ctx->unscheduled_list)
      sched_node_init(ctx, instr);

   calculate_forward_deps(ctx);
   calculate_reverse_deps(ctx);

   /*
    * To prevent expensive texture fetches, etc, from being moved ahead
    * of kills, track the kills we've seen so far, so we can add an
    * extra dependency on them for tex/mem instructions
    */
   struct util_dynarray kills;
   util_dynarray_init(&kills, ctx->mem_ctx);

   /* The last bary.f with the (ei) flag must be scheduled before any kills,
    * or the hw gets angry.  Keep track of inputs here so we can add the
    * false dep on the kill instruction.
    */
   struct util_dynarray inputs;
   util_dynarray_init(&inputs, ctx->mem_ctx);

   /*
    * Normal srcs won't be in SSA at this point, those are dealt with in
    * calculate_forward_deps() and calculate_reverse_deps().  But we still
    * have the false-dep information in SSA form, so go ahead and add
    * dependencies for that here:
    */
   foreach_instr (instr, &ctx->unscheduled_list) {
      struct ir3_postsched_node *n = instr->data;

      foreach_ssa_src_n (src, i, instr) {
         /* don't consider dependencies in other blocks: */
         if (src->block != instr->block)
            continue;

         /* we can end up with unused false-deps.. just skip them: */
         if (src->flags & IR3_INSTR_UNUSED)
            continue;

         struct ir3_postsched_node *sn = src->data;

         dag_add_edge(&sn->dag, &n->dag, NULL);
      }

      if (is_input(instr)) {
         util_dynarray_append(&inputs, struct ir3_instruction *, instr);
      } else if (is_kill_or_demote(instr)) {
         util_dynarray_foreach (&inputs, struct ir3_instruction *, instrp) {
            struct ir3_instruction *input = *instrp;
            struct ir3_postsched_node *in = input->data;
            dag_add_edge(&in->dag, &n->dag, NULL);
         }
         util_dynarray_append(&kills, struct ir3_instruction *, instr);
      } else if (is_tex(instr) || is_mem(instr)) {
         util_dynarray_foreach (&kills, struct ir3_instruction *, instrp) {
            struct ir3_instruction *kill = *instrp;
            struct ir3_postsched_node *kn = kill->data;
            dag_add_edge(&kn->dag, &n->dag, NULL);
         }
      }
   }

   // TODO do we want to do this after reverse-dependencies?
   dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
   ralloc_free(ctx->mem_ctx);
   ctx->mem_ctx = NULL;
   ctx->dag = NULL;
}
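/* Added overview (editorial): sched_block() below is the per-block
 * driver: move everything onto unscheduled_list, drop pre-existing
 * nop's and branches, build the DAG, schedule inputs and then tex
 * prefetches up front, and finally loop choosing the best ready
 * instruction, padding with nop's whenever the chosen instruction
 * still has outstanding delay cycles.
 */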
static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;
   ctx->tex_delay = 0;
   ctx->sfu_delay = 0;

   /* move all instructions to the unscheduled list, and
    * empty the block's instruction list (to which we will
    * be inserting).
    */
   list_replace(&block->instr_list, &ctx->unscheduled_list);
   list_inithead(&block->instr_list);

   // TODO once we are using post-sched for everything we can
   // just not stick in NOP's prior to post-sched, and drop this.
   // for now keep this, since it makes post-sched optional:
   foreach_instr_safe (instr, &ctx->unscheduled_list) {
      switch (instr->opc) {
      case OPC_NOP:
      case OPC_B:
      case OPC_JUMP:
         list_delinit(&instr->node);
         break;
      default:
         break;
      }
   }

   sched_dag_init(ctx);

   /* First schedule all meta:input instructions, followed by
    * tex-prefetch.  We want all of the instructions that load
    * values into registers before the shader starts to go
    * before any other instructions.  But in particular we
    * want inputs to come before prefetches.  This is because
    * a FS's bary_ij input may not actually be live in the
    * shader, but it should not be scheduled on top of any
    * other input (but can be overwritten by a tex prefetch)
    */
   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_INPUT)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_TEX_PREFETCH)
         schedule(ctx, instr);

   while (!list_is_empty(&ctx->unscheduled_list)) {
      struct ir3_instruction *instr = choose_instr(ctx);

      unsigned delay =
         ir3_delay_calc_postra(ctx->block, instr, false, ctx->v->mergedregs);
      d("delay=%u", delay);

      /* and if we run out of instructions that can be scheduled,
       * then it is time for nop's:
       */
      debug_assert(delay <= 6);
      while (delay > 0) {
         ir3_NOP(block);
         delay--;
      }

      schedule(ctx, instr);
   }

   sched_dag_destroy(ctx);
}

static bool
is_self_mov(struct ir3_instruction *instr)
{
   if (!is_same_type_mov(instr))
      return false;

   if (instr->dsts[0]->num != instr->srcs[0]->num)
      return false;

   if (instr->dsts[0]->flags & IR3_REG_RELATIV)
      return false;

   if (instr->cat1.round != ROUND_ZERO)
      return false;

   if (instr->srcs[0]->flags &
       (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_FNEG |
        IR3_REG_FABS | IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT))
      return false;

   return true;
}

/* sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y,
 * as a result of places where before RA we are not sure that it is
 * safe to eliminate them.  We could eliminate these earlier, but
 * sometimes they are tangled up in false-dep's, etc, so it is easier
 * just to let them exist until after RA
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->deps_count; i++) {
            if (instr->deps[i] && is_self_mov(instr->deps[i])) {
               instr->deps[i] = NULL;
            }
         }

         if (is_self_mov(instr))
            list_delinit(&instr->node);
      }
   }
}

bool
ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
{
   struct ir3_postsched_ctx ctx = {
      .ir = ir,
      .v = v,
   };

   ir3_remove_nops(ir);
   cleanup_self_movs(ir);

   foreach_block (block, &ir->block_list) {
      sched_block(&ctx, block);
   }

   return true;
}