Path: blob/21.2-virgl/src/gallium/drivers/vc4/vc4_qpu_schedule.c
/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file vc4_qpu_schedule.c
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
 * pick a DAG head, then put all the children that are now DAG heads into the
 * list of things to schedule.
 *
 * The goal of scheduling here is to pack pairs of operations together in a
 * single QPU instruction.
 */

#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"
#include "util/dag.h"

static bool debug;

struct schedule_node_child;

struct schedule_node {
        struct dag_node dag;
        struct list_head link;
        struct queued_qpu_inst *inst;

        /* Longest cycles + instruction_latency() of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * Cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;

        /**
         * Which uniform from uniform_data[] this instruction read, or -1 if
         * not reading a uniform.
         */
        int uniform;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

struct schedule_state {
        struct dag *dag;
        struct schedule_node *last_r[6];
        struct schedule_node *last_ra[32];
        struct schedule_node *last_rb[32];
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        struct schedule_node *last_uniforms_reset;
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};
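
/* Adds one edge of the dependency DAG.  Roughly speaking, the forward walk
 * discovers read-after-write and write-after-write ordering, while the
 * reverse walk discovers write-after-read ordering; the WAR edges are
 * flagged in the edge data so that pre_remove_head() below can drop them
 * early once the reader has been chosen.
 */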
static void
add_dep(struct schedule_state *state,
        struct schedule_node *before,
        struct schedule_node *after,
        bool write)
{
        bool write_after_read = !write && state->dir == R;
        void *edge_data = (void *)(uintptr_t)write_after_read;

        if (!before || !after)
                return;

        assert(before != after);

        if (state->dir == F)
                dag_add_edge(&before->dag, &after->dag, edge_data);
        else
                dag_add_edge(&after->dag, &before->dag, edge_data);
}

static void
add_read_dep(struct schedule_state *state,
             struct schedule_node *before,
             struct schedule_node *after)
{
        add_dep(state, before, after, false);
}

static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}

static bool
qpu_writes_r4(uint64_t inst)
{
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        switch (sig) {
        case QPU_SIG_COLOR_LOAD:
        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
        case QPU_SIG_ALPHA_MASK_LOAD:
                return true;
        default:
                return false;
        }
}

static void
process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t raddr, bool is_a)
{
        switch (raddr) {
        case QPU_R_VARY:
                add_write_dep(state, &state->last_r[5], n);
                break;

        case QPU_R_VPM:
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case QPU_R_UNIF:
                add_read_dep(state, state->last_uniforms_reset, n);
                break;

        case QPU_R_NOP:
        case QPU_R_ELEM_QPU:
        case QPU_R_XY_PIXEL_COORD:
        case QPU_R_MS_REV_FLAGS:
                break;

        default:
                if (raddr < 32) {
                        if (is_a)
                                add_read_dep(state, state->last_ra[raddr], n);
                        else
                                add_read_dep(state, state->last_rb[raddr], n);
                } else {
                        fprintf(stderr, "unknown raddr %d\n", raddr);
                        abort();
                }
                break;
        }
}

static bool
is_tmu_write(uint32_t waddr)
{
        switch (waddr) {
        case QPU_W_TMU0_S:
        case QPU_W_TMU0_T:
        case QPU_W_TMU0_R:
        case QPU_W_TMU0_B:
        case QPU_W_TMU1_S:
        case QPU_W_TMU1_T:
        case QPU_W_TMU1_R:
        case QPU_W_TMU1_B:
                return true;
        default:
                return false;
        }
}

static bool
reads_uniform(uint64_t inst)
{
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
                return false;

        return (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_UNIF ||
                (QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF &&
                 QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM) ||
                is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_ADD)) ||
                is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
}

static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
                 uint32_t mux)
{
        if (mux != QPU_MUX_A && mux != QPU_MUX_B)
                add_read_dep(state, state->last_r[mux], n);
}
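
/* For illustration (a hypothetical sequence, not from a real shader):
 *
 *     mov r2, ra3
 *     fadd ra3, r2, rb7
 *
 * The forward walk adds a RAW edge from whatever last wrote ra3 to the mov,
 * and the reverse walk adds a WAR edge from the mov to the fadd, so the
 * write to ra3 can never be scheduled above the read of it.
 */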
static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool is_add)
{
        uint64_t inst = n->inst->inst;
        bool is_a = is_add ^ ((inst & QPU_WS) != 0);

        if (waddr < 32) {
                if (is_a) {
                        add_write_dep(state, &state->last_ra[waddr], n);
                } else {
                        add_write_dep(state, &state->last_rb[waddr], n);
                }
        } else if (is_tmu_write(waddr)) {
                add_write_dep(state, &state->last_tmu_write, n);
                add_read_dep(state, state->last_uniforms_reset, n);
        } else if (qpu_waddr_is_tlb(waddr) ||
                   waddr == QPU_W_MS_FLAGS) {
                add_write_dep(state, &state->last_tlb, n);
        } else {
                switch (waddr) {
                case QPU_W_ACC0:
                case QPU_W_ACC1:
                case QPU_W_ACC2:
                case QPU_W_ACC3:
                case QPU_W_ACC5:
                        add_write_dep(state, &state->last_r[waddr - QPU_W_ACC0],
                                      n);
                        break;

                case QPU_W_VPM:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case QPU_W_VPMVCD_SETUP:
                        if (is_a)
                                add_write_dep(state, &state->last_vpm_read, n);
                        else
                                add_write_dep(state, &state->last_vpm, n);
                        break;

                case QPU_W_SFU_RECIP:
                case QPU_W_SFU_RECIPSQRT:
                case QPU_W_SFU_EXP:
                case QPU_W_SFU_LOG:
                        add_write_dep(state, &state->last_r[4], n);
                        break;

                case QPU_W_TLB_STENCIL_SETUP:
                        /* This isn't a TLB operation that does things like
                         * implicitly lock the scoreboard, but it does have to
                         * appear before TLB_Z, and each of the TLB_STENCILs
                         * have to schedule in the same order relative to each
                         * other.
                         */
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case QPU_W_MS_FLAGS:
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case QPU_W_UNIFORMS_ADDRESS:
                        add_write_dep(state, &state->last_uniforms_reset, n);
                        break;

                case QPU_W_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}

static void
process_cond_deps(struct schedule_state *state, struct schedule_node *n,
                  uint32_t cond)
{
        switch (cond) {
        case QPU_COND_NEVER:
        case QPU_COND_ALWAYS:
                break;
        default:
                add_read_dep(state, state->last_sf, n);
                break;
        }
}
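
/* Conditional ALU ops implicitly read the flags, so process_cond_deps()
 * orders them after the last instruction that set the flags with QPU_SF;
 * always- and never-executed ops skip the edge.
 */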
/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        uint64_t inst = n->inst->inst;
        uint32_t add_op = QPU_GET_FIELD(inst, QPU_OP_ADD);
        uint32_t mul_op = QPU_GET_FIELD(inst, QPU_OP_MUL);
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
        uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
        uint32_t mul_a = QPU_GET_FIELD(inst, QPU_MUL_A);
        uint32_t mul_b = QPU_GET_FIELD(inst, QPU_MUL_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        if (sig != QPU_SIG_LOAD_IMM) {
                process_raddr_deps(state, n, raddr_a, true);
                if (sig != QPU_SIG_SMALL_IMM &&
                    sig != QPU_SIG_BRANCH)
                        process_raddr_deps(state, n, raddr_b, false);
        }

        if (add_op != QPU_A_NOP) {
                process_mux_deps(state, n, add_a);
                process_mux_deps(state, n, add_b);
        }
        if (mul_op != QPU_M_NOP) {
                process_mux_deps(state, n, mul_a);
                process_mux_deps(state, n, mul_b);
        }

        process_waddr_deps(state, n, waddr_add, true);
        process_waddr_deps(state, n, waddr_mul, false);
        if (qpu_writes_r4(inst))
                add_write_dep(state, &state->last_r[4], n);

        switch (sig) {
        case QPU_SIG_SW_BREAKPOINT:
        case QPU_SIG_NONE:
        case QPU_SIG_SMALL_IMM:
        case QPU_SIG_LOAD_IMM:
                break;

        case QPU_SIG_THREAD_SWITCH:
        case QPU_SIG_LAST_THREAD_SWITCH:
                /* All accumulator contents and flags are undefined after the
                 * switch.
                 */
                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
                        add_write_dep(state, &state->last_r[i], n);
                add_write_dep(state, &state->last_sf, n);

                /* Scoreboard-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(state, &state->last_tlb, n);

                add_write_dep(state, &state->last_tmu_write, n);
                break;

        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
                /* TMU loads are coming from a FIFO, so ordering is
                 * important.
                 */
                add_write_dep(state, &state->last_tmu_write, n);
                break;

        case QPU_SIG_COLOR_LOAD:
                add_read_dep(state, state->last_tlb, n);
                break;

        case QPU_SIG_BRANCH:
                add_read_dep(state, state->last_sf, n);
                break;

        case QPU_SIG_PROG_END:
        case QPU_SIG_WAIT_FOR_SCOREBOARD:
        case QPU_SIG_SCOREBOARD_UNLOCK:
        case QPU_SIG_COVERAGE_LOAD:
        case QPU_SIG_COLOR_LOAD_END:
        case QPU_SIG_ALPHA_MASK_LOAD:
                fprintf(stderr, "Unhandled signal bits %d\n", sig);
                abort();
        }

        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD));
        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL));
        if ((inst & QPU_SF) && sig != QPU_SIG_BRANCH)
                add_write_dep(state, &state->last_sf, n);
}

static void
calculate_forward_deps(struct vc4_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.dir = F;

        list_for_each_entry(struct schedule_node, node, schedule_list, link)
                calculate_deps(&state, node);
}

static void
calculate_reverse_deps(struct vc4_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.dir = R;

        list_for_each_entry_rev(struct schedule_node, node, schedule_list,
                                link) {
                calculate_deps(&state, (struct schedule_node *)node);
        }
}

struct choose_scoreboard {
        struct dag *dag;
        int tick;
        int last_sfu_write_tick;
        int last_uniforms_reset_tick;
        uint32_t last_waddr_a, last_waddr_b;
        bool tlb_locked;
};
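
/* The scoreboard tracks just enough recent history to enforce the
 * hardware's hazard rules at pick time: which physical A/B register was
 * written by the previous instruction, how many ticks ago the last SFU and
 * UNIFORMS_ADDRESS writes happened, and whether the TLB scoreboard has been
 * locked yet.
 */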
static bool
reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        /* Full immediate loads don't read any registers. */
        if (sig == QPU_SIG_LOAD_IMM)
                return false;

        uint32_t src_muxes[] = {
                QPU_GET_FIELD(inst, QPU_ADD_A),
                QPU_GET_FIELD(inst, QPU_ADD_B),
                QPU_GET_FIELD(inst, QPU_MUL_A),
                QPU_GET_FIELD(inst, QPU_MUL_B),
        };
        for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                if ((src_muxes[i] == QPU_MUX_A &&
                     raddr_a < 32 &&
                     scoreboard->last_waddr_a == raddr_a) ||
                    (src_muxes[i] == QPU_MUX_B &&
                     sig != QPU_SIG_SMALL_IMM &&
                     raddr_b < 32 &&
                     scoreboard->last_waddr_b == raddr_b)) {
                        return true;
                }

                if (src_muxes[i] == QPU_MUX_R4) {
                        if (scoreboard->tick -
                            scoreboard->last_sfu_write_tick <= 2) {
                                return true;
                        }
                }
        }

        if (sig == QPU_SIG_SMALL_IMM &&
            QPU_GET_FIELD(inst, QPU_SMALL_IMM) >= QPU_SMALL_IMM_MUL_ROT) {
                uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
                uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);

                if (scoreboard->last_waddr_a == mux_a + QPU_W_ACC0 ||
                    scoreboard->last_waddr_a == mux_b + QPU_W_ACC0 ||
                    scoreboard->last_waddr_b == mux_a + QPU_W_ACC0 ||
                    scoreboard->last_waddr_b == mux_b + QPU_W_ACC0) {
                        return true;
                }
        }

        if (reads_uniform(inst) &&
            scoreboard->tick - scoreboard->last_uniforms_reset_tick <= 2) {
                return true;
        }

        return false;
}

static bool
pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        return (scoreboard->tick < 2 && qpu_inst_is_tlb(inst));
}

static int
get_instruction_priority(uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
        uint32_t baseline_score;
        uint32_t next_score = 0;

        /* Schedule TLB operations as late as possible, to get more
         * parallelism between shaders.
         */
        if (qpu_inst_is_tlb(inst))
                return next_score;
        next_score++;

        /* Schedule texture read results collection late to hide latency. */
        if (sig == QPU_SIG_LOAD_TMU0 || sig == QPU_SIG_LOAD_TMU1)
                return next_score;
        next_score++;

        /* Default score for things that aren't otherwise special. */
        baseline_score = next_score;
        next_score++;

        /* Schedule texture read setup early to hide their latency better. */
        if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul))
                return next_score;
        next_score++;

        return baseline_score;
}
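
/* With the scores above, TLB accesses rank lowest (0), TMU result loads
 * next (1), ordinary instructions next (2), and TMU setup writes highest
 * (3): texture fetches get issued as early as possible and their results
 * collected as late as possible.
 */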
static struct schedule_node *
choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
                               struct list_head *schedule_list,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
         * will handle pairing it along with filling the delay slots.
         */
        if (prev_inst) {
                uint32_t prev_sig = QPU_GET_FIELD(prev_inst->inst->inst,
                                                  QPU_SIG);
                if (prev_sig == QPU_SIG_THREAD_SWITCH ||
                    prev_sig == QPU_SIG_LAST_THREAD_SWITCH) {
                        return NULL;
                }
        }

        list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                            dag.link) {
                uint64_t inst = n->inst->inst;
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                /* Don't choose the branch instruction until it's the last one
                 * left.  XXX: We could potentially choose it before it's the
                 * last one, if the remaining instructions fit in the delay
                 * slots.
                 */
                if (sig == QPU_SIG_BRANCH &&
                    !list_is_singular(&scoreboard->dag->heads)) {
                        continue;
                }

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, inst))
                        continue;

                /* "A scoreboard wait must not occur in the first two
                 *  instructions of a fragment shader. This is either the
                 *  explicit Wait for Scoreboard signal or an implicit wait
                 *  with the first tile-buffer read or write instruction."
                 */
                if (pixel_scoreboard_too_soon(scoreboard, inst))
                        continue;

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        /* Don't pair up a thread switch signal -- we'll
                         * handle pairing it when we pick it on its own.
                         */
                        if (sig == QPU_SIG_THREAD_SWITCH ||
                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
                                continue;
                        }

                        if (prev_inst->uniform != -1 && n->uniform != -1)
                                continue;

                        /* Don't merge in something that will lock the TLB.
                         * Hopefully what we have in inst will release some
                         * other instructions, allowing us to delay the
                         * TLB-locking instruction until later.
                         */
                        if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
                                continue;

                        inst = qpu_merge_inst(prev_inst->inst->inst, inst);
                        if (!inst)
                                continue;
                }

                int prio = get_instruction_priority(inst);

                /* Found a valid instruction.  If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        return chosen;
}

static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                             uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

        if (!(inst & QPU_WS)) {
                scoreboard->last_waddr_a = waddr_add;
                scoreboard->last_waddr_b = waddr_mul;
        } else {
                scoreboard->last_waddr_b = waddr_add;
                scoreboard->last_waddr_a = waddr_mul;
        }

        if ((waddr_add >= QPU_W_SFU_RECIP && waddr_add <= QPU_W_SFU_LOG) ||
            (waddr_mul >= QPU_W_SFU_RECIP && waddr_mul <= QPU_W_SFU_LOG)) {
                scoreboard->last_sfu_write_tick = scoreboard->tick;
        }

        if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
            waddr_mul == QPU_W_UNIFORMS_ADDRESS) {
                scoreboard->last_uniforms_reset_tick = scoreboard->tick;
        }

        if (qpu_inst_is_tlb(inst))
                scoreboard->tlb_locked = true;
}
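
/* Note that QPU_WS swaps which physical register file the add and mul
 * pipelines write to, which is why last_waddr_a/b above are updated from
 * opposite WADDR fields when it is set.
 */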
static void
dump_state(struct dag *dag)
{
        list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
                fprintf(stderr, "         t=%4d: ", n->unblocked_time);
                vc4_qpu_disasm(&n->inst->inst, 1);
                fprintf(stderr, "\n");

                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                        struct schedule_node *child =
                                (struct schedule_node *)edge->child;
                        if (!child)
                                continue;

                        fprintf(stderr, "                 - ");
                        vc4_qpu_disasm(&child->inst->inst, 1);
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->dag.parent_count,
                                edge->data ? 'w' : 'r');
                }
        }
}

static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
{
        if (waddr < 32)
                return 2;

        /* Apply some huge latency between texture fetch requests and getting
         * their results back.
         *
         * FIXME: This is actually pretty bogus.  If we do:
         *
         * mov tmu0_s, a
         * <a bit of math>
         * mov tmu0_s, b
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * we count that as worse than
         *
         * mov tmu0_s, a
         * mov tmu0_s, b
         * <lots of math>
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * because we associate the first load_tmu0 with the *second* tmu0_s.
         */
        if (waddr == QPU_W_TMU0_S) {
                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0)
                        return 100;
        }
        if (waddr == QPU_W_TMU1_S) {
                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1)
                        return 100;
        }

        switch (waddr) {
        case QPU_W_SFU_RECIP:
        case QPU_W_SFU_RECIPSQRT:
        case QPU_W_SFU_EXP:
        case QPU_W_SFU_LOG:
                return 3;
        default:
                return 1;
        }
}

static uint32_t
instruction_latency(struct schedule_node *before, struct schedule_node *after)
{
        uint64_t before_inst = before->inst->inst;
        uint64_t after_inst = after->inst->inst;

        return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD),
                                  after_inst),
                    waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL),
                                  after_inst));
}

/** Recursive computation of the delay member of a node. */
static void
compute_delay(struct dag_node *node, void *state)
{
        struct schedule_node *n = (struct schedule_node *)node;

        n->delay = 1;

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;
                n->delay = MAX2(n->delay, (child->delay +
                                           instruction_latency(n, child)));
        }
}
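
/* compute_delay() runs bottom-up over the DAG (see dag_traverse_bottom_up()
 * in qpu_schedule_instructions_block()), so every child's delay is final
 * before its parents are visited, and the MAX2() accumulation yields the
 * critical-path length used as a tie-breaker in
 * choose_instruction_to_schedule().
 */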
/* Removes a DAG head, removing only the WAR edges.  (dag_prune_head()
 * should be called on it later to finish pruning the other edges.)
 */
static void
pre_remove_head(struct dag *dag, struct schedule_node *n)
{
        list_delinit(&n->dag.link);

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                if (edge->data)
                        dag_remove_edge(dag, edge);
        }
}

static void
mark_instruction_scheduled(struct dag *dag,
                           uint32_t time,
                           struct schedule_node *node)
{
        if (!node)
                return;

        util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;

                if (!child)
                        continue;

                uint32_t latency = instruction_latency(node, child);

                child->unblocked_time = MAX2(child->unblocked_time,
                                             time + latency);
        }
        dag_prune_head(dag, &node->dag);
}

/**
 * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair
 * with another instruction.
 */
static void
emit_thrsw(struct vc4_compile *c,
           struct choose_scoreboard *scoreboard,
           uint64_t inst)
{
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        /* There should be nothing in a thrsw inst being scheduled other than
         * the signal bits.
         */
        assert(QPU_GET_FIELD(inst, QPU_OP_ADD) == QPU_A_NOP);
        assert(QPU_GET_FIELD(inst, QPU_OP_MUL) == QPU_M_NOP);

        /* Try to find an earlier scheduled instruction that we can merge the
         * thrsw into.
         */
        int thrsw_ip = c->qpu_inst_count;
        for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) {
                uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i];
                uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG);

                if (prev_sig == QPU_SIG_NONE)
                        thrsw_ip = c->qpu_inst_count - i;
        }

        if (thrsw_ip != c->qpu_inst_count) {
                /* Merge the thrsw into the existing instruction. */
                c->qpu_insts[thrsw_ip] =
                        QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG);
        } else {
                qpu_serialize_one_inst(c, inst);
                update_scoreboard_for_chosen(scoreboard, inst);
        }

        /* Fill the delay slots. */
        while (c->qpu_inst_count < thrsw_ip + 3) {
                update_scoreboard_for_chosen(scoreboard, qpu_NOP());
                qpu_serialize_one_inst(c, qpu_NOP());
        }
}
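
/* The thread switch doesn't take effect until after its delay slots, so
 * emit_thrsw() either folds the signal into a recent signal-free
 * instruction or serializes it as-is, then pads with NOPs until two
 * instructions follow the switch at thrsw_ip.
 */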
static uint32_t
schedule_instructions(struct vc4_compile *c,
                      struct choose_scoreboard *scoreboard,
                      struct qblock *block,
                      struct list_head *schedule_list,
                      enum quniform_contents *orig_uniform_contents,
                      uint32_t *orig_uniform_data,
                      uint32_t *next_uniform)
{
        uint32_t time = 0;

        while (!list_is_empty(&scoreboard->dag->heads)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(scoreboard,
                                                       schedule_list,
                                                       NULL);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                uint64_t inst = chosen ? chosen->inst->inst : qpu_NOP();

                if (debug) {
                        fprintf(stderr, "t=%4d: current list:\n",
                                time);
                        dump_state(scoreboard->dag);
                        fprintf(stderr, "t=%4d: chose: ", time);
                        vc4_qpu_disasm(&inst, 1);
                        fprintf(stderr, "\n");
                }

                /* Schedule this instruction onto the QPU list.  Also try to
                 * find an instruction to pair with it.
                 */
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
                        pre_remove_head(scoreboard->dag, chosen);
                        if (chosen->uniform != -1) {
                                c->uniform_data[*next_uniform] =
                                        orig_uniform_data[chosen->uniform];
                                c->uniform_contents[*next_uniform] =
                                        orig_uniform_contents[chosen->uniform];
                                (*next_uniform)++;
                        }

                        merge = choose_instruction_to_schedule(scoreboard,
                                                               schedule_list,
                                                               chosen);
                        if (merge) {
                                time = MAX2(merge->unblocked_time, time);
                                inst = qpu_merge_inst(inst, merge->inst->inst);
                                assert(inst != 0);
                                if (merge->uniform != -1) {
                                        c->uniform_data[*next_uniform] =
                                                orig_uniform_data[merge->uniform];
                                        c->uniform_contents[*next_uniform] =
                                                orig_uniform_contents[merge->uniform];
                                        (*next_uniform)++;
                                }

                                if (debug) {
                                        fprintf(stderr, "t=%4d: merging: ",
                                                time);
                                        vc4_qpu_disasm(&merge->inst->inst, 1);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, "           resulting in: ");
                                        vc4_qpu_disasm(&inst, 1);
                                        fprintf(stderr, "\n");
                                }
                        }
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready
                 * to be scheduled.  Update the children's unblocked time for
                 * this DAG edge as we do so.
                 */
                mark_instruction_scheduled(scoreboard->dag, time, chosen);
                mark_instruction_scheduled(scoreboard->dag, time, merge);

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH ||
                    QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) {
                        emit_thrsw(c, scoreboard, inst);
                } else {
                        qpu_serialize_one_inst(c, inst);
                        update_scoreboard_for_chosen(scoreboard, inst);
                }

                scoreboard->tick++;
                time++;

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) {
                        block->branch_qpu_ip = c->qpu_inst_count - 1;
                        /* Fill the delay slots.
                         *
                         * We should fill these with actual instructions
                         * instead, but that will probably need to be done
                         * after this pass, once we know what the leading
                         * instructions of the successors are (so we can
                         * handle A/B register file write latency).
                         */
                        inst = qpu_NOP();
                        update_scoreboard_for_chosen(scoreboard, inst);
                        qpu_serialize_one_inst(c, inst);
                        qpu_serialize_one_inst(c, inst);
                        qpu_serialize_one_inst(c, inst);
                }
        }

        return time;
}
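
/* The return value of schedule_instructions() is an estimated cycle count
 * for the block: time advances by one per emitted instruction, and jumps
 * forward to the chosen instruction's unblocked_time when its dependencies'
 * latencies haven't elapsed yet.
 */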
static uint32_t
qpu_schedule_instructions_block(struct vc4_compile *c,
                                struct choose_scoreboard *scoreboard,
                                struct qblock *block,
                                enum quniform_contents *orig_uniform_contents,
                                uint32_t *orig_uniform_data,
                                uint32_t *next_uniform)
{
        scoreboard->dag = dag_create(NULL);
        struct list_head setup_list;

        list_inithead(&setup_list);

        /* Wrap each instruction in a scheduler structure. */
        uint32_t next_sched_uniform = *next_uniform;
        while (!list_is_empty(&block->qpu_inst_list)) {
                struct queued_qpu_inst *inst =
                        (struct queued_qpu_inst *)block->qpu_inst_list.next;
                struct schedule_node *n = rzalloc(scoreboard->dag,
                                                  struct schedule_node);

                dag_init_node(scoreboard->dag, &n->dag);
                n->inst = inst;

                if (reads_uniform(inst->inst)) {
                        n->uniform = next_sched_uniform++;
                } else {
                        n->uniform = -1;
                }
                list_del(&inst->link);
                list_addtail(&n->link, &setup_list);
        }

        calculate_forward_deps(c, scoreboard->dag, &setup_list);
        calculate_reverse_deps(c, scoreboard->dag, &setup_list);

        dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL);

        uint32_t cycles = schedule_instructions(c, scoreboard, block,
                                                &setup_list,
                                                orig_uniform_contents,
                                                orig_uniform_data,
                                                next_uniform);

        ralloc_free(scoreboard->dag);
        scoreboard->dag = NULL;

        return cycles;
}

static void
qpu_set_branch_targets(struct vc4_compile *c)
{
        qir_for_each_block(block, c) {
                /* The end block of the program has no branch. */
                if (!block->successors[0])
                        continue;

                /* If there was no branch instruction, then the successor
                 * block must follow immediately after this one.
                 */
                if (block->branch_qpu_ip == ~0) {
                        assert(block->end_qpu_ip + 1 ==
                               block->successors[0]->start_qpu_ip);
                        continue;
                }

                /* Set the branch target for the block that doesn't follow
                 * immediately after ours.
                 */
                uint64_t *branch_inst = &c->qpu_insts[block->branch_qpu_ip];
                assert(QPU_GET_FIELD(*branch_inst, QPU_SIG) == QPU_SIG_BRANCH);
                assert(QPU_GET_FIELD(*branch_inst, QPU_BRANCH_TARGET) == 0);

                uint32_t branch_target =
                        (block->successors[0]->start_qpu_ip -
                         (block->branch_qpu_ip + 4)) * sizeof(uint64_t);
                *branch_inst = (*branch_inst |
                                QPU_SET_FIELD(branch_target, QPU_BRANCH_TARGET));

                /* Make sure that the if-we-don't-jump successor was scheduled
                 * just after the delay slots.
                 */
                if (block->successors[1]) {
                        assert(block->successors[1]->start_qpu_ip ==
                               block->branch_qpu_ip + 4);
                }
        }
}
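
/* Branch targets are encoded as byte offsets relative to the instruction
 * following the branch's three delay slots (branch_qpu_ip + 4), which is
 * why qpu_set_branch_targets() scales the instruction delta by
 * sizeof(uint64_t).
 */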
uint32_t
qpu_schedule_instructions(struct vc4_compile *c)
{
        /* We reorder the uniforms as we schedule instructions, so save the
         * old data off and replace it.
         */
        uint32_t *uniform_data = c->uniform_data;
        enum quniform_contents *uniform_contents = c->uniform_contents;
        c->uniform_contents = ralloc_array(c, enum quniform_contents,
                                           c->num_uniforms);
        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
        c->uniform_array_size = c->num_uniforms;
        uint32_t next_uniform = 0;

        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
        scoreboard.last_waddr_a = ~0;
        scoreboard.last_waddr_b = ~0;
        scoreboard.last_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                qir_for_each_block(block, c) {
                        fprintf(stderr, "BLOCK %d\n", block->index);
                        list_for_each_entry(struct queued_qpu_inst, q,
                                            &block->qpu_inst_list, link) {
                                vc4_qpu_disasm(&q->inst, 1);
                                fprintf(stderr, "\n");
                        }
                }
                fprintf(stderr, "\n");
        }

        uint32_t cycles = 0;
        qir_for_each_block(block, c) {
                block->start_qpu_ip = c->qpu_inst_count;
                block->branch_qpu_ip = ~0;

                cycles += qpu_schedule_instructions_block(c,
                                                          &scoreboard,
                                                          block,
                                                          uniform_contents,
                                                          uniform_data,
                                                          &next_uniform);

                block->end_qpu_ip = c->qpu_inst_count - 1;
        }

        qpu_set_branch_targets(c);

        assert(next_uniform == c->num_uniforms);

        if (debug) {
                fprintf(stderr, "Post-schedule instructions\n");
                vc4_qpu_disasm(c->qpu_insts, c->qpu_inst_count);
                fprintf(stderr, "\n");
        }

        return cycles;
}