Path: blob/21.2-virgl/src/gallium/drivers/lima/ir/gp/scheduler.c
/*
 * Copyright (c) 2017 Lima Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#include <limits.h>

#include "gpir.h"

/*
 * GP scheduling algorithm (by Connor Abbott <[email protected]>)
 *
 * The GP pipeline has three main stages:
 *
 * --------------------------------------------------------
 * |                                                      |
 * |               Register/Attr/Temp Fetch               |
 * |                                                      |
 * --------------------------------------------------------
 * |        |        |        |        |        |         |
 * |  Mul0  |  Mul1  |  Add0  |  Add1  |  Cplx  |  Pass   |
 * |        |        |        |        |        |         |
 * --------------------------------------------------------
 * |            |                             |           |
 * |  Complex1  |  Temp/Register/Varying      |  Pass     |
 * |  Stage 2   |  Store                      |  Stage 2  |
 * |            |                             |           |
 * --------------------------------------------------------
 *
 * Because of this setup, storing a register has a latency of three cycles.
 * Also, the register file is organized into 4-component vectors, and the
 * load stage can only load two vectors at a time. Aside from these highly
 * constrained register load/store units, there is an explicit bypass
 * network, where each unit (mul0/mul1/etc.) can access the results of
 * any unit from the previous two cycles directly, except for the complex
 * unit whose result can only be accessed for one cycle (since it's expected
 * to be used directly by the complex2 instruction in the following cycle).
 *
 * Because of the very restricted register file, and because only rarely are
 * all the units in use at the same time, it can be very beneficial to use
 * the unused units to "thread" a value from source to destination by using
 * moves in the otherwise-unused units, without involving the register file
 * at all. It's very difficult to fully exploit this with a traditional
 * scheduler, so we need to do something a little un-traditional. The 512
 * instruction limit means that for more complex shaders, we need to do as
 * well as possible or else the app won't even work.
 *
 * The scheduler works by considering the bypass network as a kind of
 * register file. It's a quite unusual register file, since registers have to
 * be assigned "on the fly" as we schedule operations, but with some care, we
 * can use something conceptually similar to a linear-scan allocator to
 * successfully schedule nodes to instructions without running into
 * conflicts.
 *
 * Values in the IR are separated into normal values, or "value registers",
 * which is what normal nodes like add, mul, etc. produce, and which only
 * live inside one basic block, and registers, which can span multiple basic
 * blocks but have to be accessed via special load_reg/store_reg nodes. RA
 * assigns physical registers to both value registers and normal registers,
 * treating load_reg/store_reg as a move instruction, but these are only used
 * directly for normal registers -- the physreg assigned to a value register
 * is "fake," and is only used inside the scheduler. Before scheduling we
 * insert write-after-read dependencies, even for value registers, as if
 * we're going to use those, but then we throw them away. For example, if we
 * had something like:
 *
 * (*)r2 = add (*)r1, (*)r2
 * (*)r1 = load_reg r0
 *
 * we'd insert a write-after-read dependency between the add and load_reg,
 * even though the starred registers aren't actually used by the scheduler
 * after this step. This step is crucial since it guarantees that during any
 * point in the schedule, the number of live registers + live value registers
 * will never exceed the capacity of the register file and the bypass network
 * combined. This is because each live register/value register will have a
 * different fake number, thanks to the fake dependencies inserted before
 * scheduling. This allows us to not have to worry about spilling to
 * temporaries, which is only done ahead of time.
 *
 * The scheduler is a bottom-up scheduler. It keeps track of each live value
 * register, and decides on-the-fly which value registers to keep in the
 * bypass network and which to "spill" to registers. Of particular importance
 * is the "ready list," which consists of "input nodes" (nodes that produce a
 * value that can be consumed via the bypass network), both "partially ready"
 * (only some of the uses have been scheduled) and "fully ready" (all uses
 * have been scheduled), as well as other non-input nodes like register
 * stores. Each input node on the ready list represents a live value register
 * before the current instruction. There must be at most 11 such input nodes
 * at all times, since there are only 11 slots in the next two instructions
 * which can reach the current instruction.
 *
 * An input node is a "max node" if it has a use two cycles ago, which must be
 * connected to a definition this cycle. Otherwise it may be a "next max node"
 * if it will be a max node on the next instruction (i.e. it has a use at most
 * one cycle ago), or it may be neither if all of its uses are this cycle. As
 * we keep adding instructions to the front, input nodes graduate from
 * neither, to next max, to max, unless we decide to insert a move to keep it
 * alive longer, at which point any uses after the current instruction are
 * rewritten to be uses of the move so that the original node returns to
 * neither. The scheduler decides which nodes to try freely, but we have to
 * reserve slots for two different reasons: (1) out of the 5 non-complex
 * slots, we reserve a slot for each max node, so that we can connect a
 * definition to the use 2 cycles ago. (2) Out of all 6 slots, we reserve a
 * slot for every next-max node above 5, so that for the next instruction
 * there are no more than 5 max nodes. When a max or next-max node gets
 * scheduled, the corresponding reservation is reduced by one. At the end, we
 * insert moves for every slot that was reserved. The reservation is actually
 * managed by gpir_instr, and all we have to do is tell it how many to reserve
 * at the beginning and then tell it which nodes are max/next-max nodes. When
 * we start scheduling an instruction, there will be at most 5 max nodes
 * thanks to the previous instruction's next-max reservation/move insertion.
 * Since there are at most 11 total input nodes, if there are N max nodes,
 * there are at most 11 - N next-max nodes, and therefore at most 11 - N - 5 =
 * 6 - N slots need to be reserved for next-max nodes, and so at most
 * 6 - N + N = 6 slots need to be reserved in total, exactly the total number
 * of slots. So, thanks to the total input node restriction, we will never
 * need to reserve too many slots.
 *
 * It sometimes happens that scheduling a given node will violate this total
 * input node restriction, or that a reservation will mean that we can't
 * schedule it. We first schedule a node "speculatively" to see if this is a
 * problem. If some of the node's sources are loads, then we can schedule
 * the node and its dependent loads in one swoop to avoid going over the
 * pressure limit. If that fails, we can try to spill a ready or
 * partially-ready input node to a register by rewriting all of its uses to
 * refer to a register load. This removes it from the list of ready and
 * partially ready input nodes as all of its uses are now unscheduled. If
 * successful, we can then proceed with scheduling the original node. All of
 * this happens "speculatively," meaning that afterwards the node is removed
 * and the entire state of the scheduler is reverted to before it was tried,
 * to ensure that we never get into an invalid state and run out of spots for
 * moves. In try_node(), we speculatively try to schedule each node on the
 * ready list, keeping only the nodes that could be successfully scheduled, so
 * that when we finally decide which node to actually schedule, we know it
 * will succeed. This is how we decide on the fly which values go in
 * registers and which go in the bypass network. Note that "unspilling" a
 * value is simply a matter of scheduling the store_reg instruction created
 * when we spill.
 *
 * The careful accounting of live value registers, reservations for moves, and
 * speculative scheduling guarantee that we never run into a failure case
 * while scheduling. However, we need to make sure that this scheduler will
 * not get stuck in an infinite loop, i.e. that we'll always make forward
 * progress by eventually scheduling a non-move node. If we run out of value
 * registers, then we may have to spill a node to a register. If we were to
 * schedule one of the fully-ready nodes, then we'd have 11 + N live value
 * registers before the current instruction, for some N > 0. But since there
 * are at most 64 + 11 live registers and register values total, thanks to
 * the fake dependencies we inserted before scheduling, there are at most
 * 64 - N live physical registers, and therefore there are at least N
 * registers available for spilling. Not all these registers will be
 * available immediately, since in order to spill a node to a given register
 * we have to ensure that there are slots available to rewrite every use to a
 * load instruction, and that may not be the case. There may also be
 * intervening writes which prevent some registers from being used.
 * However, these are all temporary problems, since as we create each
 * instruction, we create additional register load slots that can be freely
 * used for spilling, and we create more move nodes which means that the uses
 * of the nodes we're trying to spill keep moving forward. This means that
 * eventually, these problems will go away, at which point we'll be able to
 * spill a node successfully, so eventually we'll be able to schedule the
 * first node on the ready list.
 */

typedef struct {
   /* This is the list of ready and partially-ready nodes. A partially-ready
    * node must have at least one input dependency already scheduled.
    */
   struct list_head ready_list;

   /* The number of ready or partially-ready nodes with at least one input
    * dependency already scheduled. In other words, the number of live value
    * registers. This must be at most 11.
    */
   int ready_list_slots;

   /* The physical registers live into the current instruction. */
   uint64_t live_physregs;

   /* The current instruction. */
   gpir_instr *instr;

   /* The current basic block. */
   gpir_block *block;

   /* True if at least one node failed to schedule due to lack of available
    * value registers.
    */
   bool try_spill_all;

   /* The number of max nodes that need to be spilled to successfully
    * schedule the instruction.
    */
   int max_node_spill_needed;

   /* The number of max and next-max nodes that need to be spilled to
    * successfully schedule the instruction.
    */
   int total_spill_needed;

   /* For each physical register, a linked list of loads associated with it
    * in this block. When we spill a value to a given register, and there are
    * existing loads associated with it that haven't been scheduled yet, we
    * have to make sure that the corresponding unspill happens after the last
    * original use has happened, i.e. is scheduled before.
    */
   struct list_head physreg_reads[GPIR_PHYSICAL_REG_NUM];
} sched_ctx;

static int gpir_min_dist_alu(gpir_dep *dep)
{
   switch (dep->pred->op) {
   case gpir_op_load_uniform:
   case gpir_op_load_temp:
   case gpir_op_load_reg:
   case gpir_op_load_attribute:
      return 0;

   case gpir_op_complex1:
      return 2;

   default:
      return 1;
   }
}
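/* Minimum number of instructions that must separate dep->pred from dep->succ
 * for the dependency to be satisfied.
 */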
static int gpir_get_min_dist(gpir_dep *dep)
{
   switch (dep->type) {
   case GPIR_DEP_INPUT:
      switch (dep->succ->op) {
      case gpir_op_store_temp:
      case gpir_op_store_reg:
      case gpir_op_store_varying:
         /* Stores must use an alu node as input. Also, complex1 takes two
          * cycles, which means that its result cannot be stored to a register
          * as part of the normal path, and therefore it must also have a move
          * inserted.
          */
         if (dep->pred->type == gpir_node_type_load ||
             dep->pred->op == gpir_op_complex1)
            return INT_MAX >> 2;
         else
            return 0;

      default:
         return gpir_min_dist_alu(dep);
      }

   case GPIR_DEP_OFFSET:
      assert(dep->succ->op == gpir_op_store_temp);
      return gpir_min_dist_alu(dep);

   case GPIR_DEP_READ_AFTER_WRITE:
      if (dep->succ->op == gpir_op_load_temp &&
          dep->pred->op == gpir_op_store_temp) {
         return 4;
      } else if (dep->succ->op == gpir_op_load_reg &&
                 dep->pred->op == gpir_op_store_reg) {
         return 3;
      } else if ((dep->pred->op == gpir_op_store_temp_load_off0 ||
                  dep->pred->op == gpir_op_store_temp_load_off1 ||
                  dep->pred->op == gpir_op_store_temp_load_off2) &&
                 dep->succ->op == gpir_op_load_uniform) {
         return 4;
      } else {
         /* Fake dependency */
         return 0;
      }

   case GPIR_DEP_WRITE_AFTER_READ:
      return 0;
   }

   return 0;
}

static int gpir_max_dist_alu(gpir_dep *dep)
{
   switch (dep->pred->op) {
   case gpir_op_load_uniform:
   case gpir_op_load_temp:
      return 0;
   case gpir_op_load_attribute:
      return 1;
   case gpir_op_load_reg:
      if (dep->pred->sched.pos < GPIR_INSTR_SLOT_REG0_LOAD0 ||
          dep->pred->sched.pos > GPIR_INSTR_SLOT_REG0_LOAD3)
         return 0;
      else
         return 1;
   case gpir_op_exp2_impl:
   case gpir_op_log2_impl:
   case gpir_op_rcp_impl:
   case gpir_op_rsqrt_impl:
   case gpir_op_store_temp_load_off0:
   case gpir_op_store_temp_load_off1:
   case gpir_op_store_temp_load_off2:
      return 1;
   case gpir_op_mov:
      if (dep->pred->sched.pos == GPIR_INSTR_SLOT_COMPLEX)
         return 1;
      else
         return 2;
   default:
      return 2;
   }
}

static int gpir_get_max_dist(gpir_dep *dep)
{
   switch (dep->type) {
   case GPIR_DEP_INPUT:
      switch (dep->succ->op) {
      case gpir_op_store_temp:
      case gpir_op_store_reg:
      case gpir_op_store_varying:
         return 0;

      default:
         return gpir_max_dist_alu(dep);
      }

   case GPIR_DEP_OFFSET:
      assert(dep->succ->op == gpir_op_store_temp);
      return gpir_max_dist_alu(dep);

   default:
      return INT_MAX >> 2; /* Don't want to overflow... */
   }
}

static void schedule_update_distance(gpir_node *node)
{
   if (gpir_node_is_leaf(node)) {
      node->sched.dist = 0;
      return;
   }

   gpir_node_foreach_pred(node, dep) {
      gpir_node *pred = dep->pred;

      if (pred->sched.dist < 0)
         schedule_update_distance(pred);

      int dist = pred->sched.dist + gpir_min_dist_alu(dep);
      if (node->sched.dist < dist)
         node->sched.dist = dist;
   }
}
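/* An input node is a node whose result is consumed directly through the
 * bypass network, i.e. it has at least one GPIR_DEP_INPUT successor and
 * therefore occupies a value register while it is live.
 */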
static bool gpir_is_input_node(gpir_node *node)
{
   gpir_node_foreach_succ(node, dep) {
      if (dep->type == GPIR_DEP_INPUT)
         return true;
   }
   return false;
}


/* Get the number of slots required for a node on the ready list.
 */
static int gpir_get_slots_required(gpir_node *node)
{
   if (!gpir_is_input_node(node))
      return 0;

   /* Note that we assume every node only consumes one slot, even dual-slot
    * instructions. While dual-slot instructions may consume more than one
    * slot, we can always safely insert a move if it turns out that there
    * isn't enough space for them. There's the risk that we get stuck in an
    * infinite loop if all the fully ready nodes are dual-slot nodes, but we
    * rely on spilling to registers to save us here.
    */
   return 1;
}

static void ASSERTED verify_ready_list(sched_ctx *ctx)
{
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (!gpir_is_input_node(node)) {
         assert(node->sched.ready);
      }

      if (node->sched.ready) {
         /* Every successor must have been scheduled */
         gpir_node_foreach_succ(node, dep) {
            assert(dep->succ->sched.instr);
         }
      } else {
         /* There must be at least one successor that's not scheduled. */
         bool unscheduled = false;
         gpir_node_foreach_succ(node, dep) {
            unscheduled |= !(dep->succ->sched.instr);
         }

         assert(unscheduled);
      }
   }
}

static void schedule_insert_ready_list(sched_ctx *ctx,
                                       gpir_node *insert_node)
{
   /* Check whether this node is fully ready or partially ready:
    *   fully ready: all successors have been scheduled
    *   partially ready: some of the input successors have been scheduled
    *
    * Both fully-ready and partially-ready nodes need to be inserted into
    * the ready list, but we only schedule a move node for a partially-ready
    * node.
    */
   bool ready = true, insert = false;
   gpir_node_foreach_succ(insert_node, dep) {
      gpir_node *succ = dep->succ;
      if (succ->sched.instr) {
         if (dep->type == GPIR_DEP_INPUT)
            insert = true;
      }
      else
         ready = false;
   }

   insert_node->sched.ready = ready;
   /* for root node */
   insert |= ready;

   if (!insert || insert_node->sched.inserted)
      return;

   struct list_head *insert_pos = &ctx->ready_list;
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if ((insert_node->sched.dist > node->sched.dist ||
           gpir_op_infos[insert_node->op].schedule_first) &&
          !gpir_op_infos[node->op].schedule_first) {
         insert_pos = &node->list;
         break;
      }
   }

   list_addtail(&insert_node->list, insert_pos);
   insert_node->sched.inserted = true;
   ctx->ready_list_slots += gpir_get_slots_required(insert_node);
}

static int gpir_get_max_start(gpir_node *node)
{
   int max_start = 0;

   /* find the max start instr constrained by all successors */
   gpir_node_foreach_succ(node, dep) {
      gpir_node *succ = dep->succ;
      if (!succ->sched.instr)
         continue;

      int start = succ->sched.instr->index + gpir_get_min_dist(dep);
      if (start > max_start)
         max_start = start;
   }

   return max_start;
}

static int gpir_get_min_end(gpir_node *node)
{
   int min_end = INT_MAX;

   /* find the min end instr constrained by all successors */
   gpir_node_foreach_succ(node, dep) {
      gpir_node *succ = dep->succ;
      if (!succ->sched.instr)
         continue;

      int end = succ->sched.instr->index + gpir_get_max_dist(dep);
      if (end < min_end)
         min_end = end;
   }

   return min_end;
}
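/* Check whether "instr" already contains a load identical to "node" (same
 * op, index, and component). If so, return the existing load so the new one
 * can share its slot instead of being scheduled separately.
 */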
static gpir_node *gpir_sched_instr_has_load(gpir_instr *instr, gpir_node *node)
{
   gpir_load_node *load = gpir_node_to_load(node);

   for (int i = GPIR_INSTR_SLOT_REG0_LOAD0; i <= GPIR_INSTR_SLOT_MEM_LOAD3; i++) {
      if (!instr->slots[i])
         continue;

      gpir_load_node *iload = gpir_node_to_load(instr->slots[i]);
      if (load->node.op == iload->node.op &&
          load->index == iload->index &&
          load->component == iload->component)
         return &iload->node;
   }
   return NULL;
}

/* Simply place the node into the given instruction without trying to deal
 * with liveness or the ready list. This will only fail if the instruction
 * cannot be placed due to a lack of available slots. In addition to normal
 * node placement, this is also used for placing loads when spilling to
 * registers.
 */
static bool _try_place_node(sched_ctx *ctx, gpir_instr *instr, gpir_node *node)
{
   if (node->type == gpir_node_type_load) {
      gpir_node *load = gpir_sched_instr_has_load(instr, node);
      if (load) {
         /* This node may have a store as a successor, in which case we have
          * to fail it exactly like below in order to later create a move node
          * in between.
          */
         if (instr->index < gpir_get_max_start(node))
            return false;

         gpir_debug("same load %d in instr %d for node %d\n",
                    load->index, instr->index, node->index);

         /* don't really merge the two nodes, just pretend this one was
          * scheduled at the same place */
         node->sched.instr = load->sched.instr;
         node->sched.pos = load->sched.pos;
         return true;
      }
   }

   if (node->op == gpir_op_store_reg) {
      /* This register may be loaded in the next basic block, in which case
       * there still needs to be a 2 instruction gap. We do what the blob
       * seems to do and simply disable stores in the last two instructions of
       * the basic block.
       *
       * TODO: We may be able to do better than this, but we have to check
       * first if storing a register works across branches.
       */
      if (instr->index < 2)
         return false;
   }

   node->sched.instr = instr;

   int max_node_spill_needed = INT_MAX;
   int total_spill_needed = INT_MAX;
   int *slots = gpir_op_infos[node->op].slots;
   for (int i = 0; slots[i] != GPIR_INSTR_SLOT_END; i++) {
      node->sched.pos = slots[i];
      if (instr->index >= gpir_get_max_start(node) &&
          instr->index <= gpir_get_min_end(node) &&
          gpir_instr_try_insert_node(instr, node))
         return true;
      if (ctx->instr->non_cplx_slot_difference ||
          ctx->instr->slot_difference) {
         /* If one of these fields is non-zero, then we could insert the node
          * here after spilling. To get an accurate count of how many nodes we
          * need to spill, we need to choose one of the positions where there
          * were nonzero slot differences, preferably one with the smallest
          * difference (so we don't have to spill as much).
          */
         if (ctx->instr->non_cplx_slot_difference < max_node_spill_needed ||
             ctx->instr->slot_difference < total_spill_needed) {
            max_node_spill_needed = ctx->instr->non_cplx_slot_difference;
            total_spill_needed = ctx->instr->slot_difference;
         }
      }
   }

   if (max_node_spill_needed != INT_MAX) {
      /* Indicate how many spill nodes are needed. */
      ctx->max_node_spill_needed = MAX2(ctx->max_node_spill_needed,
                                        max_node_spill_needed);
      ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
                                     total_spill_needed);
   }
   node->sched.instr = NULL;
   node->sched.pos = -1;
   return false;
}

/* Try to place just the node given, updating the ready list. If "speculative"
 * is true, then this is part of the pre-commit phase. If false, then we have
 * committed to placing this node, so update liveness and ready list
 * information.
 */

static bool schedule_try_place_node(sched_ctx *ctx, gpir_node *node,
                                    bool speculative)
{
   if (!_try_place_node(ctx, ctx->instr, node)) {
      if (!speculative)
         gpir_debug("failed to place %d\n", node->index);
      return false;
   }

   ctx->ready_list_slots -= gpir_get_slots_required(node);

   if (!speculative) {
      gpir_debug("placed node %d\n", node->index);

      /* We assume here that writes are placed before reads. If this changes,
       * then this needs to be updated.
       */
      if (node->op == gpir_op_store_reg) {
         gpir_store_node *store = gpir_node_to_store(node);
         ctx->live_physregs &=
            ~(1ull << (4 * store->index + store->component));
         if (store->child->sched.physreg_store == store)
            store->child->sched.physreg_store = NULL;
      }

      if (node->op == gpir_op_load_reg) {
         gpir_load_node *load = gpir_node_to_load(node);
         ctx->live_physregs |=
            (1ull << (4 * load->index + load->component));
      }

      list_del(&node->list);
      list_add(&node->list, &ctx->block->node_list);
      gpir_node_foreach_pred(node, dep) {
         gpir_node *pred = dep->pred;
         schedule_insert_ready_list(ctx, pred);
      }
   } else {
      gpir_node_foreach_pred(node, dep) {
         gpir_node *pred = dep->pred;
         if (!pred->sched.inserted && dep->type == GPIR_DEP_INPUT)
            ctx->ready_list_slots += gpir_get_slots_required(pred);
      }
   }

   return true;
}

/* Create a new node with "node" as the child, replace all uses of "node" with
 * this new node, and replace "node" with it in the ready list.
 */
static gpir_node *create_replacement(sched_ctx *ctx, gpir_node *node,
                                     gpir_op op)
{
   gpir_alu_node *new_node = gpir_node_create(node->block, op);
   if (unlikely(!new_node))
      return NULL;

   new_node->children[0] = node;
   new_node->num_child = 1;

   new_node->node.sched.instr = NULL;
   new_node->node.sched.pos = -1;
   new_node->node.sched.dist = node->sched.dist;
   new_node->node.sched.max_node = node->sched.max_node;
   new_node->node.sched.next_max_node = node->sched.next_max_node;
   new_node->node.sched.complex_allowed = node->sched.complex_allowed;

   ctx->ready_list_slots--;
   list_del(&node->list);
   node->sched.max_node = false;
   node->sched.next_max_node = false;
   node->sched.ready = false;
   node->sched.inserted = false;
   gpir_node_replace_succ(&new_node->node, node);
   gpir_node_add_dep(&new_node->node, node, GPIR_DEP_INPUT);
   schedule_insert_ready_list(ctx, &new_node->node);
   return &new_node->node;
}
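/* Create a move with "node" as its child; the move takes over all of the
 * node's uses, keeping its value threaded through the bypass network.
 */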
static gpir_node *create_move(sched_ctx *ctx, gpir_node *node)
{
   gpir_node *move = create_replacement(ctx, node, gpir_op_mov);
   gpir_debug("create move %d for %d\n", move->index, node->index);
   return move;
}

static gpir_node *create_postlog2(sched_ctx *ctx, gpir_node *node)
{
   assert(node->op == gpir_op_complex1);
   gpir_node *postlog2 = create_replacement(ctx, node, gpir_op_postlog2);
   gpir_debug("create postlog2 %d for %d\n", postlog2->index, node->index);
   return postlog2;
}

/* Once we schedule the successor, would the predecessor be fully ready? */
static bool pred_almost_ready(gpir_dep *dep)
{
   bool fully_ready = true;
   gpir_node_foreach_succ(dep->pred, other_dep) {
      gpir_node *succ = other_dep->succ;
      if (!succ->sched.instr && dep->succ != other_dep->succ) {
         fully_ready = false;
         break;
      }
   }

   return fully_ready;
}

/* Recursively try to schedule a node and all its dependent nodes that can fit
 * in the same instruction. There is a simple heuristic scoring system to try
 * to group together nodes that load different components of the same input,
 * to avoid bottlenecking for operations like matrix multiplies that are
 * mostly input-bound.
 */

static int _schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
{
   if (!schedule_try_place_node(ctx, node, speculative))
      return INT_MIN;

   int score = 0;

   gpir_node_foreach_pred(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      int pred_score = INT_MIN;
      if (pred_almost_ready(dep)) {
         if (dep->pred->type == gpir_node_type_load ||
             node->type == gpir_node_type_store) {
            pred_score = _schedule_try_node(ctx, dep->pred, speculative);
         }
      }
      if (dep->pred->type == gpir_node_type_load ||
          node->type == gpir_node_type_store) {
         if (pred_score == INT_MIN) {
            if (node->op == gpir_op_mov) {
               /* The only moves on the ready list are for loads that we
                * couldn't schedule immediately, as created below. If we
                * couldn't schedule the load, there's no point scheduling
                * the move. The normal move threading logic will ensure
                * that another move is created if we're about to go too far
                * from the uses of this move.
                */
               assert(speculative);
               return INT_MIN;
            } else if (!speculative && dep->pred->type == gpir_node_type_load) {
               /* We couldn't schedule the load right away, so it will have
                * to happen in some earlier instruction and then be moved
                * into a value register and threaded to the use by "node".
                * We create the move right away, so that later we'll fail
                * to schedule it if there isn't a slot for a move
                * available.
                */
               create_move(ctx, dep->pred);
            }
            /* Penalize nodes whose dependent ops we couldn't schedule.
             */
            score--;
         } else {
            score += pred_score;
            continue;
         }
      }
   }

   return score;
}

/* If we speculatively tried a node, undo everything.
 */

static void schedule_undo_node(sched_ctx *ctx, gpir_node *node)
{
   gpir_instr_remove_node(ctx->instr, node);

   gpir_node_foreach_pred(node, dep) {
      gpir_node *pred = dep->pred;
      if (pred->sched.instr) {
         schedule_undo_node(ctx, pred);
      }
   }
}

/* Try to schedule a node. We also try to schedule any predecessors that can
 * be part of the same instruction. If "speculative" is true, then we don't
 * actually change any state, only returning the score were the node to be
 * scheduled, with INT_MIN meaning "cannot be scheduled at all".
 */
static int schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
{
   int prev_slots = ctx->ready_list_slots;

   int score = _schedule_try_node(ctx, node, speculative);

   if (ctx->ready_list_slots > GPIR_VALUE_REG_NUM) {
      assert(speculative);
      ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
                                     ctx->ready_list_slots - GPIR_VALUE_REG_NUM);
      score = INT_MIN;
   }

   if (speculative) {
      ctx->ready_list_slots = prev_slots;
      if (node->sched.instr)
         schedule_undo_node(ctx, node);
   }

   return score;
}
/* This is called when we want to spill "node" by inserting loads at its uses.
 * It returns all the possible registers we can use so that all the loads will
 * successfully be inserted. Also return the first instruction we'll need to
 * insert a load for.
 */

static uint64_t get_available_regs(sched_ctx *ctx, gpir_node *node,
                                   int *min_index)
{
   uint64_t available = ~0ull;
   gpir_node_foreach_succ(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      gpir_node *use = dep->succ;
      gpir_instr *instr = use->sched.instr;

      if (!instr) {
         /* This use isn't scheduled, so no need to spill it. */
         continue;
      }

      if (use->type == gpir_node_type_store) {
         /* We're trying to spill something that was recently stored... just
          * bail out.
          */
         return 0;
      }

      if (use->op == gpir_op_mov && instr == ctx->instr) {
         /* We try to spill the sources of this move, so we can free up space
          * in the current instruction.
          *
          * TODO: should we go back further? It might let us schedule the
          * write earlier in some cases, but then we might fail to spill.
          */
         available &= get_available_regs(ctx, use, min_index);
      } else {
         if (instr->index < *min_index)
            *min_index = instr->index;

         uint64_t use_available = 0;

         if (instr->reg0_use_count == 0)
            use_available = ~0ull;
         else if (!instr->reg0_is_attr)
            use_available = 0xfull << (4 * instr->reg0_index);

         if (instr->reg1_use_count == 0)
            use_available = ~0ull;
         else
            use_available |= 0xfull << (4 * instr->reg1_index);

         available &= use_available;
      }
   }

   return available;
}

/* Using "min_index" returned by get_available_regs(), figure out which
 * registers are killed by a write after or during the current instruction and
 * hence we can't use for spilling. Writes that haven't been scheduled yet
 * should be reflected in live_physregs.
 */

static uint64_t get_killed_regs(sched_ctx *ctx, int min_index)
{
   uint64_t killed = 0;

   list_for_each_entry(gpir_instr, instr, &ctx->block->instr_list, list) {
      if (instr->index <= min_index)
         break;

      for (int slot = GPIR_INSTR_SLOT_STORE0; slot <= GPIR_INSTR_SLOT_STORE3;
           slot++) {
         if (!instr->slots[slot])
            continue;

         gpir_store_node *store = gpir_node_to_store(instr->slots[slot]);
         if (store->node.op != gpir_op_store_reg)
            continue;

         killed |= 1ull << (4 * store->index + store->component);
      }
   }

   return killed;
}

/* Actually spill a node so that it is no longer in the ready list. Note that
 * this must exactly follow the logic of get_available_regs() or else the
 * loads could fail to schedule.
 */

static void spill_node(sched_ctx *ctx, gpir_node *node, gpir_store_node *store)
{
   gpir_node_foreach_succ_safe(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      gpir_node *use = dep->succ;
      gpir_instr *instr = use->sched.instr;

      if (!instr)
         continue;

      if (use->op == gpir_op_mov && instr == ctx->instr) {
         spill_node(ctx, use, store);
      } else {
         gpir_load_node *load = gpir_node_create(ctx->block, gpir_op_load_reg);
         load->index = store->index;
         load->component = store->component;
         list_add(&load->node.list, &ctx->block->node_list);
         gpir_node_replace_child(dep->succ, dep->pred, &load->node);
         gpir_node_replace_pred(dep, &load->node);
         gpir_node_add_dep(&load->node, &store->node, GPIR_DEP_READ_AFTER_WRITE);
         gpir_debug("spilling use %d of node %d to load node %d\n",
                    use->index, node->index, load->node.index);
         ASSERTED bool result = _try_place_node(ctx, use->sched.instr, &load->node);
         assert(result);
      }
   }

   if (node->op == gpir_op_mov) {
      /* We replaced all the uses of the move, so it's dead now. */
      gpir_instr_remove_node(node->sched.instr, node);
      gpir_node_delete(node);
   } else {
      /* We deleted all the uses of the node except the store, so it's not
       * live anymore.
       */
      list_del(&node->list);
      node->sched.inserted = false;
      ctx->ready_list_slots--;
      if (node->sched.max_node) {
         node->sched.max_node = false;
         ctx->instr->alu_num_slot_needed_by_max--;
      }
      if (node->sched.next_max_node) {
         node->sched.next_max_node = false;
         ctx->instr->alu_num_unscheduled_next_max--;
      }
   }
}
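/* Check whether "node" feeds a store node that has been scheduled in the
 * given instruction.
 */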
static bool used_by_store(gpir_node *node, gpir_instr *instr)
{
   gpir_node_foreach_succ(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      if (dep->succ->type == gpir_node_type_store &&
          dep->succ->sched.instr == instr)
         return true;
   }

   return false;
}
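/* If "node" is a complex1 whose result is consumed by a postlog2 node,
 * return that postlog2, otherwise return NULL.
 */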
static gpir_node *consuming_postlog2(gpir_node *node)
{
   if (node->op != gpir_op_complex1)
      return NULL;

   gpir_node_foreach_succ(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;
      if (dep->succ->op == gpir_op_postlog2)
         return dep->succ;
      else
         return NULL;
   }

   return NULL;
}

static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
{
   assert(node->op != gpir_op_mov);

   if (used_by_store(node, ctx->instr))
      return false;

   gpir_debug("trying to spill %d\n", node->index);

   int min_instr = INT_MAX;
   uint64_t available = get_available_regs(ctx, node, &min_instr);
   available &= ~get_killed_regs(ctx, min_instr);

   if (node->sched.physreg_store) {
      gpir_store_node *store = node->sched.physreg_store;
      if (!(available & (1ull << (4 * store->index + store->component))))
         return false;
   } else {
      available &= ~ctx->live_physregs;

      if (available == 0)
         return false;

      /* Don't spill complex1 if it's used by a postlog2; turn the postlog2
       * into a move, replace the complex1 with postlog2 and spill that
       * instead. The store needs a move anyways so the postlog2 is usually
       * free.
       */
      gpir_node *postlog2 = consuming_postlog2(node);
      if (postlog2) {
         postlog2->op = gpir_op_mov;
         node = create_postlog2(ctx, node);
      }

      /* TODO: use a better heuristic for choosing an available register? */
      int physreg = ffsll(available) - 1;

      ctx->live_physregs |= (1ull << physreg);

      gpir_store_node *store = gpir_node_create(ctx->block, gpir_op_store_reg);
      store->index = physreg / 4;
      store->component = physreg % 4;
      store->child = node;
      store->node.sched.max_node = false;
      store->node.sched.next_max_node = false;
      store->node.sched.complex_allowed = false;
      store->node.sched.pos = -1;
      store->node.sched.instr = NULL;
      store->node.sched.inserted = false;
      store->node.sched.dist = node->sched.dist;
      if (node->op == gpir_op_complex1) {
         /* Complex1 cannot be directly stored, and has a latency of 2 */
         store->node.sched.dist += 2;
      }
      node->sched.physreg_store = store;
      gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT);

      list_for_each_entry(gpir_load_node, load,
                          &ctx->physreg_reads[physreg], reg_link) {
         gpir_node_add_dep(&store->node, &load->node, GPIR_DEP_WRITE_AFTER_READ);
         if (load->node.sched.ready) {
            list_del(&load->node.list);
            load->node.sched.ready = false;
         }
      }

      node->sched.ready = false;
      schedule_insert_ready_list(ctx, &store->node);
   }

   gpir_debug("spilling %d to $%d.%c, store %d\n", node->index,
              node->sched.physreg_store->index,
              "xyzw"[node->sched.physreg_store->component],
              node->sched.physreg_store->node.index);

   spill_node(ctx, node, node->sched.physreg_store);

   return true;
}

static bool try_spill_nodes(sched_ctx *ctx, gpir_node *orig_node)
{
   /* First, try to spill max nodes. */
   list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
      if (ctx->max_node_spill_needed <= 0)
         break;

      /* orig_node is the node we're trying to schedule, so spilling it makes
       * no sense. Also don't try to spill any nodes in front of it, since
       * they might be scheduled instead.
       */
      if (node == orig_node)
         break;

      if (node->op == gpir_op_mov) {
         /* Don't try to spill loads, since that only adds another load and
          * store which is likely pointless.
          */
         continue;
      }

      if (!gpir_is_input_node(node) || !node->sched.max_node)
         continue;

      if (try_spill_node(ctx, node)) {
         ctx->max_node_spill_needed--;
         ctx->total_spill_needed--;
      }
   }

   /* Now, try to spill the remaining nodes. */
   list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
      if (ctx->total_spill_needed <= 0)
         break;

      if (node == orig_node)
         break;

      if (node->op == gpir_op_mov)
         continue;

      if (!gpir_is_input_node(node) ||
          !(node->sched.max_node || node->sched.next_max_node))
         continue;

      if (try_spill_node(ctx, node))
         ctx->total_spill_needed--;
   }

   return ctx->total_spill_needed <= 0 && ctx->max_node_spill_needed <= 0;
}
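/* Recount how many value-register slots the ready list currently needs. Used
 * only to cross-check the incrementally maintained ready_list_slots in debug
 * builds.
 */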
static int ASSERTED gpir_get_curr_ready_list_slots(sched_ctx *ctx)
{
   int total = 0;
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      total += gpir_get_slots_required(node);
   }

   return total;
}

/* What gpir_get_min_end() would return if node were replaced with a move
 * instruction not in the complex slot. Normally this is 2 + min_end, except
 * for some store instructions which must have the move node in the same
 * instruction.
 */
static int gpir_get_min_end_as_move(gpir_node *node)
{
   int min = INT_MAX;
   gpir_node_foreach_succ(node, dep) {
      gpir_node *succ = dep->succ;
      if (succ->sched.instr && dep->type == GPIR_DEP_INPUT) {
         switch (succ->op) {
         case gpir_op_store_temp:
         case gpir_op_store_reg:
         case gpir_op_store_varying:
            continue;
         default:
            break;
         }
         if (min > succ->sched.instr->index + 2)
            min = succ->sched.instr->index + 2;
      }
   }
   return min;
}

/* The second source for add0, add1, mul0, and mul1 units cannot be complex.
 * The hardware overwrites the add second sources with 0 and mul second
 * sources with 1. This can be a problem if we need to insert more next-max
 * moves but we only have values that can't use the complex unit for moves.
 *
 * Fortunately, we only need to insert a next-max move if there are more than
 * 5 next-max nodes, but there are only 4 sources in the previous instruction
 * that make values not complex-capable, which means there can be at most 4
 * non-complex-capable values. Hence there will always be at least two values
 * that can be rewritten to use a move in the complex slot. However, we have
 * to be careful not to waste those values by putting both of them in a
 * non-complex slot. This is handled for us by gpir_instr, which will reject
 * such instructions. We just need to tell it which nodes can use complex, and
 * it will do the accounting to figure out what is safe.
 */

static bool can_use_complex(gpir_node *node)
{
   gpir_node_foreach_succ(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      gpir_node *succ = dep->succ;
      if (succ->type != gpir_node_type_alu ||
          !succ->sched.instr)
         continue;

      /* Note: this must be consistent with gpir_codegen_{mul,add}_slot{0,1}
       */
      gpir_alu_node *alu = gpir_node_to_alu(succ);
      switch (alu->node.op) {
      case gpir_op_complex1:
         /* complex1 puts its third source in the fourth slot */
         if (alu->children[1] == node || alu->children[2] == node)
            return false;
         break;
      case gpir_op_complex2:
         /* complex2 has its source duplicated, since it actually takes two
          * sources but we only ever use it with both sources the same. Hence
          * its source can never be the complex slot.
          */
         return false;
      case gpir_op_select:
         /* Select has its sources rearranged */
         if (alu->children[0] == node)
            return false;
         break;
      default:
         assert(alu->num_child <= 2);
         if (alu->num_child == 2 && alu->children[1] == node)
            return false;
         break;
      }
   }

   return true;
}
/* Initialize node->sched.max_node and node->sched.next_max_node for every
 * input node on the ready list. We should only need to do this once per
 * instruction, at the beginning, since we never add max nodes to the ready
 * list.
 */

static void sched_find_max_nodes(sched_ctx *ctx)
{
   ctx->instr->alu_num_unscheduled_next_max = 0;
   ctx->instr->alu_num_slot_needed_by_max = 0;

   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (!gpir_is_input_node(node))
         continue;

      int min_end_move = gpir_get_min_end_as_move(node);
      node->sched.max_node = (min_end_move == ctx->instr->index);
      node->sched.next_max_node = (min_end_move == ctx->instr->index + 1);
      if (node->sched.next_max_node)
         node->sched.complex_allowed = can_use_complex(node);

      if (node->sched.max_node)
         ctx->instr->alu_num_slot_needed_by_max++;
      if (node->sched.next_max_node)
         ctx->instr->alu_num_unscheduled_next_max++;
   }
}

/* Verify the invariants described in gpir.h, as well as making sure the
 * counts are correct.
 */
static void ASSERTED verify_max_nodes(sched_ctx *ctx)
{
   int alu_num_slot_needed_by_max = 0;
   int alu_num_unscheduled_next_max = 0;
   int alu_num_slot_needed_by_store = 0;
   int alu_num_slot_needed_by_non_cplx_store = 0;
   ASSERTED int alu_max_allowed_next_max = 5;

   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (!gpir_is_input_node(node))
         continue;

      if (node->sched.max_node)
         alu_num_slot_needed_by_max++;
      if (node->sched.next_max_node)
         alu_num_unscheduled_next_max++;
      if (used_by_store(node, ctx->instr)) {
         alu_num_slot_needed_by_store++;
         if (node->sched.next_max_node && !node->sched.complex_allowed)
            alu_num_slot_needed_by_non_cplx_store++;
      }
   }

   if (ctx->instr->slots[GPIR_INSTR_SLOT_MUL0] &&
       ctx->instr->slots[GPIR_INSTR_SLOT_MUL0]->op == gpir_op_complex1)
      alu_max_allowed_next_max = 4;

   assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
   assert(ctx->instr->alu_num_unscheduled_next_max == alu_num_unscheduled_next_max);
   assert(ctx->instr->alu_max_allowed_next_max == alu_max_allowed_next_max);
   assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
   assert(ctx->instr->alu_num_slot_needed_by_non_cplx_store ==
          alu_num_slot_needed_by_non_cplx_store);
   assert(ctx->instr->alu_num_slot_free >=
          alu_num_slot_needed_by_store + alu_num_slot_needed_by_max +
          MAX2(alu_num_unscheduled_next_max - alu_max_allowed_next_max, 0));
   assert(ctx->instr->alu_non_cplx_slot_free >=
          alu_num_slot_needed_by_max + alu_num_slot_needed_by_non_cplx_store);
}
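/* Pick the best fully-ready node from the ready list, speculatively checking
 * that it (and any dependent loads) can actually be scheduled, and then
 * commit it. Returns false if nothing could be scheduled, in which case the
 * caller falls back to inserting moves.
 */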
static bool try_node(sched_ctx *ctx)
{
   gpir_node *best_node = NULL;
   int best_score = INT_MIN;

   /* Spilling will delete arbitrary nodes after the current one in the ready
    * list, which means that we always need to look up the next node in the
    * list at the end of each iteration. While list_for_each_entry() works for
    * this purpose, its sanity checking assumes that you don't want to modify
    * the list at all. We know better here, so we have to open-code
    * list_for_each_entry() without the check in order to not assert.
    */
   for (gpir_node *node = LIST_ENTRY(gpir_node, ctx->ready_list.next, list);
        &node->list != &ctx->ready_list;
        node = LIST_ENTRY(gpir_node, node->list.next, list)) {
      if (best_score != INT_MIN) {
         if (node->sched.dist < best_node->sched.dist)
            break;
      }

      if (node->sched.ready) {
         ctx->total_spill_needed = 0;
         ctx->max_node_spill_needed = 0;
         int score = schedule_try_node(ctx, node, true);
         if (score == INT_MIN && !best_node &&
             ctx->total_spill_needed > 0 &&
             try_spill_nodes(ctx, node)) {
            score = schedule_try_node(ctx, node, true);
         }

         /* schedule_first nodes must be scheduled if possible */
         if (gpir_op_infos[node->op].schedule_first && score != INT_MIN) {
            best_node = node;
            best_score = score;
            break;
         }

         if (score > best_score) {
            best_score = score;
            best_node = node;
         }
      }
   }

   if (best_node) {
      gpir_debug("scheduling %d (score = %d)%s\n", best_node->index,
                 best_score, best_node->sched.max_node ? " (max)" : "");
      ASSERTED int score = schedule_try_node(ctx, best_node, false);
      assert(score != INT_MIN);
      return true;
   }

   return false;
}

static void place_move(sched_ctx *ctx, gpir_node *node)
{
   /* For complex1 that is consumed by a postlog2, we cannot allow any moves
    * in between. Convert the postlog2 to a move and insert a new postlog2,
    * and try to schedule it again in try_node().
    */
   gpir_node *postlog2 = consuming_postlog2(node);
   if (postlog2) {
      postlog2->op = gpir_op_mov;
      create_postlog2(ctx, node);
      return;
   }

   gpir_node *move = create_move(ctx, node);
   gpir_node_foreach_succ_safe(move, dep) {
      gpir_node *succ = dep->succ;
      if (!succ->sched.instr ||
          ctx->instr->index < succ->sched.instr->index + gpir_get_min_dist(dep)) {
         gpir_node_replace_pred(dep, node);
         if (dep->type == GPIR_DEP_INPUT)
            gpir_node_replace_child(succ, move, node);
      }
   }
   ASSERTED int score = schedule_try_node(ctx, move, false);
   assert(score != INT_MIN);
}

/* For next-max nodes, not every node can be offloaded to a move in the
 * complex slot. If we run out of non-complex slots, then such nodes cannot
 * have moves placed for them. There should always be sufficient
 * complex-capable nodes so that this isn't a problem. We also disallow moves
 * for schedule_first nodes here.
 */
static bool can_place_move(sched_ctx *ctx, gpir_node *node)
{
   if (gpir_op_infos[node->op].schedule_first)
      return false;

   if (!node->sched.next_max_node)
      return true;

   if (node->sched.complex_allowed)
      return true;

   return ctx->instr->alu_non_cplx_slot_free > 0;
}
static bool sched_move(sched_ctx *ctx)
{
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (node->sched.max_node) {
         place_move(ctx, node);
         return true;
      }
   }

   if (ctx->instr->alu_num_slot_needed_by_store > 0) {
      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
         if (used_by_store(node, ctx->instr)) {
            place_move(ctx, node);
            /* If we have a store of a load, then we need to make sure that we
             * immediately schedule the dependent load, or create a move
             * instruction for it, like we would with a normal instruction.
             * The rest of the code isn't set up to handle load nodes in the
             * ready list -- see the comments in _schedule_try_node().
             */
            if (node->type == gpir_node_type_load) {
               if (!schedule_try_place_node(ctx, node, false)) {
                  create_move(ctx, node);
               }
            }
            return true;
         }
      }
   }

   /* complex1 is a bit of a special case, since it has a latency of 2 cycles.
    * Once it is fully ready, we need to group all its uses in the same
    * instruction, and then we need to avoid creating any moves in the next
    * cycle in order to get it scheduled. Failing to do any of these things
    * could result in a cycle penalty, or even worse, an infinite loop of
    * inserting moves. If it is a next-max node and ready, then it has a use
    * in the previous cycle. If it has a use in the current cycle as well,
    * then we want to insert a move node to make it ready in two cycles -- if
    * we don't, then there will be at least a one cycle penalty. Otherwise, it
    * will be ready next cycle, and we shouldn't insert a move node, or else
    * we'll also have a one cycle penalty.
    */
   if (ctx->instr->alu_num_slot_free > 0) {
      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
         if (!can_place_move(ctx, node))
            continue;

         if (node->sched.next_max_node && node->op == gpir_op_complex1 &&
             node->sched.ready) {
            bool skip = true;
            gpir_node_foreach_succ(node, dep) {
               if (dep->type != GPIR_DEP_INPUT)
                  continue;

               gpir_node *succ = dep->succ;

               if (!succ->sched.instr ||
                   succ->sched.instr->index != ctx->instr->index - 1) {
                  skip = false;
                  break;
               }
            }

            if (skip)
               continue;

            place_move(ctx, node);
            return true;
         }
      }
   }

   /* Once we've made all the required moves, we're free to use any extra
    * slots to schedule more moves for next max nodes. Besides sometimes being
    * necessary, this can free up extra space in the next instruction. We walk
    * from back to front so that we pick nodes less likely to be scheduled
    * next first -- an extra move would be unnecessary there. But make sure
    * not to pick up the complex1 case handled above.
    */
   if (ctx->instr->alu_num_slot_free > 0) {
      list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) {
         if (!can_place_move(ctx, node))
            continue;

         if (node->sched.next_max_node &&
             !(node->op == gpir_op_complex1 && node->sched.ready)) {
            place_move(ctx, node);
            return true;
         }
      }
   }

   /* We may have skipped complex1 above, but if we run out of space, we still
    * need to insert the move.
    */

   if (ctx->instr->alu_num_unscheduled_next_max >
       ctx->instr->alu_max_allowed_next_max) {
      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
         if (!can_place_move(ctx, node))
            continue;

         if (node->sched.next_max_node) {
            place_move(ctx, node);
            return true;
         }
      }
   }

   return false;
}
static bool gpir_sched_instr_pass(sched_ctx *ctx)
{
   if (try_node(ctx))
      return true;

   if (sched_move(ctx))
      return true;

   return false;
}

static void schedule_print_pre_one_instr(sched_ctx *ctx)
{
   if (!(lima_debug & LIMA_DEBUG_GP))
      return;

   printf("instr %d for ready list:", ctx->instr->index);
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      printf(" %d/%c (%d, %d, %s)", node->index, node->sched.ready ? 'r' : 'p',
             node->sched.dist, gpir_get_slots_required(node),
             node->sched.max_node ? "max" : (node->sched.next_max_node ? "next" : "none"));
   }
   printf("\nlive physregs: ");
   for (unsigned i = 0; i < 16; i++) {
      if (ctx->live_physregs & (0xfull << (4 * i))) {
         printf("$%d.", i);
         for (unsigned j = 0; j < 4; j++) {
            if (ctx->live_physregs & (1ull << (4 * i + j)))
               printf("%c", "xyzw"[j]);
         }
         printf(" ");
      }
   }
   printf("\n");
}

static void schedule_print_post_one_instr(gpir_instr *instr)
{
   if (!(lima_debug & LIMA_DEBUG_GP))
      return;

   printf("post schedule instr");
   for (int i = 0; i < GPIR_INSTR_SLOT_NUM; i++) {
      if (instr->slots[i])
         printf(" %d/%d", i, instr->slots[i]->index);
   }
   printf("\n");
}


static bool schedule_one_instr(sched_ctx *ctx)
{
   gpir_instr *instr = gpir_instr_create(ctx->block);
   if (unlikely(!instr))
      return false;

   ctx->instr = instr;

   sched_find_max_nodes(ctx);
   schedule_print_pre_one_instr(ctx);

   while (gpir_sched_instr_pass(ctx)) {
      assert(ctx->ready_list_slots == gpir_get_curr_ready_list_slots(ctx));
#ifndef NDEBUG
      verify_max_nodes(ctx);
      verify_ready_list(ctx);
#endif
   }

   schedule_print_post_one_instr(instr);
   return true;
}
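/* Schedule a single basic block: seed the ready list with the root nodes,
 * then emit instructions one at a time until the ready list is empty.
 */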
static bool schedule_block(gpir_block *block)
{
   /* calculate distance */
   list_for_each_entry(gpir_node, node, &block->node_list, list) {
      if (gpir_node_is_root(node))
         schedule_update_distance(node);
   }

   sched_ctx ctx;
   list_inithead(&ctx.ready_list);
   ctx.block = block;
   ctx.ready_list_slots = 0;
   ctx.live_physregs = block->live_out_phys;

   for (unsigned i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) {
      list_inithead(&ctx.physreg_reads[i]);
   }

   /* construct the ready list from root nodes */
   list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
      /* Add to physreg_reads */
      if (node->op == gpir_op_load_reg) {
         gpir_load_node *load = gpir_node_to_load(node);
         unsigned index = 4 * load->index + load->component;
         list_addtail(&load->reg_link, &ctx.physreg_reads[index]);
      }

      if (gpir_node_is_root(node))
         schedule_insert_ready_list(&ctx, node);
   }

   list_inithead(&block->node_list);
   while (!list_is_empty(&ctx.ready_list)) {
      if (!schedule_one_instr(&ctx))
         return false;
   }

   return true;
}

static void add_fake_dep(gpir_node *node, gpir_node *dep_node,
                         gpir_node *last_written[])
{
   gpir_node_foreach_pred(node, dep) {
      if (dep->type == GPIR_DEP_INPUT) {
         int index = dep->pred->value_reg;
         if (index >= 0 && last_written[index]) {
            gpir_node_add_dep(last_written[index], dep_node,
                              GPIR_DEP_WRITE_AFTER_READ);
         }
         if (gpir_op_infos[dep->pred->op].schedule_first) {
            /* Insert fake dependencies for any schedule_first children on
             * this node as well. This guarantees that as soon as
             * "dep_node" is ready to schedule, all of its schedule_first
             * children, grandchildren, etc. are ready so that they can be
             * scheduled as soon as possible.
             */
            add_fake_dep(dep->pred, dep_node, last_written);
         }
      }
   }
}

static void schedule_build_dependency(gpir_block *block)
{
   gpir_node *last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0};

   /* merge dummy_f/m nodes into the node they were created from */
   list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
      if (node->op == gpir_op_dummy_m) {
         gpir_alu_node *alu = gpir_node_to_alu(node);
         gpir_node *origin = alu->children[0];
         gpir_node *dummy_f = alu->children[1];

         gpir_node_foreach_succ(node, dep) {
            gpir_node *succ = dep->succ;
            /* origin and node may have same succ (by VREG/INPUT or
             * VREG/VREG dep), so use gpir_node_add_dep() instead of
             * gpir_node_replace_pred() */
            gpir_node_add_dep(succ, origin, dep->type);
            gpir_node_replace_child(succ, node, origin);
         }
         gpir_node_delete(dummy_f);
         gpir_node_delete(node);
      }
   }

   memset(last_written, 0, sizeof(last_written));

   /* False dependencies. For value registers, these exist only to make sure
    * that the maximum pressure isn't exceeded and are hence "fake".
    */
   list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
      if (node->op == gpir_op_load_reg) {
         gpir_load_node *load = gpir_node_to_load(node);
         unsigned index = 4 * load->index + load->component;
         if (last_written[index]) {
            gpir_node_add_dep(last_written[index], node, GPIR_DEP_WRITE_AFTER_READ);
         }
      } else if (node->op == gpir_op_store_reg) {
         gpir_store_node *store = gpir_node_to_store(node);
         unsigned index = 4 * store->index + store->component;
         last_written[index] = node;
      } else {
         add_fake_dep(node, node, last_written);
      }

      if (node->value_reg >= 0)
         last_written[node->value_reg] = node;
   }
}
static void print_statistic(gpir_compiler *comp, int save_index)
{
   int num_nodes[gpir_op_num] = {0};
   int num_created_nodes[gpir_op_num] = {0};

   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      list_for_each_entry(gpir_node, node, &block->node_list, list) {
         num_nodes[node->op]++;
         if (node->index >= save_index)
            num_created_nodes[node->op]++;
      }
   }

   printf("====== gpir scheduler statistic ======\n");
   printf("---- how many nodes are scheduled ----\n");
   int n = 0, l = 0;
   for (int i = 0; i < gpir_op_num; i++) {
      if (num_nodes[i]) {
         printf("%10s:%-6d", gpir_op_infos[i].name, num_nodes[i]);
         n += num_nodes[i];
         if (!(++l % 4))
            printf("\n");
      }
   }
   if (l % 4)
      printf("\n");
   printf("\ntotal: %d\n", n);

   printf("---- how many nodes are created ----\n");
   n = l = 0;
   for (int i = 0; i < gpir_op_num; i++) {
      if (num_created_nodes[i]) {
         printf("%10s:%-6d", gpir_op_infos[i].name, num_created_nodes[i]);
         n += num_created_nodes[i];
         if (!(++l % 4))
            printf("\n");
      }
   }
   if (l % 4)
      printf("\n");
   printf("\ntotal: %d\n", n);
   printf("------------------------------------\n");
}

bool gpir_schedule_prog(gpir_compiler *comp)
{
   int save_index = comp->cur_index;

   /* init schedule info */
   int index = 0;
   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      block->sched.instr_index = 0;
      list_for_each_entry(gpir_node, node, &block->node_list, list) {
         node->sched.instr = NULL;
         node->sched.pos = -1;
         node->sched.index = index++;
         node->sched.dist = -1;
         /* TODO when we support multiple basic blocks, we need a way to keep
          * track of this for physregs allocated before the scheduler.
          */
         node->sched.physreg_store = NULL;
         node->sched.ready = false;
         node->sched.inserted = false;
         node->sched.complex_allowed = false;
         node->sched.max_node = false;
         node->sched.next_max_node = false;
      }
   }

   /* build dependency */
   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      schedule_build_dependency(block);
   }

   //gpir_debug("after scheduler build reg dependency\n");
   //gpir_node_print_prog_dep(comp);

   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      if (!schedule_block(block)) {
         gpir_error("fail schedule block\n");
         return false;
      }
   }

   if (lima_debug & LIMA_DEBUG_GP) {
      print_statistic(comp, save_index);
      gpir_instr_print_prog(comp);
   }

   return true;
}