Path: blob/21.2-virgl/src/panfrost/midgard/midgard_schedule.c
/*
 * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]>
 * Copyright (C) 2019-2020 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"
#include "midgard_ops.h"
#include "midgard_quirks.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/half_float.h"

/* Scheduling for Midgard is complicated, to say the least. ALU instructions
 * must be grouped into VLIW bundles according to the following model:
 *
 *    [VMUL] [SADD]
 *    [VADD] [SMUL] [VLUT]
 *
 * A given instruction can execute on some subset of the units (or a few can
 * execute on all). Instructions can be either vector or scalar; only scalar
 * instructions can execute on SADD/SMUL units. Units on a given line execute
 * in parallel. Subsequent lines execute separately and can pass results
 * directly via pipeline registers r24/r25, bypassing the register file.
 *
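 * For illustration, a single bundle could pair (hypothetical operands, shown
 * only as a sketch of the pipeline-register path described above):
 *
 *    vmul: fmul r24,  t0, t1
 *    vadd: fadd  t2, r24, t3
 *
 * where the second-line vadd consumes the first-line vmul result through the
 * r24 pipeline register rather than through the register file.
 *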
 * A bundle can optionally have 128-bits of embedded constants, shared across
 * all of the instructions within a bundle.
 *
 * Instructions consuming conditionals (branches and conditional selects)
 * require their condition to be written into the conditional register (r31)
 * within the same bundle they are consumed.
 *
 * Fragment writeout requires its argument to be written in full within the
 * same bundle as the branch, with no hanging dependencies.
 *
 * Load/store instructions are also bundled, in pairs of at most two
 * instructions, while texture instructions are not bundled at all.
 *
 * -------------------------------------------------------------------------
 *
 */

/* We create the dependency graph with per-byte granularity */

#define BYTE_COUNT 16

static void
add_dependency(struct util_dynarray *table, unsigned index, uint16_t mask, midgard_instruction **instructions, unsigned child)
{
        for (unsigned i = 0; i < BYTE_COUNT; ++i) {
                if (!(mask & (1 << i)))
                        continue;

                struct util_dynarray *parents = &table[(BYTE_COUNT * index) + i];

                util_dynarray_foreach(parents, unsigned, parent) {
                        BITSET_WORD *dependents = instructions[*parent]->dependents;

                        /* Already have the dependency */
                        if (BITSET_TEST(dependents, child))
                                continue;

                        BITSET_SET(dependents, child);
                        instructions[child]->nr_dependencies++;
                }
        }
}

static void
mark_access(struct util_dynarray *table, unsigned index, uint16_t mask, unsigned parent)
{
        for (unsigned i = 0; i < BYTE_COUNT; ++i) {
                if (!(mask & (1 << i)))
                        continue;

                util_dynarray_append(&table[(BYTE_COUNT * index) + i], unsigned, parent);
        }
}

static void
mir_create_dependency_graph(midgard_instruction **instructions, unsigned count, unsigned node_count)
{
        size_t sz = node_count * BYTE_COUNT;

        struct util_dynarray *last_read = calloc(sizeof(struct util_dynarray), sz);
        struct util_dynarray *last_write = calloc(sizeof(struct util_dynarray), sz);

        for (unsigned i = 0; i < sz; ++i) {
                util_dynarray_init(&last_read[i], NULL);
                util_dynarray_init(&last_write[i], NULL);
        }

        /* Initialize dependency graph */
        for (unsigned i = 0; i < count; ++i) {
                instructions[i]->dependents =
                        calloc(BITSET_WORDS(count), sizeof(BITSET_WORD));

                instructions[i]->nr_dependencies = 0;
        }

        unsigned prev_ldst[3] = {~0, ~0, ~0};

        /* Populate dependency graph */
        for (signed i = count - 1; i >= 0; --i) {
                if (instructions[i]->compact_branch)
                        continue;

                unsigned dest = instructions[i]->dest;
                unsigned mask = mir_bytemask(instructions[i]);

                mir_foreach_src((*instructions), s) {
                        unsigned src = instructions[i]->src[s];

                        if (src < node_count) {
                                unsigned readmask = mir_bytemask_of_read_components(instructions[i], src);
                                add_dependency(last_write, src, readmask, instructions, i);
                        }
                }

                /* Create a list of dependencies for each type of load/store
                 * instruction to prevent reordering. */
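                /* Concretely (per the switch below): accesses classified as
                 * LDST_SHARED serialize against the previous shared access
                 * (prev_ldst[0]), LDST_SCRATCH against the previous scratch
                 * access (prev_ldst[1]), and every other address-taking
                 * load/store against the previous such access (prev_ldst[2]).
                 * Accesses in different classes may still reorder freely. */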
                if (instructions[i]->type == TAG_LOAD_STORE_4 &&
                    load_store_opcode_props[instructions[i]->op].props & LDST_ADDRESS) {

                        unsigned type = instructions[i]->load_store.arg_reg |
                                        instructions[i]->load_store.arg_comp;

                        unsigned idx;
                        switch (type) {
                        case LDST_SHARED: idx = 0; break;
                        case LDST_SCRATCH: idx = 1; break;
                        default: idx = 2; break;
                        }

                        unsigned prev = prev_ldst[idx];

                        if (prev != ~0) {
                                BITSET_WORD *dependents = instructions[prev]->dependents;

                                /* Already have the dependency */
                                if (BITSET_TEST(dependents, i))
                                        continue;

                                BITSET_SET(dependents, i);
                                instructions[i]->nr_dependencies++;
                        }

                        prev_ldst[idx] = i;
                }

                if (dest < node_count) {
                        add_dependency(last_read, dest, mask, instructions, i);
                        add_dependency(last_write, dest, mask, instructions, i);
                        mark_access(last_write, dest, mask, i);
                }

                mir_foreach_src((*instructions), s) {
                        unsigned src = instructions[i]->src[s];

                        if (src < node_count) {
                                unsigned readmask = mir_bytemask_of_read_components(instructions[i], src);
                                mark_access(last_read, src, readmask, i);
                        }
                }
        }

        /* If there is a branch, all instructions depend on it, as interblock
         * execution must be purely in-order */

        if (instructions[count - 1]->compact_branch) {
                BITSET_WORD *dependents = instructions[count - 1]->dependents;

                for (signed i = count - 2; i >= 0; --i) {
                        if (BITSET_TEST(dependents, i))
                                continue;

                        BITSET_SET(dependents, i);
                        instructions[i]->nr_dependencies++;
                }
        }

        /* Free the intermediate structures */
        for (unsigned i = 0; i < sz; ++i) {
                util_dynarray_fini(&last_read[i]);
                util_dynarray_fini(&last_write[i]);
        }

        free(last_read);
        free(last_write);
}

/* Does the mask cover more than a scalar? */

static bool
is_single_component_mask(unsigned mask)
{
        int components = 0;

        for (int c = 0; c < 8; ++c) {
                if (mask & (1 << c))
                        components++;
        }

        return components == 1;
}

/* Helpers for scheduling */

static bool
mir_is_scalar(midgard_instruction *ains)
{
        /* Do we try to use it as a vector op? */
        if (!is_single_component_mask(ains->mask))
                return false;

        /* Otherwise, check mode hazards */
        bool could_scalar = true;
        unsigned szd = nir_alu_type_get_type_size(ains->dest_type);
        unsigned sz0 = nir_alu_type_get_type_size(ains->src_types[0]);
        unsigned sz1 = nir_alu_type_get_type_size(ains->src_types[1]);

        /* Only 16/32-bit can run on a scalar unit */
        could_scalar &= (szd == 16) || (szd == 32);

        if (ains->src[0] != ~0)
                could_scalar &= (sz0 == 16) || (sz0 == 32);

        if (ains->src[1] != ~0)
                could_scalar &= (sz1 == 16) || (sz1 == 32);

        if (midgard_is_integer_out_op(ains->op) && ains->outmod != midgard_outmod_keeplo)
                return false;

        return could_scalar;
}
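/* Illustrative reading of the check above (example values): a 32-bit move
 * writing a single component (mask 0x1) qualifies for SADD/SMUL, while the
 * same op with mask 0x3, or with a 64-bit or 8-bit type, must stay on a
 * vector unit. */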
/* How many bytes does this ALU instruction add to the bundle? */

static unsigned
bytes_for_instruction(midgard_instruction *ains)
{
        if (ains->unit & UNITS_ANY_VECTOR)
                return sizeof(midgard_reg_info) + sizeof(midgard_vector_alu);
        else if (ains->unit == ALU_ENAB_BRANCH)
                return sizeof(midgard_branch_extended);
        else if (ains->compact_branch)
                return sizeof(uint16_t);
        else
                return sizeof(midgard_reg_info) + sizeof(midgard_scalar_alu);
}

/* We would like to flatten the linked list of midgard_instructions in a bundle
 * to an array of pointers on the heap for easy indexing */

static midgard_instruction **
flatten_mir(midgard_block *block, unsigned *len)
{
        *len = list_length(&block->base.instructions);

        if (!(*len))
                return NULL;

        midgard_instruction **instructions =
                calloc(sizeof(midgard_instruction *), *len);

        unsigned i = 0;

        mir_foreach_instr_in_block(block, ins)
                instructions[i++] = ins;

        return instructions;
}

/* The worklist is the set of instructions that can be scheduled now; that is,
 * the set of instructions with no remaining dependencies */

static void
mir_initialize_worklist(BITSET_WORD *worklist, midgard_instruction **instructions, unsigned count)
{
        for (unsigned i = 0; i < count; ++i) {
                if (instructions[i]->nr_dependencies == 0)
                        BITSET_SET(worklist, i);
        }
}

/* Update the worklist after an instruction terminates. Remove its edges from
 * the graph and if that causes any node to have no dependencies, add it to the
 * worklist */

static void
mir_update_worklist(
                BITSET_WORD *worklist, unsigned count,
                midgard_instruction **instructions, midgard_instruction *done)
{
        /* Sanity check: if no instruction terminated, there is nothing to do.
         * If the instruction that terminated had dependencies, that makes no
         * sense and means we messed up the worklist. Finally, as the purpose
         * of this routine is to update dependents, we abort early if there are
         * no dependents defined. */

        if (!done)
                return;

        assert(done->nr_dependencies == 0);

        if (!done->dependents)
                return;

        /* We have an instruction with dependents. Iterate each dependent to
         * remove one dependency (`done`), adding dependents to the worklist
         * where possible. */

        unsigned i;
        BITSET_FOREACH_SET(i, done->dependents, count) {
                assert(instructions[i]->nr_dependencies);

                if (!(--instructions[i]->nr_dependencies))
                        BITSET_SET(worklist, i);
        }

        free(done->dependents);
}

/* While scheduling, we need to choose instructions satisfying certain
 * criteria. As we schedule backwards, we choose the *last* instruction in the
 * worklist to simulate in-order scheduling. Chosen instructions must satisfy a
 * given predicate. */

struct midgard_predicate {
        /* TAG or ~0 for dont-care */
        unsigned tag;

        /* True if we want to pop off the chosen instruction */
        bool destructive;

        /* For ALU, choose only this unit */
        unsigned unit;

        /* State for bundle constants. constants is the actual constants
         * for the bundle. constant_mask marks which of the 16 constant
         * bytes are currently in use. When picking in destructive mode,
         * the constants array will be updated, and the instruction
         * will be adjusted to index into the constants array */

        midgard_constants *constants;
        unsigned constant_mask;

        /* Exclude this destination (if not ~0) */
        unsigned exclude;

        /* Don't schedule instructions consuming conditionals (since we already
         * scheduled one). Excludes conditional branches and csel */
        bool no_cond;

        /* Require (or reject) a minimal mask and (if nonzero) given
         * destination. Used for writeout optimizations */

        unsigned mask;
        unsigned no_mask;
        unsigned dest;

        /* Whether to not-care/only/never schedule imov/fmov instructions. This
         * allows non-move instructions to get priority on each unit */
        unsigned move_mode;

        /* For load/store: how many pipeline registers are in use? The two
         * scheduled instructions cannot use more than the 256-bits of pipeline
         * space available or RA will fail (as it would run out of pipeline
         * registers and fail to spill without breaking the schedule) */

        unsigned pipeline_count;
};
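/* Rough usage sketch (mirroring mir_choose_bundle/mir_choose_alu below): a
 * caller zero-initializes a predicate, sets e.g. .tag = TAG_ALU_4,
 * .unit = UNIT_SMUL, .destructive = true, .exclude = ~0, and then asks
 * mir_choose_instruction() for the best match. In destructive mode the chosen
 * instruction is also popped from the worklist and its constants are folded
 * into the bundle's constant slot. */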
static bool
mir_adjust_constant(midgard_instruction *ins, unsigned src,
                unsigned *bundle_constant_mask,
                unsigned *comp_mapping,
                uint8_t *bundle_constants,
                bool upper)
{
        unsigned type_size = nir_alu_type_get_type_size(ins->src_types[src]) / 8;
        unsigned type_shift = util_logbase2(type_size);
        unsigned max_comp = mir_components_for_type(ins->src_types[src]);
        unsigned comp_mask = mir_from_bytemask(mir_round_bytemask_up(
                                mir_bytemask_of_read_components_index(ins, src),
                                type_size * 8),
                                type_size * 8);
        unsigned type_mask = (1 << type_size) - 1;

        /* Upper only makes sense for 16-bit */
        if (type_size != 2 && upper)
                return false;

        /* For 16-bit, we need to stay on either upper or lower halves to avoid
         * disrupting the swizzle */
        unsigned start = upper ? 8 : 0;
        unsigned length = (type_size == 2) ? 8 : 16;

        for (unsigned comp = 0; comp < max_comp; comp++) {
                if (!(comp_mask & (1 << comp)))
                        continue;

                uint8_t *constantp = ins->constants.u8 + (type_size * comp);
                unsigned best_reuse_bytes = 0;
                signed best_place = -1;
                unsigned i, j;

                for (i = start; i < (start + length); i += type_size) {
                        unsigned reuse_bytes = 0;

                        for (j = 0; j < type_size; j++) {
                                if (!(*bundle_constant_mask & (1 << (i + j))))
                                        continue;
                                if (constantp[j] != bundle_constants[i + j])
                                        break;
                                if ((i + j) > (start + length))
                                        break;

                                reuse_bytes++;
                        }

                        /* Select the place where existing bytes can be
                         * reused so we leave empty slots to others
                         */
                        if (j == type_size &&
                            (reuse_bytes > best_reuse_bytes || best_place < 0)) {
                                best_reuse_bytes = reuse_bytes;
                                best_place = i;
                                break;
                        }
                }

                /* This component couldn't fit in the remaining constant slots,
                 * so there is no need to check the remaining components; bail
                 * out now
                 */
                if (best_place < 0)
                        return false;

                memcpy(&bundle_constants[i], constantp, type_size);
                *bundle_constant_mask |= type_mask << best_place;
                comp_mapping[comp] = best_place >> type_shift;
        }

        return true;
}
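/* Worked example (hypothetical values): if the bundle already holds the
 * 32-bit constant 0x3f800000 in bytes 0-3, a second instruction reading the
 * same value is mapped onto those bytes via comp_mapping instead of consuming
 * four fresh bytes of the 16-byte embedded-constant slot. */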
/* For an instruction that can fit, adjust it to fit and update the constants
 * array, in destructive mode. Returns whether the fitting was successful. */

static bool
mir_adjust_constants(midgard_instruction *ins,
                struct midgard_predicate *pred,
                bool destructive)
{
        /* No constant, nothing to adjust */
        if (!ins->has_constants)
                return true;

        unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
        unsigned bundle_constant_mask = pred->constant_mask;
        unsigned comp_mapping[2][16] = { };
        uint8_t bundle_constants[16];

        memcpy(bundle_constants, pred->constants, 16);

        /* Let's try to find a place for each active component of the constant
         * register.
         */
        for (unsigned src = 0; src < 2; ++src) {
                if (ins->src[src] != SSA_FIXED_REGISTER(REGISTER_CONSTANT))
                        continue;

                /* First, try lower half (or whole for !16) */
                if (mir_adjust_constant(ins, src, &bundle_constant_mask,
                                        comp_mapping[src], bundle_constants, false))
                        continue;

                /* Next, try upper half */
                if (mir_adjust_constant(ins, src, &bundle_constant_mask,
                                        comp_mapping[src], bundle_constants, true))
                        continue;

                /* Otherwise bail */
                return false;
        }

        /* If non-destructive, we're done */
        if (!destructive)
                return true;

        /* Otherwise update the constant_mask and constant values */
        pred->constant_mask = bundle_constant_mask;
        memcpy(pred->constants, bundle_constants, 16);

        /* Use comp_mapping as a swizzle */
        mir_foreach_src(ins, s) {
                if (ins->src[s] == r_constant)
                        mir_compose_swizzle(ins->swizzle[s], comp_mapping[s], ins->swizzle[s]);
        }

        return true;
}

/* Conservative estimate of the pipeline registers required for load/store */

static unsigned
mir_pipeline_count(midgard_instruction *ins)
{
        unsigned bytecount = 0;

        mir_foreach_src(ins, i) {
                /* Skip empty source */
                if (ins->src[i] == ~0) continue;

                if (i == 0) {
                        /* First source is a vector, worst-case the mask */
                        unsigned bytemask = mir_bytemask_of_read_components_index(ins, i);
                        unsigned max = util_logbase2(bytemask) + 1;
                        bytecount += max;
                } else {
                        /* Sources 1 on are scalars */
                        bytecount += 4;
                }
        }

        unsigned dwords = DIV_ROUND_UP(bytecount, 16);
        assert(dwords <= 2);

        return dwords;
}

/* Matches FADD x, x with modifiers compatible. Since x + x = x * 2, for
 * any x including those of the form f(y) for some swizzle/abs/neg function f */

static bool
mir_is_add_2(midgard_instruction *ins)
{
        if (ins->op != midgard_alu_op_fadd)
                return false;

        if (ins->src[0] != ins->src[1])
                return false;

        if (ins->src_types[0] != ins->src_types[1])
                return false;

        for (unsigned i = 0; i < MIR_VEC_COMPONENTS; ++i) {
                if (ins->swizzle[0][i] != ins->swizzle[1][i])
                        return false;
        }

        if (ins->src_abs[0] != ins->src_abs[1])
                return false;

        if (ins->src_neg[0] != ins->src_neg[1])
                return false;

        return true;
}

static void
mir_adjust_unit(midgard_instruction *ins, unsigned unit)
{
        /* FADD x, x = FMUL x, #2 */
        if (mir_is_add_2(ins) && (unit & (UNITS_MUL | UNIT_VLUT))) {
                ins->op = midgard_alu_op_fmul;

                ins->src[1] = ~0;
                ins->src_abs[1] = false;
                ins->src_neg[1] = false;

                ins->has_inline_constant = true;
                ins->inline_constant = _mesa_float_to_half(2.0);
        }
}

static unsigned
mir_has_unit(midgard_instruction *ins, unsigned unit)
{
        if (alu_opcode_props[ins->op].props & unit)
                return true;

        /* FADD x, x can run on any adder or any multiplier */
        if (mir_is_add_2(ins))
                return true;

        return false;
}

/* Net change in liveness if an instruction were scheduled. Loosely based on
 * ir3's scheduler. */
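/* For instance (made-up numbers): scheduling a vec4 32-bit add whose
 * destination is fully live frees 16 bytes, while its two distinct vec4
 * sources, if not already live, add 16 bytes each, for a net effect of +16.
 * Negative values therefore mark pressure-reducing choices, which the chooser
 * below prefers. */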
static int
mir_live_effect(uint16_t *liveness, midgard_instruction *ins, bool destructive)
{
        /* TODO: what if dest is used multiple times? */
        int free_live = 0;

        if (ins->dest < SSA_FIXED_MINIMUM) {
                unsigned bytemask = mir_bytemask(ins);
                bytemask = util_next_power_of_two(bytemask + 1) - 1;
                free_live += util_bitcount(liveness[ins->dest] & bytemask);

                if (destructive)
                        liveness[ins->dest] &= ~bytemask;
        }

        int new_live = 0;

        mir_foreach_src(ins, s) {
                unsigned S = ins->src[s];

                bool dupe = false;

                for (unsigned q = 0; q < s; ++q)
                        dupe |= (ins->src[q] == S);

                if (dupe)
                        continue;

                if (S < SSA_FIXED_MINIMUM) {
                        unsigned bytemask = mir_bytemask_of_read_components(ins, S);
                        bytemask = util_next_power_of_two(bytemask + 1) - 1;

                        /* Count only the new components */
                        new_live += util_bitcount(bytemask & ~(liveness[S]));

                        if (destructive)
                                liveness[S] |= bytemask;
                }
        }

        return new_live - free_live;
}

static midgard_instruction *
mir_choose_instruction(
                midgard_instruction **instructions,
                uint16_t *liveness,
                BITSET_WORD *worklist, unsigned count,
                struct midgard_predicate *predicate)
{
        /* Parse the predicate */
        unsigned tag = predicate->tag;
        unsigned unit = predicate->unit;
        bool scalar = (unit != ~0) && (unit & UNITS_SCALAR);
        bool no_cond = predicate->no_cond;

        unsigned mask = predicate->mask;
        unsigned dest = predicate->dest;
        bool needs_dest = mask & 0xF;

        /* Iterate to find the best instruction satisfying the predicate */
        unsigned i;

        signed best_index = -1;
        signed best_effect = INT_MAX;
        bool best_conditional = false;

        /* Enforce a simple metric limiting distance to keep down register
         * pressure. TODO: replace with liveness tracking for much better
         * results */

        unsigned max_active = 0;
        unsigned max_distance = 36;

#ifndef NDEBUG
        /* Force in-order scheduling */
        if (midgard_debug & MIDGARD_DBG_INORDER)
                max_distance = 1;
#endif

        BITSET_FOREACH_SET(i, worklist, count) {
                max_active = MAX2(max_active, i);
        }

        BITSET_FOREACH_SET(i, worklist, count) {
                if ((max_active - i) >= max_distance)
                        continue;

                if (tag != ~0 && instructions[i]->type != tag)
                        continue;

                bool alu = (instructions[i]->type == TAG_ALU_4);
                bool ldst = (instructions[i]->type == TAG_LOAD_STORE_4);

                bool branch = alu && (unit == ALU_ENAB_BR_COMPACT);
                bool is_move = alu &&
                        (instructions[i]->op == midgard_alu_op_imov ||
                         instructions[i]->op == midgard_alu_op_fmov);

                if (predicate->exclude != ~0 && instructions[i]->dest == predicate->exclude)
                        continue;

                if (alu && !branch && unit != ~0 && !(mir_has_unit(instructions[i], unit)))
                        continue;

                /* 0: don't care, 1: no moves, 2: only moves */
                if (predicate->move_mode && ((predicate->move_mode - 1) != is_move))
                        continue;

                if (branch && !instructions[i]->compact_branch)
                        continue;

                if (alu && scalar && !mir_is_scalar(instructions[i]))
                        continue;

                if (alu && predicate->constants && !mir_adjust_constants(instructions[i], predicate, false))
                        continue;

                if (needs_dest && instructions[i]->dest != dest)
                        continue;

                if (mask && ((~instructions[i]->mask) & mask))
                        continue;

                if (instructions[i]->mask & predicate->no_mask)
                        continue;

                if (ldst && mir_pipeline_count(instructions[i]) + predicate->pipeline_count > 2)
                        continue;

                bool conditional = alu && !branch && OP_IS_CSEL(instructions[i]->op);
                conditional |= (branch && instructions[i]->branch.conditional);

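                /* Conditions are consumed from r31 within the same bundle, so
                 * once the predicate's no_cond is set (e.g. after a csel or
                 * conditional branch has already been picked, or for writeout)
                 * further condition consumers are rejected here. */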
                if (conditional && no_cond)
                        continue;

                int effect = mir_live_effect(liveness, instructions[i], false);

                if (effect > best_effect)
                        continue;

                if (effect == best_effect && (signed) i < best_index)
                        continue;

                best_effect = effect;
                best_index = i;
                best_conditional = conditional;
        }

        /* Did we find anything? */

        if (best_index < 0)
                return NULL;

        /* If we found something, remove it from the worklist */
        assert(best_index < count);
        midgard_instruction *I = instructions[best_index];

        if (predicate->destructive) {
                BITSET_CLEAR(worklist, best_index);

                if (I->type == TAG_ALU_4)
                        mir_adjust_constants(instructions[best_index], predicate, true);

                if (I->type == TAG_LOAD_STORE_4)
                        predicate->pipeline_count += mir_pipeline_count(instructions[best_index]);

                if (I->type == TAG_ALU_4)
                        mir_adjust_unit(instructions[best_index], unit);

                /* Once we schedule a conditional, we can't again */
                predicate->no_cond |= best_conditional;
                mir_live_effect(liveness, instructions[best_index], true);
        }

        return I;
}

/* Still, we don't choose instructions in a vacuum. We need a way to choose the
 * best bundle type (ALU, load/store, texture). Nondestructive. */

static unsigned
mir_choose_bundle(
                midgard_instruction **instructions,
                uint16_t *liveness,
                BITSET_WORD *worklist, unsigned count,
                unsigned num_ldst)
{
        /* At the moment, our algorithm is very simple - use the bundle of the
         * best instruction, regardless of what else could be scheduled
         * alongside it. This is not optimal but it works okay for in-order */

        struct midgard_predicate predicate = {
                .tag = ~0,
                .unit = ~0,
                .destructive = false,
                .exclude = ~0
        };

        midgard_instruction *chosen = mir_choose_instruction(instructions, liveness, worklist, count, &predicate);

        if (chosen && chosen->type == TAG_LOAD_STORE_4 && !(num_ldst % 2)) {
                /* Try to schedule load/store ops in pairs */

                predicate.exclude = chosen->dest;
                predicate.tag = TAG_LOAD_STORE_4;

                chosen = mir_choose_instruction(instructions, liveness, worklist, count, &predicate);
                if (chosen)
                        return TAG_LOAD_STORE_4;

                predicate.tag = ~0;

                chosen = mir_choose_instruction(instructions, liveness, worklist, count, &predicate);
                assert(chosen == NULL || chosen->type != TAG_LOAD_STORE_4);

                if (chosen)
                        return chosen->type;
                else
                        return TAG_LOAD_STORE_4;
        }

        if (chosen)
                return chosen->type;
        else
                return ~0;
}

/* We want to choose an ALU instruction filling a given unit */
static void
mir_choose_alu(midgard_instruction **slot,
                midgard_instruction **instructions,
                uint16_t *liveness,
                BITSET_WORD *worklist, unsigned len,
                struct midgard_predicate *predicate,
                unsigned unit)
{
        /* Did we already schedule to this slot? */
        if ((*slot) != NULL)
                return;

        /* Try to schedule something, if not */
        predicate->unit = unit;
        *slot = mir_choose_instruction(instructions, liveness, worklist, len, predicate);

        /* Store unit upon scheduling */
        if (*slot && !((*slot)->compact_branch))
                (*slot)->unit = unit;
}

/* When we are scheduling a branch/csel, we need the consumed condition in the
 * same block as a pipeline register. There are two options to enable this:
 *
 * - Move the conditional into the bundle. Preferred, but only works if the
 *   conditional is used only once and is from this block.
 * - Copy the conditional.
 *
 * We search for the conditional.
 * If it's in this block, single-use, and without embedded constants, we
 * schedule it immediately. Otherwise, we schedule a move for it.
 *
 * mir_comparison_mobile is a helper to find the moveable condition.
 */

static unsigned
mir_comparison_mobile(
                compiler_context *ctx,
                midgard_instruction **instructions,
                struct midgard_predicate *predicate,
                unsigned count,
                unsigned cond)
{
        if (!mir_single_use(ctx, cond))
                return ~0;

        unsigned ret = ~0;

        for (unsigned i = 0; i < count; ++i) {
                if (instructions[i]->dest != cond)
                        continue;

                /* Must fit in an ALU bundle */
                if (instructions[i]->type != TAG_ALU_4)
                        return ~0;

                /* If it would itself require a condition, that's recursive */
                if (OP_IS_CSEL(instructions[i]->op))
                        return ~0;

                /* We'll need to rewrite to .w but that doesn't work for vector
                 * ops that don't replicate (ball/bany), so bail there */

                if (GET_CHANNEL_COUNT(alu_opcode_props[instructions[i]->op].props))
                        return ~0;

                /* Ensure it will fit with constants */

                if (!mir_adjust_constants(instructions[i], predicate, false))
                        return ~0;

                /* Ensure it is written only once */

                if (ret != ~0)
                        return ~0;
                else
                        ret = i;
        }

        /* Inject constants now that we are sure we want to */
        if (ret != ~0)
                mir_adjust_constants(instructions[ret], predicate, true);

        return ret;
}

/* Using the information about the moveable conditional itself, we either pop
 * that condition off the worklist for use now, or create a move to
 * artificially schedule instead as a fallback */

static midgard_instruction *
mir_schedule_comparison(
                compiler_context *ctx,
                midgard_instruction **instructions,
                struct midgard_predicate *predicate,
                BITSET_WORD *worklist, unsigned count,
                unsigned cond, bool vector, unsigned *swizzle,
                midgard_instruction *user)
{
        /* TODO: swizzle when scheduling */
        unsigned comp_i =
                (!vector && (swizzle[0] == 0)) ?
                mir_comparison_mobile(ctx, instructions, predicate, count, cond) : ~0;

        /* If we can, schedule the condition immediately */
        if ((comp_i != ~0) && BITSET_TEST(worklist, comp_i)) {
                assert(comp_i < count);
                BITSET_CLEAR(worklist, comp_i);
                return instructions[comp_i];
        }

        /* Otherwise, we insert a move */

        midgard_instruction mov = v_mov(cond, cond);
        mov.mask = vector ? 0xF : 0x1;
        memcpy(mov.swizzle[1], swizzle, sizeof(mov.swizzle[1]));

        return mir_insert_instruction_before(ctx, user, mov);
}

/* Most generally, we need instructions writing to r31 in the appropriate
 * components */

static midgard_instruction *
mir_schedule_condition(compiler_context *ctx,
                struct midgard_predicate *predicate,
                BITSET_WORD *worklist, unsigned count,
                midgard_instruction **instructions,
                midgard_instruction *last)
{
        /* For a branch, the condition is the only argument; for csel, third */
        bool branch = last->compact_branch;
        unsigned condition_index = branch ? 0 : 2;

        /* csel_v is vector; otherwise, conditions are scalar */
        bool vector = !branch && OP_IS_CSEL_V(last->op);

        /* Grab the conditional instruction */

        midgard_instruction *cond = mir_schedule_comparison(
                        ctx, instructions, predicate, worklist, count, last->src[condition_index],
                        vector, last->swizzle[2], last);

        /* We have exclusive reign over this (possibly move) conditional
         * instruction. We can rewrite it into a pipeline conditional register */
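        /* Descriptive note: the rewrite below retargets the condition to the
         * fixed register r31; for scalar conditions the mask and swizzles are
         * shifted onto component W, which appears to be where scalar
         * conditions are read from (vector conditions from csel_v keep their
         * full mask). */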
        predicate->exclude = cond->dest;
        cond->dest = SSA_FIXED_REGISTER(31);

        if (!vector) {
                cond->mask = (1 << COMPONENT_W);

                mir_foreach_src(cond, s) {
                        if (cond->src[s] == ~0)
                                continue;

                        for (unsigned q = 0; q < 4; ++q)
                                cond->swizzle[s][q + COMPONENT_W] = cond->swizzle[s][q];
                }
        }

        /* Schedule the unit: csel is always in the latter pipeline, so a csel
         * condition must be in the former pipeline stage (vmul/sadd),
         * depending on scalar/vector of the instruction itself. A branch must
         * be written from the latter pipeline stage and a branch condition is
         * always scalar, so it is always in smul (exception: ball/bany, which
         * will be vadd) */

        if (branch)
                cond->unit = UNIT_SMUL;
        else
                cond->unit = vector ? UNIT_VMUL : UNIT_SADD;

        return cond;
}

/* Schedules a single bundle of the given type */

static midgard_bundle
mir_schedule_texture(
                midgard_instruction **instructions,
                uint16_t *liveness,
                BITSET_WORD *worklist, unsigned len,
                bool is_vertex)
{
        struct midgard_predicate predicate = {
                .tag = TAG_TEXTURE_4,
                .destructive = true,
                .exclude = ~0
        };

        midgard_instruction *ins =
                mir_choose_instruction(instructions, liveness, worklist, len, &predicate);

        mir_update_worklist(worklist, len, instructions, ins);

        struct midgard_bundle out = {
                .tag = ins->op == midgard_tex_op_barrier ?
                        TAG_TEXTURE_4_BARRIER :
                        (ins->op == midgard_tex_op_fetch) || is_vertex ?
                        TAG_TEXTURE_4_VTX : TAG_TEXTURE_4,
                .instruction_count = 1,
                .instructions = { ins }
        };

        return out;
}

static midgard_bundle
mir_schedule_ldst(
                midgard_instruction **instructions,
                uint16_t *liveness,
                BITSET_WORD *worklist, unsigned len,
                unsigned *num_ldst)
{
        struct midgard_predicate predicate = {
                .tag = TAG_LOAD_STORE_4,
                .destructive = true,
                .exclude = ~0
        };

        /* Try to pick two load/store ops. The second is not guaranteed to exist */

        midgard_instruction *ins =
                mir_choose_instruction(instructions, liveness, worklist, len, &predicate);

        midgard_instruction *pair =
                mir_choose_instruction(instructions, liveness, worklist, len, &predicate);

        assert(ins != NULL);

        struct midgard_bundle out = {
                .tag = TAG_LOAD_STORE_4,
                .instruction_count = pair ? 2 : 1,
                .instructions = { ins, pair }
        };

        *num_ldst -= out.instruction_count;

        /* We have to update the worklist atomically, since the two
         * instructions run concurrently (TODO: verify it's not pipelined) */

        mir_update_worklist(worklist, len, instructions, ins);
        mir_update_worklist(worklist, len, instructions, pair);

        return out;
}

static void
mir_schedule_zs_write(
                compiler_context *ctx,
                struct midgard_predicate *predicate,
                midgard_instruction **instructions,
                uint16_t *liveness,
                BITSET_WORD *worklist, unsigned len,
                midgard_instruction *branch,
                midgard_instruction **smul,
                midgard_instruction **vadd,
                midgard_instruction **vlut,
                bool stencil)
{
        bool success = false;
        unsigned idx = stencil ? 3 : 2;
        unsigned src = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(1) : branch->src[idx];

        predicate->dest = src;
        predicate->mask = 0x1;

        midgard_instruction **units[] = { smul, vadd, vlut };
        unsigned unit_names[] = { UNIT_SMUL, UNIT_VADD, UNIT_VLUT };

        for (unsigned i = 0; i < 3; ++i) {
                if (*(units[i]))
                        continue;

                predicate->unit = unit_names[i];
                midgard_instruction *ins =
                        mir_choose_instruction(instructions, liveness, worklist, len, predicate);

                if (ins) {
                        ins->unit = unit_names[i];
                        *(units[i]) = ins;
                        success |= true;
                        break;
                }
        }

        predicate->dest = predicate->mask = 0;

        if (success)
                return;

        midgard_instruction *mov = ralloc(ctx, midgard_instruction);
        *mov = v_mov(src, make_compiler_temp(ctx));
        mov->mask = 0x1;

        branch->src[idx] = mov->dest;

        if (stencil) {
                unsigned swizzle = (branch->src[0] == ~0) ? COMPONENT_Y : COMPONENT_X;

                for (unsigned c = 0; c < 16; ++c)
                        mov->swizzle[1][c] = swizzle;
        }

        for (unsigned i = 0; i < 3; ++i) {
                if (!(*(units[i]))) {
                        *(units[i]) = mov;
                        mov->unit = unit_names[i];
                        return;
                }
        }

        unreachable("Could not schedule Z/S move to any unit");
}

static midgard_bundle
mir_schedule_alu(
                compiler_context *ctx,
                midgard_instruction **instructions,
                uint16_t *liveness,
                BITSET_WORD *worklist, unsigned len)
{
        struct midgard_bundle bundle = {};

        unsigned bytes_emitted = sizeof(bundle.control);

        struct midgard_predicate predicate = {
                .tag = TAG_ALU_4,
                .destructive = true,
                .exclude = ~0,
                .constants = &bundle.constants
        };

        midgard_instruction *vmul = NULL;
        midgard_instruction *vadd = NULL;
        midgard_instruction *vlut = NULL;
        midgard_instruction *smul = NULL;
        midgard_instruction *sadd = NULL;
        midgard_instruction *branch = NULL;

        mir_choose_alu(&branch, instructions, liveness, worklist, len, &predicate, ALU_ENAB_BR_COMPACT);
        mir_update_worklist(worklist, len, instructions, branch);
        unsigned writeout = branch ? branch->writeout : 0;

        if (branch && branch->branch.conditional) {
                midgard_instruction *cond = mir_schedule_condition(ctx, &predicate, worklist, len, instructions, branch);

                if (cond->unit == UNIT_VADD)
                        vadd = cond;
                else if (cond->unit == UNIT_SMUL)
                        smul = cond;
                else
                        unreachable("Bad condition");
        }

        /* If we have a render target reference, schedule a move for it. Since
         * this will be in sadd, we boost this to prevent scheduling csel into
         * smul */

        if (writeout && (branch->constants.u32[0] || ctx->inputs->is_blend)) {
                sadd = ralloc(ctx, midgard_instruction);
                *sadd = v_mov(~0, make_compiler_temp(ctx));
                sadd->unit = UNIT_SADD;
                sadd->mask = 0x1;
                sadd->has_inline_constant = true;
                sadd->inline_constant = branch->constants.u32[0];
                branch->src[1] = sadd->dest;
                branch->src_types[1] = sadd->dest_type;
        }

        if (writeout) {
                /* Propagate up */
                bundle.last_writeout = branch->last_writeout;

                /* Mask off any conditionals.
                 * This prevents csel and csel_v being scheduled into smul
                 * since we might not have room for a conditional in vmul/sadd.
                 * This is important because both writeout and csel have same-bundle
                 * requirements on their dependencies. */
                predicate.no_cond = true;
        }

        /* When MRT is in use, writeout loops require r1.w to be filled with a
         * return address for the blend shader to jump to. We always emit the
         * move for blend shaders themselves for ABI reasons. */
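        /* Descriptive note on the branch below: for a blend shader itself the
         * move simply copies the incoming r1.w, while for a regular fragment
         * shader the return address is materialized from r31 (the iadd with an
         * inline constant of 0). In both cases the writeout branch's
         * destination is redirected to the freshly written temporary. */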
        if (writeout && (ctx->inputs->is_blend || ctx->writeout_branch[1])) {
                vadd = ralloc(ctx, midgard_instruction);
                *vadd = v_mov(~0, make_compiler_temp(ctx));

                if (!ctx->inputs->is_blend) {
                        vadd->op = midgard_alu_op_iadd;
                        vadd->src[0] = SSA_FIXED_REGISTER(31);
                        vadd->src_types[0] = nir_type_uint32;

                        for (unsigned c = 0; c < 16; ++c)
                                vadd->swizzle[0][c] = COMPONENT_X;

                        vadd->has_inline_constant = true;
                        vadd->inline_constant = 0;
                } else {
                        vadd->src[1] = SSA_FIXED_REGISTER(1);
                        vadd->src_types[0] = nir_type_uint32;

                        for (unsigned c = 0; c < 16; ++c)
                                vadd->swizzle[1][c] = COMPONENT_W;
                }

                vadd->unit = UNIT_VADD;
                vadd->mask = 0x1;
                branch->dest = vadd->dest;
                branch->dest_type = vadd->dest_type;
        }

        if (writeout & PAN_WRITEOUT_Z)
                mir_schedule_zs_write(ctx, &predicate, instructions, liveness, worklist, len, branch, &smul, &vadd, &vlut, false);

        if (writeout & PAN_WRITEOUT_S)
                mir_schedule_zs_write(ctx, &predicate, instructions, liveness, worklist, len, branch, &smul, &vadd, &vlut, true);

        mir_choose_alu(&smul, instructions, liveness, worklist, len, &predicate, UNIT_SMUL);

        for (unsigned mode = 1; mode < 3; ++mode) {
                predicate.move_mode = mode;
                predicate.no_mask = writeout ? (1 << 3) : 0;
                mir_choose_alu(&vlut, instructions, liveness, worklist, len, &predicate, UNIT_VLUT);
                predicate.no_mask = 0;
                mir_choose_alu(&vadd, instructions, liveness, worklist, len, &predicate, UNIT_VADD);
        }

        /* Reset */
        predicate.move_mode = 0;

        mir_update_worklist(worklist, len, instructions, vlut);
        mir_update_worklist(worklist, len, instructions, vadd);
        mir_update_worklist(worklist, len, instructions, smul);

        bool vadd_csel = vadd && OP_IS_CSEL(vadd->op);
        bool smul_csel = smul && OP_IS_CSEL(smul->op);

        if (vadd_csel || smul_csel) {
                midgard_instruction *ins = vadd_csel ? vadd : smul;
                midgard_instruction *cond = mir_schedule_condition(ctx, &predicate, worklist, len, instructions, ins);

                if (cond->unit == UNIT_VMUL)
                        vmul = cond;
                else if (cond->unit == UNIT_SADD)
                        sadd = cond;
                else
                        unreachable("Bad condition");
        }

        /* Stage 2, let's schedule sadd before vmul for writeout */
        mir_choose_alu(&sadd, instructions, liveness, worklist, len, &predicate, UNIT_SADD);

        /* Check if writeout reads its own register */

        if (writeout) {
                midgard_instruction *stages[] = { sadd, vadd, smul, vlut };
                unsigned src = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : branch->src[0];
                unsigned writeout_mask = 0x0;
                bool bad_writeout = false;

                for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) {
                        if (!stages[i])
                                continue;

                        if (stages[i]->dest != src)
                                continue;

                        writeout_mask |= stages[i]->mask;
                        bad_writeout |= mir_has_arg(stages[i], branch->src[0]);
                }

                /* It's possible we'll be able to schedule something into vmul
                 * to fill r0. Let's peek into the future, trying to schedule
                 * vmul specially that way. */
                unsigned full_mask = 0xF;

                if (!bad_writeout && writeout_mask != full_mask) {
                        predicate.unit = UNIT_VMUL;
                        predicate.dest = src;
                        predicate.mask = writeout_mask ^ full_mask;

                        struct midgard_instruction *peaked =
                                mir_choose_instruction(instructions, liveness, worklist, len, &predicate);

                        if (peaked) {
                                vmul = peaked;
                                vmul->unit = UNIT_VMUL;
                                writeout_mask |= predicate.mask;
                                assert(writeout_mask == full_mask);
                        }

                        /* Cleanup */
                        predicate.dest = predicate.mask = 0;
                }

                /* Finally, add a move if necessary */
                if (bad_writeout || writeout_mask != full_mask) {
                        unsigned temp = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : make_compiler_temp(ctx);

                        vmul = ralloc(ctx, midgard_instruction);
                        *vmul = v_mov(src, temp);
                        vmul->unit = UNIT_VMUL;
                        vmul->mask = full_mask ^ writeout_mask;

                        /* Rewrite to use our temp */

                        for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) {
                                if (stages[i]) {
                                        mir_rewrite_index_dst_single(stages[i], src, temp);
                                        mir_rewrite_index_src_single(stages[i], src, temp);
                                }
                        }

                        mir_rewrite_index_src_single(branch, src, temp);
                }
        }

        mir_choose_alu(&vmul, instructions, liveness, worklist, len, &predicate, UNIT_VMUL);

        mir_update_worklist(worklist, len, instructions, vmul);
        mir_update_worklist(worklist, len, instructions, sadd);

        bundle.has_embedded_constants = predicate.constant_mask != 0;

        unsigned padding = 0;

        /* Now that we have finished scheduling, build up the bundle */
        midgard_instruction *stages[] = { vmul, sadd, vadd, smul, vlut, branch };

        for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) {
                if (stages[i]) {
                        bundle.control |= stages[i]->unit;
                        bytes_emitted += bytes_for_instruction(stages[i]);
                        bundle.instructions[bundle.instruction_count++] = stages[i];

                        /* If we branch, we can't spill to TLS since the store
                         * instruction will never get executed. We could try to
                         * break the bundle but this is probably easier for
                         * now. */

                        if (branch)
                                stages[i]->no_spill |= (1 << REG_CLASS_WORK);
                }
        }

        /* Pad ALU op to nearest word */

        if (bytes_emitted & 15) {
                padding = 16 - (bytes_emitted & 15);
                bytes_emitted += padding;
        }

        /* Constants must always be quadwords */
        if (bundle.has_embedded_constants)
                bytes_emitted += 16;

        /* Size ALU instruction for tag */
        bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;

        bool tilebuf_wait = branch && branch->compact_branch &&
                branch->branch.target_type == TARGET_TILEBUF_WAIT;

        /* MRT capable GPUs use a special writeout procedure */
        if ((writeout || tilebuf_wait) && !(ctx->quirks & MIDGARD_NO_UPPER_ALU))
                bundle.tag += 4;

        bundle.padding = padding;
        bundle.control |= bundle.tag;

        return bundle;
}

/* Schedule a single block by iterating its instructions to create bundles.
 * As we go, tally the bundle sizes to compute the block size. */

static void
schedule_block(compiler_context *ctx, midgard_block *block)
{
        /* Copy list to dynamic array */
        unsigned len = 0;
        midgard_instruction **instructions = flatten_mir(block, &len);

        if (!len)
                return;

        /* Calculate dependencies and initial worklist */
        unsigned node_count = ctx->temp_count + 1;
        mir_create_dependency_graph(instructions, len, node_count);

        /* Allocate the worklist */
        size_t sz = BITSET_WORDS(len) * sizeof(BITSET_WORD);
        BITSET_WORD *worklist = calloc(sz, 1);
        uint16_t *liveness = calloc(node_count, 2);
        mir_initialize_worklist(worklist, instructions, len);

        /* Count the number of load/store instructions so we know when it's
         * worth trying to schedule them in pairs. */
        unsigned num_ldst = 0;
        for (unsigned i = 0; i < len; ++i) {
                if (instructions[i]->type == TAG_LOAD_STORE_4)
                        ++num_ldst;
        }

        struct util_dynarray bundles;
        util_dynarray_init(&bundles, NULL);

        block->quadword_count = 0;

        for (;;) {
                unsigned tag = mir_choose_bundle(instructions, liveness, worklist, len, num_ldst);
                midgard_bundle bundle;

                if (tag == TAG_TEXTURE_4)
                        bundle = mir_schedule_texture(instructions, liveness, worklist, len, ctx->stage != MESA_SHADER_FRAGMENT);
                else if (tag == TAG_LOAD_STORE_4)
                        bundle = mir_schedule_ldst(instructions, liveness, worklist, len, &num_ldst);
                else if (tag == TAG_ALU_4)
                        bundle = mir_schedule_alu(ctx, instructions, liveness, worklist, len);
                else
                        break;

                for (unsigned i = 0; i < bundle.instruction_count; ++i)
                        bundle.instructions[i]->bundle_id =
                                ctx->quadword_count + block->quadword_count;

                util_dynarray_append(&bundles, midgard_bundle, bundle);
                block->quadword_count += midgard_tag_props[bundle.tag].size;
        }

        assert(num_ldst == 0);

        /* We emitted bundles backwards; copy into the block in reverse-order */

        util_dynarray_init(&block->bundles, block);
        util_dynarray_foreach_reverse(&bundles, midgard_bundle, bundle) {
                util_dynarray_append(&block->bundles, midgard_bundle, *bundle);
        }
        util_dynarray_fini(&bundles);

        block->scheduled = true;
        ctx->quadword_count += block->quadword_count;

        /* Reorder instructions to match the bundled order: first remove the
         * existing instructions, then recreate the list */

        mir_foreach_instr_in_block_safe(block, ins) {
                list_del(&ins->link);
        }

        mir_foreach_instr_in_block_scheduled_rev(block, ins) {
                list_add(&ins->link, &block->base.instructions);
        }

        free(instructions); /* Allocated by flatten_mir() */
        free(worklist);
        free(liveness);
}

/* Insert moves to ensure we can register allocate load/store registers */
static void
mir_lower_ldst(compiler_context *ctx)
{
        mir_foreach_instr_global_safe(ctx, I) {
                if (I->type != TAG_LOAD_STORE_4) continue;

                mir_foreach_src(I, s) {
                        if (s == 0) continue;
                        if (I->src[s] == ~0) continue;
                        if (I->swizzle[s][0] == 0) continue;

                        unsigned temp = make_compiler_temp(ctx);
                        midgard_instruction mov = v_mov(I->src[s], temp);
                        mov.mask = 0x1;
                        mov.dest_type = I->src_types[s];
                        for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c)
                                mov.swizzle[1][c] = I->swizzle[s][0];

                        mir_insert_instruction_before(ctx, I, mov);
                        I->src[s] = mov.dest;
                        I->swizzle[s][0] = 0;
                }
        }
}

void
midgard_schedule_program(compiler_context *ctx)
{
        mir_lower_ldst(ctx);
        midgard_promote_uniforms(ctx);

        /* Must be lowered right before scheduling */
        mir_squeeze_index(ctx);
        mir_lower_special_reads(ctx);
        mir_squeeze_index(ctx);

        /* Lowering can introduce some dead moves */

        mir_foreach_block(ctx, _block) {
                midgard_block *block = (midgard_block *) _block;
                midgard_opt_dead_move_eliminate(ctx, block);
                schedule_block(ctx, block);
        }
}