/* src/freedreno/ir3/ir3_delay.c (Mesa, 21.2-virgl branch) */
/*1* Copyright (C) 2019 Google, Inc.2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,19* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE20* SOFTWARE.21*22* Authors:23* Rob Clark <[email protected]>24*/2526#include "ir3.h"2728/* The maximum number of nop's we may need to insert between two instructions.29*/30#define MAX_NOPS 63132/* The soft delay for approximating the cost of (ss). On a6xx, it takes the33* number of delay slots to get a SFU result back (ie. using nop's instead of34* (ss) is:35*36* 8 - single warp37* 9 - two warps38* 10 - four warps39*40* and so on. Not quite sure where it tapers out (ie. how many warps share an41* SFU unit). But 10 seems like a reasonable # to choose:42*/43#define SOFT_SS_NOPS 104445/*46* Helpers to figure out the necessary delay slots between instructions. 
Used47* both in scheduling pass(es) and the final pass to insert any required nop's48* so that the shader program is valid.49*50* Note that this needs to work both pre and post RA, so we can't assume ssa51* src iterators work.52*/5354/* calculate required # of delay slots between the instruction that55* assigns a value and the one that consumes56*/57int58ir3_delayslots(struct ir3_instruction *assigner,59struct ir3_instruction *consumer, unsigned n, bool soft)60{61/* generally don't count false dependencies, since this can just be62* something like a barrier, or SSBO store.63*/64if (__is_false_dep(consumer, n))65return 0;6667/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal68* alu -> alu needs 3 cycles, cat4 -> alu and texture fetch69* handled with sync bits70*/7172if (is_meta(assigner) || is_meta(consumer))73return 0;7475if (writes_addr0(assigner) || writes_addr1(assigner))76return 6;7778if (soft && is_sfu(assigner))79return SOFT_SS_NOPS;8081/* handled via sync flags: */82if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))83return 0;8485/* As far as we know, shader outputs don't need any delay. */86if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)87return 0;8889/* assigner must be alu: */90if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||91is_mem(consumer) || (assigner->dsts[0]->flags & IR3_REG_SHARED)) {92return 6;93} else {94/* In mergedregs mode, there is an extra 2-cycle penalty when half of95* a full-reg is read as a half-reg or when a half-reg is read as a96* full-reg.97*/98bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=99(consumer->srcs[n]->flags & IR3_REG_HALF);100unsigned penalty = mismatched_half ? 
2 : 0;101if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && (n == 2)) {102/* special case, 3rd src to cat3 not required on first cycle */103return 1 + penalty;104} else {105return 3 + penalty;106}107}108}109110static bool111count_instruction(struct ir3_instruction *n)112{113/* NOTE: don't count branch/jump since we don't know yet if they will114* be eliminated later in resolve_jumps().. really should do that115* earlier so we don't have this constraint.116*/117return is_alu(n) ||118(is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));119}120121static unsigned122distance(struct ir3_block *block, struct ir3_instruction *instr, unsigned maxd)123{124unsigned d = 0;125126/* Note that this relies on incrementally building up the block's127* instruction list.. but this is how scheduling and nopsched128* work.129*/130foreach_instr_rev (n, &block->instr_list) {131if ((n == instr) || (d >= maxd))132return MIN2(maxd, d + n->nop);133if (count_instruction(n))134d = MIN2(maxd, d + 1 + n->repeat + n->nop);135}136137return maxd;138}139140static unsigned141delay_calc_srcn_prera(struct ir3_block *block, struct ir3_instruction *assigner,142struct ir3_instruction *consumer, unsigned srcn)143{144unsigned delay = 0;145146if (assigner->opc == OPC_META_PHI)147return 0;148149if (is_meta(assigner)) {150foreach_src_n (src, n, assigner) {151unsigned d;152153if (!src->def)154continue;155156d = delay_calc_srcn_prera(block, src->def->instr, consumer, srcn);157delay = MAX2(delay, d);158}159} else {160delay = ir3_delayslots(assigner, consumer, srcn, false);161delay -= distance(block, assigner, delay);162}163164return delay;165}166167/**168* Calculate delay for instruction before register allocation, using SSA169* source pointers. 
This can't handle inter-block dependencies.170*/171unsigned172ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr)173{174unsigned delay = 0;175176foreach_src_n (src, i, instr) {177unsigned d = 0;178179if (src->def && src->def->instr->block == block) {180d = delay_calc_srcn_prera(block, src->def->instr, instr, i);181}182183delay = MAX2(delay, d);184}185186return delay;187}188189/* Post-RA, we don't have arrays any more, so we have to be a bit careful here190* and have to handle relative accesses specially.191*/192193static unsigned194post_ra_reg_elems(struct ir3_register *reg)195{196if (reg->flags & IR3_REG_RELATIV)197return reg->size;198return reg_elems(reg);199}200201static unsigned202post_ra_reg_num(struct ir3_register *reg)203{204if (reg->flags & IR3_REG_RELATIV)205return reg->array.base;206return reg->num;207}208209static unsigned210delay_calc_srcn_postra(struct ir3_instruction *assigner,211struct ir3_instruction *consumer, unsigned assigner_n,212unsigned consumer_n, bool soft, bool mergedregs)213{214struct ir3_register *src = consumer->srcs[consumer_n];215struct ir3_register *dst = assigner->dsts[assigner_n];216bool mismatched_half =217(src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);218219/* In the mergedregs case or when the register is a special register,220* half-registers do not alias with full registers.221*/222if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&223mismatched_half)224return 0;225226unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);227unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);228unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);229unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);230231if (dst_start >= src_end || src_start >= dst_end)232return 0;233234unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, soft);235236if (assigner->repeat == 0 && consumer->repeat == 0)237return delay;238239/* If 
either side is a relative access, we can't really apply most of the240* reasoning below because we don't know which component aliases which.241* Just bail in this case.242*/243if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))244return delay;245246/* MOVMSK seems to require that all users wait until the entire247* instruction is finished, so just bail here.248*/249if (assigner->opc == OPC_MOVMSK)250return delay;251252/* TODO: Handle the combination of (rpt) and different component sizes253* better like below. This complicates things significantly because the254* components don't line up.255*/256if (mismatched_half)257return delay;258259/* If an instruction has a (rpt), then it acts as a sequence of260* instructions, reading its non-(r) sources at each cycle. First, get the261* register num for the first instruction where they interfere:262*/263264unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);265266/* Now, for that first conflicting half/full register, figure out the267* sub-instruction within assigner/consumer it corresponds to. For (r)268* sources, this should already return the correct answer of 0. However we269* have to special-case the multi-mov instructions, where the270* sub-instructions sometimes come from the src/dst indices instead.271*/272unsigned first_src_instr;273if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)274first_src_instr = consumer_n;275else276first_src_instr = first_num - src->num;277278unsigned first_dst_instr;279if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)280first_dst_instr = assigner_n;281else282first_dst_instr = first_num - dst->num;283284/* The delay we return is relative to the *end* of assigner and the285* *beginning* of consumer, because it's the number of nops (or other286* things) needed between them. Any instructions after first_dst_instr287* subtract from the delay, and so do any instructions before288* first_src_instr. 
Calculate an offset to subtract from the non-rpt-aware289* delay to account for that.290*291* Now, a priori, we need to go through this process for every292* conflicting regnum and take the minimum of the offsets to make sure293* that the appropriate number of nop's is inserted for every conflicting294* pair of sub-instructions. However, as we go to the next conflicting295* regnum (if any), the number of instructions after first_dst_instr296* decreases by 1 and the number of source instructions before297* first_src_instr correspondingly increases by 1, so the offset stays the298* same for all conflicting registers.299*/300unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);301return offset > delay ? 0 : delay - offset;302}303304static unsigned305delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start,306struct ir3_instruction *consumer, unsigned distance,307bool soft, bool pred, bool mergedregs)308{309unsigned delay = 0;310/* Search backwards starting at the instruction before start, unless it's311* NULL then search backwards from the block end.312*/313struct list_head *start_list =314start ? start->node.prev : block->instr_list.prev;315list_for_each_entry_from_rev (struct ir3_instruction, assigner, start_list,316&block->instr_list, node) {317if (count_instruction(assigner))318distance += assigner->nop;319320if (distance + delay >= (soft ? SOFT_SS_NOPS : MAX_NOPS))321return delay;322323if (is_meta(assigner))324continue;325326unsigned new_delay = 0;327328foreach_dst_n (dst, dst_n, assigner) {329if (dst->wrmask == 0)330continue;331foreach_src_n (src, src_n, consumer) {332if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))333continue;334335unsigned src_delay = delay_calc_srcn_postra(336assigner, consumer, dst_n, src_n, soft, mergedregs);337new_delay = MAX2(new_delay, src_delay);338}339}340341new_delay = new_delay > distance ? 
new_delay - distance : 0;342delay = MAX2(delay, new_delay);343344if (count_instruction(assigner))345distance += 1 + assigner->repeat;346}347348/* Note: this allows recursion into "block" if it has already been349* visited, but *not* recursion into its predecessors. We may have to350* visit the original block twice, for the loop case where we have to351* consider definititons in an earlier iterations of the same loop:352*353* while (...) {354* mov.u32u32 ..., r0.x355* ...356* mov.u32u32 r0.x, ...357* }358*359* However any other recursion would be unnecessary.360*/361362if (pred && block->data != block) {363block->data = block;364365for (unsigned i = 0; i < block->predecessors_count; i++) {366struct ir3_block *pred = block->predecessors[i];367unsigned pred_delay = delay_calc_postra(pred, NULL, consumer, distance,368soft, pred, mergedregs);369delay = MAX2(delay, pred_delay);370}371372block->data = NULL;373}374375return delay;376}377378/**379* Calculate delay for post-RA scheduling based on physical registers but not380* exact (i.e. don't recurse into predecessors, and make it possible to381* estimate impact of sync flags).382*383* @soft: If true, add additional delay for situations where they384* would not be strictly required because a sync flag would be385* used (but scheduler would prefer to schedule some other386* instructions first to avoid stalling on sync flag)387* @mergedregs: True if mergedregs is enabled.388*/389unsigned390ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr,391bool soft, bool mergedregs)392{393return delay_calc_postra(block, NULL, instr, 0, soft, false, mergedregs);394}395396/**397* Calculate delay for nop insertion. 
This must exactly match hardware398* requirements, including recursing into predecessor blocks.399*/400unsigned401ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr,402bool mergedregs)403{404return delay_calc_postra(block, NULL, instr, 0, false, true, mergedregs);405}406407/**408* Remove nop instructions. The scheduler can insert placeholder nop's409* so that ir3_delay_calc() can account for nop's that won't be needed410* due to nop's triggered by a previous instruction. However, before411* legalize, we want to remove these. The legalize pass can insert412* some nop's if needed to hold (for example) sync flags. This final413* remaining nops are inserted by legalize after this.414*/415void416ir3_remove_nops(struct ir3 *ir)417{418foreach_block (block, &ir->block_list) {419foreach_instr_safe (instr, &block->instr_list) {420if (instr->opc == OPC_NOP) {421list_del(&instr->node);422}423}424}425}426427428