// Path: blob/21.2-virgl/src/intel/compiler/brw_fs_scoreboard.cpp
/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_scoreboard.cpp
 *
 * Gfx12+ hardware lacks the register scoreboard logic that used to guarantee
 * data coherency between register reads and writes in previous generations.
 * This lowering pass runs after register allocation in order to make up for
 * it.
 *
 * It works by performing global dataflow analysis in order to determine the
 * set of potential dependencies of every instruction in the shader, and then
 * inserts any required SWSB annotations and additional SYNC instructions in
 * order to guarantee data coherency.
 *
 * WARNING - Access of the following (rarely used) ARF registers is not
 *           tracked here, and require the RegDist SWSB annotation to be set
 *           to 1 by the generator in order to avoid data races:
 *
 *  - sp stack pointer
 *  - sr0 state register
 *  - cr0 control register
 *  - ip instruction pointer
 *
tm0 timestamp register44* - dbg0 debug register45* - acc2-9 special accumulator registers on TGL46* - mme0-7 math macro extended accumulator registers47*48* The following ARF registers don't need to be tracked here because data49* coherency is still provided transparently by the hardware:50*51* - f0-1 flag registers52* - n0 notification register53* - tdr0 thread dependency register54*/5556#include "brw_fs.h"57#include "brw_cfg.h"5859using namespace brw;6061namespace {62/**63* In-order instruction accounting.64* @{65*/6667/**68* Return the RegDist pipeline the hardware will synchronize with if no69* pipeline information is provided in the SWSB annotation of an70* instruction (e.g. when TGL_PIPE_NONE is specified in tgl_swsb).71*/72tgl_pipe73inferred_sync_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)74{75if (devinfo->verx10 >= 125) {76bool has_int_src = false, has_long_src = false;7778if (is_send(inst))79return TGL_PIPE_NONE;8081for (unsigned i = 0; i < inst->sources; i++) {82if (inst->src[i].file != BAD_FILE &&83!inst->is_control_source(i)) {84const brw_reg_type t = inst->src[i].type;85has_int_src |= !brw_reg_type_is_floating_point(t);86has_long_src |= type_sz(t) >= 8;87}88}8990return has_long_src ? TGL_PIPE_LONG :91has_int_src ? 
TGL_PIPE_INT :92TGL_PIPE_FLOAT;9394} else {95return TGL_PIPE_FLOAT;96}97}9899/**100* Return the RegDist pipeline that will execute an instruction, or101* TGL_PIPE_NONE if the instruction is out-of-order and doesn't use the102* RegDist synchronization mechanism.103*/104tgl_pipe105inferred_exec_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)106{107const brw_reg_type t = get_exec_type(inst);108const bool is_dword_multiply = !brw_reg_type_is_floating_point(t) &&109((inst->opcode == BRW_OPCODE_MUL &&110MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) ||111(inst->opcode == BRW_OPCODE_MAD &&112MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4));113114if (is_unordered(inst))115return TGL_PIPE_NONE;116else if (devinfo->verx10 < 125)117return TGL_PIPE_FLOAT;118else if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&119type_sz(t) >= 8)120return TGL_PIPE_INT;121else if (inst->opcode == SHADER_OPCODE_BROADCAST &&122!devinfo->has_64bit_float && type_sz(t) >= 8)123return TGL_PIPE_INT;124else if (type_sz(inst->dst.type) >= 8 || type_sz(t) >= 8 ||125is_dword_multiply)126return TGL_PIPE_LONG;127else if (brw_reg_type_is_floating_point(inst->dst.type))128return TGL_PIPE_FLOAT;129else130return TGL_PIPE_INT;131}132133/**134* Index of the \p p pipeline counter in the ordered_address vector defined135* below.136*/137#define IDX(p) (p >= TGL_PIPE_FLOAT ? unsigned(p - TGL_PIPE_FLOAT) : \138(abort(), ~0u))139140/**141* Number of in-order hardware instructions for pipeline index \p contained142* in this IR instruction. 
This determines the increment applied to the143* RegDist counter calculated for any ordered dependency that crosses this144* instruction.145*/146unsigned147ordered_unit(const struct intel_device_info *devinfo, const fs_inst *inst,148unsigned p)149{150switch (inst->opcode) {151case BRW_OPCODE_SYNC:152case BRW_OPCODE_DO:153case SHADER_OPCODE_UNDEF:154case SHADER_OPCODE_HALT_TARGET:155case FS_OPCODE_SCHEDULING_FENCE:156return 0;157default:158/* Note that the following is inaccurate for virtual instructions159* that expand to more in-order instructions than assumed here, but160* that can only lead to suboptimal execution ordering, data161* coherency won't be impacted. Providing exact RegDist counts for162* each virtual instruction would allow better ALU performance, but163* it would require keeping this switch statement in perfect sync164* with the generator in order to avoid data corruption. Lesson is165* (again) don't use virtual instructions if you want optimal166* scheduling.167*/168if (!is_unordered(inst) && (p == IDX(inferred_exec_pipe(devinfo, inst)) ||169p == IDX(TGL_PIPE_ALL)))170return 1;171else172return 0;173}174}175176/**177* Type for an instruction counter that increments for in-order178* instructions only, arbitrarily denoted 'jp' throughout this lowering179* pass in order to distinguish it from the regular instruction counter.180* This is represented as a vector with an independent counter for each181* asynchronous ALU pipeline in the EU.182*/183struct ordered_address {184/**185* Construct the ordered address of a dependency known to execute on a186* single specified pipeline \p p (unless TGL_PIPE_NONE or TGL_PIPE_ALL187* is provided), in which case the vector counter will be initialized188* with all components equal to INT_MIN (always satisfied) except for189* component IDX(p).190*/191ordered_address(tgl_pipe p = TGL_PIPE_NONE, int jp0 = INT_MIN) {192for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++)193jp[q] = (p == TGL_PIPE_NONE || (IDX(p) != q && p != 
TGL_PIPE_ALL) ?194INT_MIN : jp0);195}196197int jp[IDX(TGL_PIPE_ALL)];198199friend bool200operator==(const ordered_address &jp0, const ordered_address &jp1)201{202for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {203if (jp0.jp[p] != jp1.jp[p])204return false;205}206207return true;208}209};210211/**212* Return true if the specified ordered address is trivially satisfied for213* all pipelines except potentially for the specified pipeline \p p.214*/215bool216is_single_pipe(const ordered_address &jp, tgl_pipe p)217{218for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {219if ((p == TGL_PIPE_NONE || IDX(p) != q) && jp.jp[q] > INT_MIN)220return false;221}222223return true;224}225226/**227* Return the number of instructions in the program.228*/229unsigned230num_instructions(const backend_shader *shader)231{232return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;233}234235/**236* Calculate the local ordered_address instruction counter at every237* instruction of the shader for subsequent constant-time look-up.238*/239ordered_address *240ordered_inst_addresses(const fs_visitor *shader)241{242ordered_address *jps = new ordered_address[num_instructions(shader)];243ordered_address jp(TGL_PIPE_ALL, 0);244unsigned ip = 0;245246foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {247jps[ip] = jp;248for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)249jp.jp[p] += ordered_unit(shader->devinfo, inst, p);250ip++;251}252253return jps;254}255256/**257* Synchronization mode required for data manipulated by in-order258* instructions.259*260* Similar to tgl_sbid_mode, but without SET mode. Defined as a separate261* enum for additional type safety. 
The hardware doesn't provide control262* over the synchronization mode for RegDist annotations, this is only used263* internally in this pass in order to optimize out redundant read264* dependencies where possible.265*/266enum tgl_regdist_mode {267TGL_REGDIST_NULL = 0,268TGL_REGDIST_SRC = 1,269TGL_REGDIST_DST = 2270};271272/**273* Allow bitwise arithmetic of tgl_regdist_mode enums.274*/275tgl_regdist_mode276operator|(tgl_regdist_mode x, tgl_regdist_mode y)277{278return tgl_regdist_mode(unsigned(x) | unsigned(y));279}280281tgl_regdist_mode282operator&(tgl_regdist_mode x, tgl_regdist_mode y)283{284return tgl_regdist_mode(unsigned(x) & unsigned(y));285}286287tgl_regdist_mode &288operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)289{290return x = x | y;291}292293tgl_regdist_mode &294operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)295{296return x = x & y;297}298299/** @} */300301/**302* Representation of an equivalence relation among the set of unsigned303* integers.304*305* Its initial state is the identity relation '~' such that i ~ j if and306* only if i == j for every pair of unsigned integers i and j.307*/308struct equivalence_relation {309equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)310{311for (unsigned i = 0; i < n; i++)312is[i] = i;313}314315~equivalence_relation()316{317delete[] is;318}319320/**321* Return equivalence class index of the specified element. 
Effectively322* this is the numeric value of an arbitrary representative from the323* equivalence class.324*325* Allows the evaluation of the equivalence relation according to the326* rule that i ~ j if and only if lookup(i) == lookup(j).327*/328unsigned329lookup(unsigned i) const330{331if (i < n && is[i] != i)332return lookup(is[i]);333else334return i;335}336337/**338* Create an array with the results of the lookup() method for339* constant-time evaluation.340*/341unsigned *342flatten() const343{344unsigned *ids = new unsigned[n];345346for (unsigned i = 0; i < n; i++)347ids[i] = lookup(i);348349return ids;350}351352/**353* Mutate the existing equivalence relation minimally by imposing the354* additional requirement that i ~ j.355*356* The algorithm updates the internal representation recursively in357* order to guarantee transitivity while preserving the previously358* specified equivalence requirements.359*/360unsigned361link(unsigned i, unsigned j)362{363const unsigned k = lookup(i);364assign(i, k);365assign(j, k);366return k;367}368369private:370equivalence_relation(const equivalence_relation &);371372equivalence_relation &373operator=(const equivalence_relation &);374375/**376* Assign the representative of \p from to be equivalent to \p to.377*378* At the same time the data structure is partially flattened as much as379* it's possible without increasing the number of recursive calls.380*/381void382assign(unsigned from, unsigned to)383{384if (from != to) {385assert(from < n);386387if (is[from] != from)388assign(is[from], to);389390is[from] = to;391}392}393394unsigned *is;395unsigned n;396};397398/**399* Representation of a data dependency between two instructions in the400* program.401* @{402*/403struct dependency {404/**405* No dependency information.406*/407dependency() : ordered(TGL_REGDIST_NULL), jp(),408unordered(TGL_SBID_NULL), id(0),409exec_all(false) {}410411/**412* Construct a dependency on the in-order instruction with the provided413* ordered_address 
instruction counter.414*/415dependency(tgl_regdist_mode mode, const ordered_address &jp,416bool exec_all) :417ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),418exec_all(exec_all) {}419420/**421* Construct a dependency on the out-of-order instruction with the422* specified synchronization token.423*/424dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :425ordered(TGL_REGDIST_NULL), jp(), unordered(mode), id(id),426exec_all(exec_all) {}427428/**429* Synchronization mode of in-order dependency, or zero if no in-order430* dependency is present.431*/432tgl_regdist_mode ordered;433434/**435* Instruction counter of in-order dependency.436*437* For a dependency part of a different block in the program, this is438* relative to the specific control flow path taken between the439* dependency and the current block: It is the ordered_address such that440* the difference between it and the ordered_address of the first441* instruction of the current block is exactly the number of in-order442* instructions across that control flow path. It is not guaranteed to443* be equal to the local ordered_address of the generating instruction444* [as returned by ordered_inst_addresses()], except for block-local445* dependencies.446*/447ordered_address jp;448449/**450* Synchronization mode of unordered dependency, or zero if no unordered451* dependency is present.452*/453tgl_sbid_mode unordered;454455/** Synchronization token of out-of-order dependency. 
*/456unsigned id;457458/**459* Whether the dependency could be run with execution masking disabled,460* which might lead to the unwanted execution of the generating461* instruction in cases where a BB is executed with all channels462* disabled due to hardware bug Wa_1407528679.463*/464bool exec_all;465466/**467* Trivial in-order dependency that's always satisfied.468*469* Note that unlike a default-constructed dependency() which is also470* trivially satisfied, this is considered to provide dependency471* information and can be used to clear a previously pending dependency472* via shadow().473*/474static const dependency done;475476friend bool477operator==(const dependency &dep0, const dependency &dep1)478{479return dep0.ordered == dep1.ordered &&480dep0.jp == dep1.jp &&481dep0.unordered == dep1.unordered &&482dep0.id == dep1.id &&483dep0.exec_all == dep1.exec_all;484}485486friend bool487operator!=(const dependency &dep0, const dependency &dep1)488{489return !(dep0 == dep1);490}491};492493const dependency dependency::done =494dependency(TGL_REGDIST_SRC, ordered_address(), false);495496/**497* Return whether \p dep contains any dependency information.498*/499bool500is_valid(const dependency &dep)501{502return dep.ordered || dep.unordered;503}504505/**506* Combine \p dep0 and \p dep1 into a single dependency object that is only507* satisfied when both original dependencies are satisfied. 
This might508* involve updating the equivalence relation \p eq in order to make sure509* that both out-of-order dependencies are assigned the same hardware SBID510* as synchronization token.511*/512dependency513merge(equivalence_relation &eq,514const dependency &dep0, const dependency &dep1)515{516dependency dep;517518if (dep0.ordered || dep1.ordered) {519dep.ordered = dep0.ordered | dep1.ordered;520for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)521dep.jp.jp[p] = MAX2(dep0.jp.jp[p], dep1.jp.jp[p]);522}523524if (dep0.unordered || dep1.unordered) {525dep.unordered = dep0.unordered | dep1.unordered;526dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,527dep1.unordered ? dep1.id : dep0.id);528}529530dep.exec_all = dep0.exec_all || dep1.exec_all;531532return dep;533}534535/**536* Override dependency information of \p dep0 with that of \p dep1.537*/538dependency539shadow(const dependency &dep0, const dependency &dep1)540{541return is_valid(dep1) ? dep1 : dep0;542}543544/**545* Translate dependency information across the program.546*547* This returns a dependency on the same instruction translated to the548* ordered_address space of a different block. 
The correct shift for549* transporting a dependency across an edge of the CFG is the difference550* between the local ordered_address of the first instruction of the target551* block and the local ordered_address of the instruction immediately after552* the end of the origin block.553*/554dependency555transport(dependency dep, int delta[IDX(TGL_PIPE_ALL)])556{557if (dep.ordered) {558for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {559if (dep.jp.jp[p] > INT_MIN)560dep.jp.jp[p] += delta[p];561}562}563564return dep;565}566567/**568* Return simplified dependency removing any synchronization modes not569* applicable to an instruction reading the same register location.570*/571dependency572dependency_for_read(dependency dep)573{574dep.ordered &= TGL_REGDIST_DST;575return dep;576}577578/**579* Return simplified dependency removing any synchronization modes not580* applicable to an instruction \p inst writing the same register location.581*582* This clears any WaR dependency for writes performed from the same583* pipeline as the read, since there is no possibility for a data hazard.584*/585dependency586dependency_for_write(const struct intel_device_info *devinfo,587const fs_inst *inst, dependency dep)588{589if (!is_unordered(inst) &&590is_single_pipe(dep.jp, inferred_exec_pipe(devinfo, inst)))591dep.ordered &= TGL_REGDIST_DST;592return dep;593}594595/** @} */596597/**598* Scoreboard representation. 
This keeps track of the data dependencies of599* registers with GRF granularity.600*/601class scoreboard {602public:603/**604* Look up the most current data dependency for register \p r.605*/606dependency607get(const fs_reg &r) const608{609if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))610return *p;611else612return dependency();613}614615/**616* Specify the most current data dependency for register \p r.617*/618void619set(const fs_reg &r, const dependency &d)620{621if (dependency *p = dep(r))622*p = d;623}624625/**626* Component-wise merge() of corresponding dependencies from two627* scoreboard objects. \sa merge().628*/629friend scoreboard630merge(equivalence_relation &eq,631const scoreboard &sb0, const scoreboard &sb1)632{633scoreboard sb;634635for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)636sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);637638sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);639sb.accum_dep = merge(eq, sb0.accum_dep, sb1.accum_dep);640641return sb;642}643644/**645* Component-wise shadow() of corresponding dependencies from two646* scoreboard objects. \sa shadow().647*/648friend scoreboard649shadow(const scoreboard &sb0, const scoreboard &sb1)650{651scoreboard sb;652653for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)654sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);655656sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);657sb.accum_dep = shadow(sb0.accum_dep, sb1.accum_dep);658659return sb;660}661662/**663* Component-wise transport() of dependencies from a scoreboard664* object. 
\sa transport().665*/666friend scoreboard667transport(const scoreboard &sb0, int delta[IDX(TGL_PIPE_ALL)])668{669scoreboard sb;670671for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)672sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);673674sb.addr_dep = transport(sb0.addr_dep, delta);675sb.accum_dep = transport(sb0.accum_dep, delta);676677return sb;678}679680friend bool681operator==(const scoreboard &sb0, const scoreboard &sb1)682{683for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {684if (sb0.grf_deps[i] != sb1.grf_deps[i])685return false;686}687688if (sb0.addr_dep != sb1.addr_dep)689return false;690691if (sb0.accum_dep != sb1.accum_dep)692return false;693694return true;695}696697friend bool698operator!=(const scoreboard &sb0, const scoreboard &sb1)699{700return !(sb0 == sb1);701}702703private:704dependency grf_deps[BRW_MAX_GRF];705dependency addr_dep;706dependency accum_dep;707708dependency *709dep(const fs_reg &r)710{711const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :712reg_offset(r) / REG_SIZE);713714return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :715r.file == MRF ? &grf_deps[GFX7_MRF_HACK_START + reg] :716r.file == ARF && reg >= BRW_ARF_ADDRESS &&717reg < BRW_ARF_ACCUMULATOR ? &addr_dep :718r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&719reg < BRW_ARF_FLAG ? 
&accum_dep :720NULL);721}722};723724/**725* Dependency list handling.726* @{727*/728struct dependency_list {729dependency_list() : deps(NULL), n(0) {}730731~dependency_list()732{733free(deps);734}735736void737push_back(const dependency &dep)738{739deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));740deps[n++] = dep;741}742743unsigned744size() const745{746return n;747}748749const dependency &750operator[](unsigned i) const751{752assert(i < n);753return deps[i];754}755756dependency &757operator[](unsigned i)758{759assert(i < n);760return deps[i];761}762763private:764dependency_list(const dependency_list &);765dependency_list &766operator=(const dependency_list &);767768dependency *deps;769unsigned n;770};771772/**773* Add dependency \p dep to the list of dependencies of an instruction774* \p deps.775*/776void777add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)778{779if (is_valid(dep)) {780/* Translate the unordered dependency token first in order to keep781* the list minimally redundant.782*/783if (dep.unordered)784dep.id = ids[dep.id];785786/* Try to combine the specified dependency with any existing ones. 
*/787for (unsigned i = 0; i < deps.size(); i++) {788/* Don't combine otherwise matching dependencies if there is an789* exec_all mismatch which would cause a SET dependency to gain an790* exec_all flag, since that would prevent it from being baked791* into the instruction we want to allocate an SBID for.792*/793if (deps[i].exec_all != dep.exec_all &&794(!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&795(!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))796continue;797798if (dep.ordered && deps[i].ordered) {799for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)800deps[i].jp.jp[p] = MAX2(deps[i].jp.jp[p], dep.jp.jp[p]);801802deps[i].ordered |= dep.ordered;803deps[i].exec_all |= dep.exec_all;804dep.ordered = TGL_REGDIST_NULL;805}806807if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {808deps[i].unordered |= dep.unordered;809deps[i].exec_all |= dep.exec_all;810dep.unordered = TGL_SBID_NULL;811}812}813814/* Add it to the end of the list if necessary. */815if (is_valid(dep))816deps.push_back(dep);817}818}819820/**821* Construct a tgl_swsb annotation encoding any ordered dependencies from822* the dependency list \p deps of an instruction with ordered_address \p823* jp. If \p exec_all is false only dependencies known to be executed with824* channel masking applied will be considered in the calculation.825*/826tgl_swsb827ordered_dependency_swsb(const dependency_list &deps,828const ordered_address &jp,829bool exec_all)830{831tgl_pipe p = TGL_PIPE_NONE;832unsigned min_dist = ~0u;833834for (unsigned i = 0; i < deps.size(); i++) {835if (deps[i].ordered && exec_all >= deps[i].exec_all) {836for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {837const unsigned dist = jp.jp[q] - int64_t(deps[i].jp.jp[q]);838const unsigned max_dist = (q == IDX(TGL_PIPE_LONG) ? 14 : 10);839assert(jp.jp[q] > deps[i].jp.jp[q]);840if (dist <= max_dist) {841p = (p && IDX(p) != q ? 
TGL_PIPE_ALL :842tgl_pipe(TGL_PIPE_FLOAT + q));843min_dist = MIN3(min_dist, dist, 7);844}845}846}847}848849return { p ? min_dist : 0, p };850}851852/**853* Return whether the dependency list \p deps of an instruction with854* ordered_address \p jp has any non-trivial ordered dependencies. If \p855* exec_all is false only dependencies known to be executed with channel856* masking applied will be considered in the calculation.857*/858bool859find_ordered_dependency(const dependency_list &deps,860const ordered_address &jp,861bool exec_all)862{863return ordered_dependency_swsb(deps, jp, exec_all).regdist;864}865866/**867* Return the full tgl_sbid_mode bitset for the first unordered dependency868* on the list \p deps that matches the specified tgl_sbid_mode, or zero if869* no such dependency is present. If \p exec_all is false only870* dependencies known to be executed with channel masking applied will be871* considered in the calculation.872*/873tgl_sbid_mode874find_unordered_dependency(const dependency_list &deps,875tgl_sbid_mode unordered,876bool exec_all)877{878if (unordered) {879for (unsigned i = 0; i < deps.size(); i++) {880if ((unordered & deps[i].unordered) &&881exec_all >= deps[i].exec_all)882return deps[i].unordered;883}884}885886return TGL_SBID_NULL;887}888889/**890* Return the tgl_sbid_mode bitset of an unordered dependency from the list891* \p deps that can be represented directly in the SWSB annotation of the892* instruction without additional SYNC instructions, or zero if no such893* dependency is present.894*/895tgl_sbid_mode896baked_unordered_dependency_mode(const struct intel_device_info *devinfo,897const fs_inst *inst,898const dependency_list &deps,899const ordered_address &jp)900{901const bool exec_all = inst->force_writemask_all;902const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);903const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,904exec_all).pipe;905906if (find_unordered_dependency(deps, TGL_SBID_SET, 
exec_all))907return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);908else if (has_ordered && is_unordered(inst))909return TGL_SBID_NULL;910else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&911(!has_ordered || ordered_pipe == inferred_sync_pipe(devinfo, inst)))912return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);913else if (!has_ordered)914return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);915else916return TGL_SBID_NULL;917}918919/**920* Return whether an ordered dependency from the list \p deps can be921* represented directly in the SWSB annotation of the instruction without922* additional SYNC instructions.923*/924bool925baked_ordered_dependency_mode(const struct intel_device_info *devinfo,926const fs_inst *inst,927const dependency_list &deps,928const ordered_address &jp)929{930const bool exec_all = inst->force_writemask_all;931const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);932const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,933exec_all).pipe;934const tgl_sbid_mode unordered_mode =935baked_unordered_dependency_mode(devinfo, inst, deps, jp);936937if (!has_ordered)938return false;939else if (!unordered_mode)940return true;941else942return ordered_pipe == inferred_sync_pipe(devinfo, inst) &&943unordered_mode == (is_unordered(inst) ? TGL_SBID_SET :944TGL_SBID_DST);945}946947/** @} */948949/**950* Shader instruction dependency calculation.951* @{952*/953954/**955* Update scoreboard object \p sb to account for the execution of956* instruction \p inst.957*/958void959update_inst_scoreboard(const fs_visitor *shader, const ordered_address *jps,960const fs_inst *inst, unsigned ip, scoreboard &sb)961{962const bool exec_all = inst->force_writemask_all;963const struct intel_device_info *devinfo = shader->devinfo;964const tgl_pipe p = inferred_exec_pipe(devinfo, inst);965const ordered_address jp = p ? 
ordered_address(p, jps[ip].jp[IDX(p)]) :966ordered_address();967968/* Track any source registers that may be fetched asynchronously by this969* instruction, otherwise clear the dependency in order to avoid970* subsequent redundant synchronization.971*/972for (unsigned i = 0; i < inst->sources; i++) {973const dependency rd_dep =974(inst->is_payload(i) ||975inst->is_math()) ? dependency(TGL_SBID_SRC, ip, exec_all) :976ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL)) ?977dependency(TGL_REGDIST_SRC, jp, exec_all) :978dependency::done;979980for (unsigned j = 0; j < regs_read(inst, i); j++)981sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep);982}983984if (inst->reads_accumulator_implicitly())985sb.set(brw_acc_reg(8), dependency(TGL_REGDIST_SRC, jp, exec_all));986987if (is_send(inst) && inst->base_mrf != -1) {988const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);989990for (unsigned j = 0; j < inst->mlen; j++)991sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);992}993994/* Track any destination registers of this instruction. */995const dependency wr_dep =996is_unordered(inst) ? 
dependency(TGL_SBID_DST, ip, exec_all) :997ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL)) ?998dependency(TGL_REGDIST_DST, jp, exec_all) :999dependency();10001001if (inst->writes_accumulator_implicitly(devinfo))1002sb.set(brw_acc_reg(8), wr_dep);10031004if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&1005!inst->dst.is_null()) {1006for (unsigned j = 0; j < regs_written(inst); j++)1007sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);1008}1009}10101011/**1012* Calculate scoreboard objects locally that represent any pending (and1013* unconditionally resolved) dependencies at the end of each block of the1014* program.1015*/1016scoreboard *1017gather_block_scoreboards(const fs_visitor *shader,1018const ordered_address *jps)1019{1020scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];1021unsigned ip = 0;10221023foreach_block_and_inst(block, fs_inst, inst, shader->cfg)1024update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);10251026return sbs;1027}10281029/**1030* Propagate data dependencies globally through the control flow graph1031* until a fixed point is reached.1032*1033* Calculates the set of dependencies potentially pending at the beginning1034* of each block, and returns it as an array of scoreboard objects.1035*/1036scoreboard *1037propagate_block_scoreboards(const fs_visitor *shader,1038const ordered_address *jps,1039equivalence_relation &eq)1040{1041const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);1042scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];1043scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];10441045for (bool progress = true; progress;) {1046progress = false;10471048foreach_block(block, shader->cfg) {1049const scoreboard sb = shadow(in_sbs[block->num],1050delta_sbs[block->num]);10511052if (sb != out_sbs[block->num]) {1053foreach_list_typed(bblock_link, child_link, link,1054&block->children) {1055scoreboard &in_sb = in_sbs[child_link->block->num];1056int 
delta[IDX(TGL_PIPE_ALL)];
/* NOTE(review): this chunk begins in the middle of
 * propagate_block_scoreboards() -- the head of the function, and the start
 * of the declaration the line above belongs to, lie before the visible
 * region.
 */

                  /* Compute, for each in-order pipe, the difference between
                   * the ordered instruction address at the top of the
                   * successor block and at the end of the current block,
                   * discounting the block-ending instruction's own
                   * contribution via ordered_unit(), so the scoreboard can
                   * be carried across the CFG edge.
                   */
                  for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                     delta[p] = jps[child_link->block->start_ip].jp[p]
                                - jps[block->end_ip].jp[p]
                                - ordered_unit(shader->devinfo,
                                               static_cast<const fs_inst *>(block->end()), p);

                  /* Merge this block's outgoing state, transported across
                   * the edge, into the successor's incoming state, recording
                   * equivalences in \p eq.
                   */
                  in_sb = merge(eq, in_sb, transport(sb, delta));
               }

               out_sbs[block->num] = sb;
               /* NOTE(review): presumably drives an enclosing fixed-point
                * iteration whose loop head is outside the visible region --
                * confirm against the full file.
                */
               progress = true;
            }
         }
      }

      delete[] delta_sbs;
      delete[] out_sbs;

      return in_sbs;
   }

   /**
    * Return the list of potential dependencies of each instruction in the
    * shader based on the result of global dependency analysis.
    *
    * The returned array is indexed by instruction IP, is allocated with
    * new[], and must be released by the caller with delete[].
    */
   dependency_list *
   gather_inst_dependencies(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      equivalence_relation eq(num_instructions(shader));
      scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
      const unsigned *ids = eq.flatten();
      dependency_list *deps = new dependency_list[num_instructions(shader)];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
         scoreboard &sb = sbs[block->num];

         /* Read dependencies for every register read by each source. */
         for (unsigned i = 0; i < inst->sources; i++) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
         }

         if (inst->reads_accumulator_implicitly()) {
            /* Wa_22012725308:
             *
             * "When the accumulator registers are used as source and/or
             * destination, hardware does not ensure prevention of write
             * after read hazard across execution pipes."
             */
            const dependency dep = sb.get(brw_acc_reg(8));
            if (dep.ordered && !is_single_pipe(dep.jp, p))
               add_dependency(ids, deps[ip], dep);
         }

         /* A SEND with a legacy MRF payload reads registers
          * base_mrf .. base_mrf + mlen - 1.
          */
         if (is_send(inst) && inst->base_mrf != -1) {
            for (unsigned j = 0; j < inst->mlen; j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
         }

         /* Out-of-order instructions signal their completion through an
          * SBID token; record a SET dependency keyed on this IP.
          */
         if (is_unordered(inst))
            add_dependency(ids, deps[ip],
                           dependency(TGL_SBID_SET, ip, exec_all));

         if (!inst->no_dd_check) {
            /* Write dependencies on every register written through the
             * destination, unless dependency checking was explicitly
             * disabled for this instruction.
             */
            if (inst->dst.file != BAD_FILE && !inst->dst.is_null() &&
                !inst->dst.is_accumulator()) {
               for (unsigned j = 0; j < regs_written(inst); j++) {
                  add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
                     sb.get(byte_offset(inst->dst, REG_SIZE * j))));
               }
            }

            if (inst->writes_accumulator_implicitly(devinfo) ||
                inst->dst.is_accumulator()) {
               /* Wa_22012725308:
                *
                * "When the accumulator registers are used as source and/or
                * destination, hardware does not ensure prevention of write
                * after read hazard across execution pipes."
                */
               const dependency dep = sb.get(brw_acc_reg(8));
               if (dep.ordered && !is_single_pipe(dep.jp, p))
                  add_dependency(ids, deps[ip], dep);
            }

            /* MRF writes implied by the SEND message itself. */
            if (is_send(inst) && inst->base_mrf != -1) {
               for (unsigned j = 0; j < inst->implied_mrf_writes(); j++)
                  add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
                     sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
            }
         }

         /* Fold this instruction's own effects into the running scoreboard
          * before moving on to the next instruction.
          */
         update_inst_scoreboard(shader, jps, inst, ip, sb);
         ip++;
      }

      delete[] sbs;
      delete[] ids;

      return deps;
   }

   /** @} */

   /**
    * Allocate SBID tokens to track the execution of every out-of-order
    * instruction of the shader.
    */
   dependency_list *
   allocate_inst_dependencies(const fs_visitor *shader,
                              const dependency_list *deps0)
   {
      /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally in
       * shaders with a large number of SEND messages.
       */

      /* Allocate an unordered dependency ID to hardware SBID translation
       * table with as many entries as instructions there are in the shader,
       * which is the maximum number of unordered IDs we can find in the
       * program.  ~0u marks an entry that has not been assigned a hardware
       * SBID yet.
       */
      unsigned *ids = new unsigned[num_instructions(shader)];
      for (unsigned ip = 0; ip < num_instructions(shader); ip++)
         ids[ip] = ~0u;

      dependency_list *deps1 = new dependency_list[num_instructions(shader)];
      unsigned next_id = 0;

      for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
         for (unsigned i = 0; i < deps0[ip].size(); i++) {
            const dependency &dep = deps0[ip][i];

            /* Hand out hardware SBIDs round-robin, wrapping onto the 16
             * representable tokens (hence the & 0xf).
             */
            if (dep.unordered && ids[dep.id] == ~0u)
               ids[dep.id] = (next_id++) & 0xf;

            add_dependency(ids, deps1[ip], dep);
         }
      }

      delete[] ids;

      return deps1;
   }

   /**
    * Emit dependency information provided by \p deps into the shader,
    * inserting additional SYNC instructions for dependencies that can't be
    * represented directly by annotating existing instructions.
    */
   void
   emit_inst_dependencies(fs_visitor *shader,
                          const ordered_address *jps,
                          const dependency_list *deps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      unsigned ip = 0;

      foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         /* Whether the ordered (RegDist) component can be baked directly
          * into this instruction's own SWSB annotation.
          */
         const bool ordered_mode =
            baked_ordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         /* Which unordered (SBID) mode, if any, can be baked in. */
         const tgl_sbid_mode unordered_mode =
            baked_unordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         tgl_swsb swsb = !ordered_mode ? tgl_swsb() :
            ordered_dependency_swsb(deps[ip], jps[ip], exec_all);

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.unordered) {
               if (unordered_mode == dep.unordered &&
                   exec_all >= dep.exec_all && !swsb.mode) {
                  /* Bake unordered dependency into the instruction's SWSB if
                   * possible, except in cases where the current instruction
                   * isn't marked NoMask but the dependency is, since that
                   * might lead to data coherency issues due to
                   * Wa_1407528679.
                   */
                  swsb.sbid = dep.id;
                  swsb.mode = dep.unordered;
               } else {
                  /* Emit dependency into the SWSB of an extra SYNC
                   * instruction.
                   */
                  const fs_builder ibld = fs_builder(shader, block, inst)
                                          .exec_all().group(1, 0);
                  fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                            brw_imm_ud(TGL_SYNC_NOP));
                  sync->sched.sbid = dep.id;
                  sync->sched.mode = dep.unordered;
                  assert(!(sync->sched.mode & TGL_SBID_SET));
               }
            }
         }

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.ordered &&
                find_ordered_dependency(deps[ip], jps[ip], true) &&
                (!ordered_mode || dep.exec_all > exec_all)) {
               /* If the current instruction is not marked NoMask but an
                * ordered dependency is, perform the synchronization as a
                * separate NoMask SYNC instruction in order to avoid data
                * coherency issues due to Wa_1407528679.  The similar
                * scenario with unordered dependencies should have been
                * handled above.  A single SYNC covers every ordered
                * dependency of this instruction, hence the break below.
                */
               const fs_builder ibld = fs_builder(shader, block, inst)
                                       .exec_all().group(1, 0);
               fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                         brw_imm_ud(TGL_SYNC_NOP));
               sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
               break;
            }
         }

         /* Update the IR.  The legacy dependency-control flags are cleared,
          * since the dependencies are now expressed through the SWSB
          * annotation instead.
          */
         inst->sched = swsb;
         inst->no_dd_check = inst->no_dd_clear = false;
         ip++;
      }
   }

} /* NOTE(review): closes a scope opened before the visible region --
   * presumably the file's anonymous namespace; confirm against the full
   * file.
   */

/**
 * Entry point of the software scoreboard lowering pass.  Only acts on
 * Gfx12+ (devinfo->ver >= 12), where the hardware lacks register
 * scoreboard logic (see the file comment); earlier generations are left
 * untouched.  Returns true unconditionally.
 */
bool
fs_visitor::lower_scoreboard()
{
   if (devinfo->ver >= 12) {
      const ordered_address *jps = ordered_inst_addresses(this);
      const dependency_list *deps0 = gather_inst_dependencies(this, jps);
      const dependency_list *deps1 = allocate_inst_dependencies(this, deps0);
      emit_inst_dependencies(this, jps, deps1);
      delete[] deps1;
      delete[] deps0;
      delete[] jps;
   }

   return true;
}