Path: blob/21.2-virgl/src/amd/compiler/aco_insert_waitcnt.cpp
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"

#include "common/sid.h"

#include <map>
#include <stack>
#include <vector>

namespace aco {

namespace {

/**
 * The general idea of this pass is:
 * The CFG is traversed in reverse postorder (forward), and loops are
 * processed several times until no progress is made.
 * Per BB, two wait_ctx are maintained: an in-context and an out-context.
 * The in-context is the joined out-contexts of the predecessors.
 * The context contains a map: gpr -> wait_entry
 * consisting of the information about the cnt values to be waited for.
 * Note: After merge nodes, it can happen that multiple cnt values have to
 * be waited for on the same register.
 *
 * The values are updated according to the encountered instructions:
 * - new events increment the counters of existing entries of the same type
 * - waits erase gprs whose counters are covered by the waited-for values.
 */
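
/* A worked example of this bookkeeping (a sketch with made-up registers,
 * not actual emitted assembly):
 *
 *    buffer_load_dword v0, ...  ; vm_cnt -> 1, gpr_map[v0].imm.vm = 0
 *    buffer_load_dword v1, ...  ; vm_cnt -> 2, gpr_map[v0].imm.vm = 1,
 *                               ;              gpr_map[v1].imm.vm = 0
 *    v_add_f32 v2, v0, v0       ; reads v0, whose entry says vm = 1
 *
 * Only "s_waitcnt vmcnt(1)" is needed before the v_add_f32: VMEM results
 * return in order, so v0 is ready once at most one load is still in flight.
 */
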
// TODO: do a more clever insertion of wait_cnt (lgkm_cnt)
// when there is a load followed by a use of a previous load

/* Instructions of the same event will finish in-order except for smem
 * and maybe flat. Instructions of different events may not finish in-order.
 */
enum wait_event : uint16_t {
   event_smem = 1 << 0,
   event_lds = 1 << 1,
   event_gds = 1 << 2,
   event_vmem = 1 << 3,
   event_vmem_store = 1 << 4, /* GFX10+ */
   event_flat = 1 << 5,
   event_exp_pos = 1 << 6,
   event_exp_param = 1 << 7,
   event_exp_mrt_null = 1 << 8,
   event_gds_gpr_lock = 1 << 9,
   event_vmem_gpr_lock = 1 << 10,
   event_sendmsg = 1 << 11,
   num_events = 12,
};

enum counter_type : uint8_t {
   counter_exp = 1 << 0,
   counter_lgkm = 1 << 1,
   counter_vm = 1 << 2,
   counter_vs = 1 << 3,
   num_counters = 4,
};

static const uint16_t exp_events =
   event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
static const uint16_t vm_events = event_vmem | event_flat;
static const uint16_t vs_events = event_vmem_store;

uint8_t
get_counters_for_event(wait_event ev)
{
   switch (ev) {
   case event_smem:
   case event_lds:
   case event_gds:
   case event_sendmsg: return counter_lgkm;
   case event_vmem: return counter_vm;
   case event_vmem_store: return counter_vs;
   case event_flat: return counter_vm | counter_lgkm;
   case event_exp_pos:
   case event_exp_param:
   case event_exp_mrt_null:
   case event_gds_gpr_lock:
   case event_vmem_gpr_lock: return counter_exp;
   default: return 0;
   }
}
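
/* For example, get_counters_for_event(event_flat) == (counter_vm | counter_lgkm):
 * FLAT instructions may access either global memory or LDS, so their results
 * are only certainly available once both counters have been waited on.
 */
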
struct wait_entry {
   wait_imm imm;
   uint16_t events;  /* use wait_event notion */
   uint8_t counters; /* use counter_type notion */
   bool wait_on_read : 1;
   bool logical : 1;
   bool has_vmem_nosampler : 1;
   bool has_vmem_sampler : 1;

   wait_entry(wait_event event_, wait_imm imm_, bool logical_, bool wait_on_read_)
       : imm(imm_), events(event_), counters(get_counters_for_event(event_)),
         wait_on_read(wait_on_read_), logical(logical_), has_vmem_nosampler(false),
         has_vmem_sampler(false)
   {}

   bool join(const wait_entry& other)
   {
      bool changed = (other.events & ~events) || (other.counters & ~counters) ||
                     (other.wait_on_read && !wait_on_read) ||
                     (other.has_vmem_nosampler && !has_vmem_nosampler) ||
                     (other.has_vmem_sampler && !has_vmem_sampler);
      events |= other.events;
      counters |= other.counters;
      changed |= imm.combine(other.imm);
      wait_on_read |= other.wait_on_read;
      has_vmem_nosampler |= other.has_vmem_nosampler;
      has_vmem_sampler |= other.has_vmem_sampler;
      assert(logical == other.logical);
      return changed;
   }

   void remove_counter(counter_type counter)
   {
      counters &= ~counter;

      if (counter == counter_lgkm) {
         imm.lgkm = wait_imm::unset_counter;
         events &= ~(event_smem | event_lds | event_gds | event_sendmsg);
      }

      if (counter == counter_vm) {
         imm.vm = wait_imm::unset_counter;
         events &= ~event_vmem;
         has_vmem_nosampler = false;
         has_vmem_sampler = false;
      }

      if (counter == counter_exp) {
         imm.exp = wait_imm::unset_counter;
         events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock |
                     event_vmem_gpr_lock);
      }

      if (counter == counter_vs) {
         imm.vs = wait_imm::unset_counter;
         events &= ~event_vmem_store;
      }

      if (!(counters & counter_lgkm) && !(counters & counter_vm))
         events &= ~event_flat;
   }
};

struct wait_ctx {
   Program* program;
   enum chip_class chip_class;
   uint16_t max_vm_cnt;
   uint16_t max_exp_cnt;
   uint16_t max_lgkm_cnt;
   uint16_t max_vs_cnt;
   uint16_t unordered_events = event_smem | event_flat;

   uint8_t vm_cnt = 0;
   uint8_t exp_cnt = 0;
   uint8_t lgkm_cnt = 0;
   uint8_t vs_cnt = 0;
   bool pending_flat_lgkm = false;
   bool pending_flat_vm = false;
   bool pending_s_buffer_store = false; /* GFX10 workaround */

   wait_imm barrier_imm[storage_count];
   uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */

   std::map<PhysReg, wait_entry> gpr_map;

   wait_ctx() {}
   wait_ctx(Program* program_)
       : program(program_), chip_class(program_->chip_class),
         max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14), max_exp_cnt(6),
         max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14),
         max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0),
         unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0))
   {}

   bool join(const wait_ctx* other, bool logical)
   {
      bool changed = other->exp_cnt > exp_cnt || other->vm_cnt > vm_cnt ||
                     other->lgkm_cnt > lgkm_cnt || other->vs_cnt > vs_cnt ||
                     (other->pending_flat_lgkm && !pending_flat_lgkm) ||
                     (other->pending_flat_vm && !pending_flat_vm);

      exp_cnt = std::max(exp_cnt, other->exp_cnt);
      vm_cnt = std::max(vm_cnt, other->vm_cnt);
      lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt);
      vs_cnt = std::max(vs_cnt, other->vs_cnt);
      pending_flat_lgkm |= other->pending_flat_lgkm;
      pending_flat_vm |= other->pending_flat_vm;
      pending_s_buffer_store |= other->pending_s_buffer_store;

      for (const auto& entry : other->gpr_map) {
         if (entry.second.logical != logical)
            continue;

         using iterator = std::map<PhysReg, wait_entry>::iterator;
         const std::pair<iterator, bool> insert_pair = gpr_map.insert(entry);
         if (insert_pair.second) {
            changed = true;
         } else {
            changed |= insert_pair.first->second.join(entry.second);
         }
      }

      for (unsigned i = 0; i < storage_count; i++) {
         changed |= barrier_imm[i].combine(other->barrier_imm[i]);
         changed |= (other->barrier_events[i] & ~barrier_events[i]) != 0;
         barrier_events[i] |= other->barrier_events[i];
      }

      return changed;
   }

   void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter)
   {
      entry.remove_counter(counter);
   }
};
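
/* Combine into "wait" the counter values that must be reached before "instr"
 * may execute: operands whose entry requires a wait-on-read (RAW), and
 * definitions that would overwrite a register still owned by an in-flight
 * operation (WAW/WAR), except where the hardware returns results in issue
 * order anyway.
 */
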
void
check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
{
   for (const Operand op : instr->operands) {
      if (op.isConstant() || op.isUndefined())
         continue;

      /* check consecutively read gprs */
      for (unsigned j = 0; j < op.size(); j++) {
         PhysReg reg{op.physReg() + j};
         std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
         if (it == ctx.gpr_map.end() || !it->second.wait_on_read)
            continue;

         wait.combine(it->second.imm);
      }
   }

   for (const Definition& def : instr->definitions) {
      /* check consecutively written gprs */
      for (unsigned j = 0; j < def.getTemp().size(); j++) {
         PhysReg reg{def.physReg() + j};

         std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
         if (it == ctx.gpr_map.end())
            continue;

         /* Vector Memory reads and writes return in the order they were issued */
         bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() &&
                            instr->operands[1].regClass() == s4;
         if (instr->isVMEM() && ((it->second.events & vm_events) == event_vmem) &&
             it->second.has_vmem_nosampler == !has_sampler &&
             it->second.has_vmem_sampler == has_sampler)
            continue;

         /* LDS reads and writes return in the order they were issued. Same for GDS */
         if (instr->isDS() &&
             (it->second.events & lgkm_events) == (instr->ds().gds ? event_gds : event_lds))
            continue;

         wait.combine(it->second.imm);
      }
   }
}

bool
parse_wait_instr(wait_ctx& ctx, wait_imm& imm, Instruction* instr)
{
   if (instr->opcode == aco_opcode::s_waitcnt_vscnt &&
       instr->definitions[0].physReg() == sgpr_null) {
      imm.vs = std::min<uint8_t>(imm.vs, instr->sopk().imm);
      return true;
   } else if (instr->opcode == aco_opcode::s_waitcnt) {
      imm.combine(wait_imm(ctx.chip_class, instr->sopp().imm));
      return true;
   }
   return false;
}

void
perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, unsigned semantics)
{
   sync_scope subgroup_scope =
      ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
   if ((sync.semantics & semantics) && sync.scope > subgroup_scope) {
      unsigned storage = sync.storage;
      while (storage) {
         unsigned idx = u_bit_scan(&storage);

         /* LDS is private to the workgroup */
         sync_scope bar_scope_lds = MIN2(sync.scope, scope_workgroup);

         uint16_t events = ctx.barrier_events[idx];
         if (bar_scope_lds <= subgroup_scope)
            events &= ~event_lds;

         /* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
          * in-order for the same workgroup */
         if (!ctx.program->wgp_mode && sync.scope <= scope_workgroup)
            events &= ~(event_vmem | event_vmem_store | event_smem);

         if (events)
            imm.combine(ctx.barrier_imm[idx]);
      }
   }
}

void
force_waitcnt(wait_ctx& ctx, wait_imm& imm)
{
   if (ctx.vm_cnt)
      imm.vm = 0;
   if (ctx.exp_cnt)
      imm.exp = 0;
   if (ctx.lgkm_cnt)
      imm.lgkm = 0;

   if (ctx.chip_class >= GFX10) {
      if (ctx.vs_cnt)
         imm.vs = 0;
   }
}
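
/* Compute the wait required before "instr" and apply its effect on the
 * context: counters are clamped to the waited-for values, and gpr entries
 * that the wait satisfies are removed from the map.
 */
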
void
kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
{
   if (debug_flags & DEBUG_FORCE_WAITCNT) {
      /* Force emitting waitcnt states right after the instruction if there is
       * something to wait for.
       */
      return force_waitcnt(ctx, imm);
   }

   if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
      check_instr(ctx, imm, instr);

   /* It's required to wait for scalar stores before "writing back" data.
    * It shouldn't cost anything anyway since we're about to do s_endpgm.
    */
   if (ctx.lgkm_cnt && instr->opcode == aco_opcode::s_dcache_wb) {
      assert(ctx.chip_class >= GFX8);
      imm.lgkm = 0;
   }

   if (ctx.chip_class >= GFX10 && instr->isSMEM()) {
      /* GFX10: A store followed by a load at the same address causes a problem because
       * the load doesn't load the correct values unless we wait for the store first.
       * This is NOT mitigated by an s_nop.
       *
       * TODO: Refine this when we have proper alias analysis.
       */
      if (ctx.pending_s_buffer_store && !instr->smem().definitions.empty() &&
          !instr->smem().sync.can_reorder()) {
         imm.lgkm = 0;
      }
   }

   if (ctx.program->early_rast && instr->opcode == aco_opcode::exp) {
      if (instr->exp().dest >= V_008DFC_SQ_EXP_POS && instr->exp().dest < V_008DFC_SQ_EXP_PRIM) {

         /* With early_rast, the HW will start clipping and rasterization after the 1st DONE pos
          * export. Wait for all stores (and atomics) to complete, so PS can read them.
          * TODO: This only really applies to DONE pos exports.
          *       Consider setting the DONE bit earlier.
          */
         if (ctx.vs_cnt > 0)
            imm.vs = 0;
         if (ctx.vm_cnt > 0)
            imm.vm = 0;
      }
   }

   if (instr->opcode == aco_opcode::p_barrier)
      perform_barrier(ctx, imm, instr->barrier().sync, semantic_acqrel);
   else
      perform_barrier(ctx, imm, sync_info, semantic_release);

   if (!imm.empty()) {
      if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
         imm.vm = 0;
      if (ctx.pending_flat_lgkm && imm.lgkm != wait_imm::unset_counter)
         imm.lgkm = 0;

      /* reset counters */
      ctx.exp_cnt = std::min(ctx.exp_cnt, imm.exp);
      ctx.vm_cnt = std::min(ctx.vm_cnt, imm.vm);
      ctx.lgkm_cnt = std::min(ctx.lgkm_cnt, imm.lgkm);
      ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs);

      /* update barrier wait imms */
      for (unsigned i = 0; i < storage_count; i++) {
         wait_imm& bar = ctx.barrier_imm[i];
         uint16_t& bar_ev = ctx.barrier_events[i];
         if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) {
            bar.exp = wait_imm::unset_counter;
            bar_ev &= ~exp_events;
         }
         if (bar.vm != wait_imm::unset_counter && imm.vm <= bar.vm) {
            bar.vm = wait_imm::unset_counter;
            bar_ev &= ~(vm_events & ~event_flat);
         }
         if (bar.lgkm != wait_imm::unset_counter && imm.lgkm <= bar.lgkm) {
            bar.lgkm = wait_imm::unset_counter;
            bar_ev &= ~(lgkm_events & ~event_flat);
         }
         if (bar.vs != wait_imm::unset_counter && imm.vs <= bar.vs) {
            bar.vs = wait_imm::unset_counter;
            bar_ev &= ~vs_events;
         }
         if (bar.vm == wait_imm::unset_counter && bar.lgkm == wait_imm::unset_counter)
            bar_ev &= ~event_flat;
      }

      /* remove all gprs with higher counter from map */
      std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
      while (it != ctx.gpr_map.end()) {
         if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp)
            ctx.wait_and_remove_from_entry(it->first, it->second, counter_exp);
         if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm)
            ctx.wait_and_remove_from_entry(it->first, it->second, counter_vm);
         if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm)
            ctx.wait_and_remove_from_entry(it->first, it->second, counter_lgkm);
         if (imm.vs != wait_imm::unset_counter && imm.vs <= it->second.imm.vs)
            ctx.wait_and_remove_from_entry(it->first, it->second, counter_vs);
         if (!it->second.counters)
            it = ctx.gpr_map.erase(it);
         else
            it++;
      }
   }

   if (imm.vm == 0)
      ctx.pending_flat_vm = false;
   if (imm.lgkm == 0) {
      ctx.pending_flat_lgkm = false;
      ctx.pending_s_buffer_store = false;
   }
}

void
update_barrier_counter(uint8_t* ctr, unsigned max)
{
   if (*ctr != wait_imm::unset_counter && *ctr < max)
      (*ctr)++;
}
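
/* Per storage class, maintain the counter values a barrier would have to
 * wait on so that all prior unwaited accesses to that storage are complete.
 * perform_barrier() combines these imms into the wait of a matching barrier.
 */
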
void
update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
{
   for (unsigned i = 0; i < storage_count; i++) {
      wait_imm& bar = ctx.barrier_imm[i];
      uint16_t& bar_ev = ctx.barrier_events[i];
      if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) {
         bar_ev |= event;
         if (counters & counter_lgkm)
            bar.lgkm = 0;
         if (counters & counter_vm)
            bar.vm = 0;
         if (counters & counter_exp)
            bar.exp = 0;
         if (counters & counter_vs)
            bar.vs = 0;
      } else if (!(bar_ev & ctx.unordered_events) && !(ctx.unordered_events & event)) {
         if (counters & counter_lgkm && (bar_ev & lgkm_events) == event)
            update_barrier_counter(&bar.lgkm, ctx.max_lgkm_cnt);
         if (counters & counter_vm && (bar_ev & vm_events) == event)
            update_barrier_counter(&bar.vm, ctx.max_vm_cnt);
         if (counters & counter_exp && (bar_ev & exp_events) == event)
            update_barrier_counter(&bar.exp, ctx.max_exp_cnt);
         if (counters & counter_vs && (bar_ev & vs_events) == event)
            update_barrier_counter(&bar.vs, ctx.max_vs_cnt);
      }
   }
}

void
update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_sync_info())
{
   uint8_t counters = get_counters_for_event(event);

   if (counters & counter_lgkm && ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
      ctx.lgkm_cnt++;
   if (counters & counter_vm && ctx.vm_cnt <= ctx.max_vm_cnt)
      ctx.vm_cnt++;
   if (counters & counter_exp && ctx.exp_cnt <= ctx.max_exp_cnt)
      ctx.exp_cnt++;
   if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt)
      ctx.vs_cnt++;

   update_barrier_imm(ctx, counters, event, sync);

   if (ctx.unordered_events & event)
      return;

   if (ctx.pending_flat_lgkm)
      counters &= ~counter_lgkm;
   if (ctx.pending_flat_vm)
      counters &= ~counter_vm;

   for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
      wait_entry& entry = e.second;

      if (entry.events & ctx.unordered_events)
         continue;

      assert(entry.events);

      if ((counters & counter_exp) && (entry.events & exp_events) == event &&
          entry.imm.exp < ctx.max_exp_cnt)
         entry.imm.exp++;
      if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event &&
          entry.imm.lgkm < ctx.max_lgkm_cnt)
         entry.imm.lgkm++;
      if ((counters & counter_vm) && (entry.events & vm_events) == event &&
          entry.imm.vm < ctx.max_vm_cnt)
         entry.imm.vm++;
      if ((counters & counter_vs) && (entry.events & vs_events) == event &&
          entry.imm.vs < ctx.max_vs_cnt)
         entry.imm.vs++;
   }
}

void
update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync_info())
{
   assert(ctx.chip_class < GFX10);

   if (ctx.lgkm_cnt <= ctx.max_lgkm_cnt)
      ctx.lgkm_cnt++;
   if (ctx.vm_cnt <= ctx.max_vm_cnt)
      ctx.vm_cnt++;

   update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);

   /* update the existing entries in the map */
   for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
      if (e.second.counters & counter_vm)
         e.second.imm.vm = 0;
      if (e.second.counters & counter_lgkm)
         e.second.imm.lgkm = 0;
   }
   ctx.pending_flat_lgkm = true;
   ctx.pending_flat_vm = true;
}

void
insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
                  bool has_sampler = false)
{
   uint16_t counters = get_counters_for_event(event);
   wait_imm imm;
   if (counters & counter_lgkm)
      imm.lgkm = 0;
   if (counters & counter_vm)
      imm.vm = 0;
   if (counters & counter_exp)
      imm.exp = 0;
   if (counters & counter_vs)
      imm.vs = 0;

   wait_entry new_entry(event, imm, !rc.is_linear(), wait_on_read);
   new_entry.has_vmem_nosampler = (event & event_vmem) && !has_sampler;
   new_entry.has_vmem_sampler = (event & event_vmem) && has_sampler;

   for (unsigned i = 0; i < rc.size(); i++) {
      auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry);
      if (!it.second)
         it.first->second.join(new_entry);
   }
}

void
insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, bool has_sampler = false)
{
   if (!op.isConstant() && !op.isUndefined())
      insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, has_sampler);
}

void
insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, bool has_sampler = false)
{
   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, has_sampler);
}
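
/* Record the events "instr" generates: bump the affected hardware counters
 * and create gpr_map entries for the registers the instruction still owns
 * (its definitions and, for exports/GDS and GFX6 VMEM, its operands).
 */
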
void
gen(Instruction* instr, wait_ctx& ctx)
{
   switch (instr->format) {
   case Format::EXP: {
      Export_instruction& exp_instr = instr->exp();

      wait_event ev;
      if (exp_instr.dest <= 9)
         ev = event_exp_mrt_null;
      else if (exp_instr.dest <= 15)
         ev = event_exp_pos;
      else
         ev = event_exp_param;
      update_counters(ctx, ev);

      /* insert new entries for exported vgprs */
      for (unsigned i = 0; i < 4; i++) {
         if (exp_instr.enabled_mask & (1 << i)) {
            unsigned idx = exp_instr.compressed ? i >> 1 : i;
            assert(idx < exp_instr.operands.size());
            insert_wait_entry(ctx, exp_instr.operands[idx], ev);
         }
      }
      insert_wait_entry(ctx, exec, s2, ev, false);
      break;
   }
   case Format::FLAT: {
      FLAT_instruction& flat = instr->flat();
      if (ctx.chip_class < GFX10 && !instr->definitions.empty())
         update_counters_for_flat_load(ctx, flat.sync);
      else
         update_counters(ctx, event_flat, flat.sync);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], event_flat);
      break;
   }
   case Format::SMEM: {
      SMEM_instruction& smem = instr->smem();
      update_counters(ctx, event_smem, smem.sync);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], event_smem);
      else if (ctx.chip_class >= GFX10 && !smem.sync.can_reorder())
         ctx.pending_s_buffer_store = true;

      break;
   }
   case Format::DS: {
      DS_instruction& ds = instr->ds();
      update_counters(ctx, ds.gds ? event_gds : event_lds, ds.sync);
      if (ds.gds)
         update_counters(ctx, event_gds_gpr_lock);

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], ds.gds ? event_gds : event_lds);

      if (ds.gds) {
         for (const Operand& op : instr->operands)
            insert_wait_entry(ctx, op, event_gds_gpr_lock);
         insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false);
      }
      break;
   }
   case Format::MUBUF:
   case Format::MTBUF:
   case Format::MIMG:
   case Format::GLOBAL: {
      wait_event ev =
         !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
      update_counters(ctx, ev, get_sync_info(instr));

      bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() &&
                         instr->operands[1].regClass() == s4;

      if (!instr->definitions.empty())
         insert_wait_entry(ctx, instr->definitions[0], ev, has_sampler);

      if (ctx.chip_class == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
         ctx.exp_cnt++;
         update_counters(ctx, event_vmem_gpr_lock);
         insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
      } else if (ctx.chip_class == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) {
         ctx.exp_cnt++;
         update_counters(ctx, event_vmem_gpr_lock);
         insert_wait_entry(ctx, instr->operands[2], event_vmem_gpr_lock);
      }

      break;
   }
   case Format::SOPP: {
      if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_sendmsghalt)
         update_counters(ctx, event_sendmsg);
      break;
   }
   default: break;
   }
}
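
/* Materialize a queued immediate as instructions: on GFX10+, vscnt needs a
 * separate s_waitcnt_vscnt (a SOPK writing the null SGPR); everything else
 * is packed into a single s_waitcnt.
 */
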
void
emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm& imm)
{
   if (imm.vs != wait_imm::unset_counter) {
      assert(ctx.chip_class >= GFX10);
      SOPK_instruction* waitcnt_vs =
         create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1);
      waitcnt_vs->definitions[0] = Definition(sgpr_null, s1);
      waitcnt_vs->imm = imm.vs;
      instructions.emplace_back(waitcnt_vs);
      imm.vs = wait_imm::unset_counter;
   }
   if (!imm.empty()) {
      SOPP_instruction* waitcnt =
         create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
      waitcnt->imm = imm.pack(ctx.chip_class);
      waitcnt->block = -1;
      instructions.emplace_back(waitcnt);
   }
   imm = wait_imm();
}

void
handle_block(Program* program, Block& block, wait_ctx& ctx)
{
   std::vector<aco_ptr<Instruction>> new_instructions;

   wait_imm queued_imm;

   for (aco_ptr<Instruction>& instr : block.instructions) {
      bool is_wait = parse_wait_instr(ctx, queued_imm, instr.get());

      memory_sync_info sync_info = get_sync_info(instr.get());
      kill(queued_imm, instr.get(), ctx, sync_info);

      gen(instr.get(), ctx);

      if (instr->format != Format::PSEUDO_BARRIER && !is_wait) {
         if (!queued_imm.empty())
            emit_waitcnt(ctx, new_instructions, queued_imm);

         new_instructions.emplace_back(std::move(instr));
         perform_barrier(ctx, queued_imm, sync_info, semantic_acquire);
      }
   }

   if (!queued_imm.empty())
      emit_waitcnt(ctx, new_instructions, queued_imm);

   block.instructions.swap(new_instructions);
}

} /* end namespace */
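
/* Entry point of the pass: visit blocks in reverse postorder, joining the
 * predecessors' out-contexts into each block's in-context. When a loop exit
 * is reached without progress having been made inside the loop, iteration
 * restarts at the loop header, so the contexts reach a fixed point before
 * the loop body is finalized.
 */
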
void
insert_wait_states(Program* program)
{
   /* per BB ctx */
   std::vector<bool> done(program->blocks.size());
   std::vector<wait_ctx> in_ctx(program->blocks.size(), wait_ctx(program));
   std::vector<wait_ctx> out_ctx(program->blocks.size(), wait_ctx(program));

   std::stack<unsigned> loop_header_indices;
   unsigned loop_progress = 0;

   for (unsigned i = 0; i < program->blocks.size();) {
      Block& current = program->blocks[i++];
      wait_ctx ctx = in_ctx[current.index];

      if (current.kind & block_kind_loop_header) {
         loop_header_indices.push(current.index);
      } else if (current.kind & block_kind_loop_exit) {
         bool repeat = false;
         if (loop_progress == loop_header_indices.size()) {
            i = loop_header_indices.top();
            repeat = true;
         }
         loop_header_indices.pop();
         loop_progress = std::min<unsigned>(loop_progress, loop_header_indices.size());
         if (repeat)
            continue;
      }

      bool changed = false;
      for (unsigned b : current.linear_preds)
         changed |= ctx.join(&out_ctx[b], false);
      for (unsigned b : current.logical_preds)
         changed |= ctx.join(&out_ctx[b], true);

      if (done[current.index] && !changed) {
         in_ctx[current.index] = std::move(ctx);
         continue;
      } else {
         in_ctx[current.index] = ctx;
      }

      if (current.instructions.empty()) {
         out_ctx[current.index] = std::move(ctx);
         continue;
      }

      loop_progress = std::max<unsigned>(loop_progress, current.loop_nest_depth);
      done[current.index] = true;

      handle_block(program, current, ctx);

      out_ctx[current.index] = std::move(ctx);
   }
}

} // namespace aco