// Source: src/amd/compiler/aco_statistics.cpp (Mesa 21.2, virgl branch)
/*1* Copyright © 2020 Valve Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*22*/2324#include "aco_ir.h"2526#include "util/crc32.h"2728#include <algorithm>29#include <deque>30#include <set>31#include <vector>3233namespace aco {3435/* sgpr_presched/vgpr_presched */36void37collect_presched_stats(Program* program)38{39RegisterDemand presched_demand;40for (Block& block : program->blocks)41presched_demand.update(block.register_demand);42program->statistics[statistic_sgpr_presched] = presched_demand.sgpr;43program->statistics[statistic_vgpr_presched] = presched_demand.vgpr;44}4546class BlockCycleEstimator {47public:48enum resource {49null = 0,50scalar,51branch_sendmsg,52valu,53valu_complex,54lds,55export_gds,56vmem,57resource_count,58};5960BlockCycleEstimator(Program* program_) : program(program_) {}6162Program* program;6364int32_t cur_cycle = 0;65int32_t res_available[(int)BlockCycleEstimator::resource_count] = 
{0};66unsigned res_usage[(int)BlockCycleEstimator::resource_count] = {0};67int32_t reg_available[512] = {0};68std::deque<int32_t> lgkm;69std::deque<int32_t> exp;70std::deque<int32_t> vm;71std::deque<int32_t> vs;7273unsigned predict_cost(aco_ptr<Instruction>& instr);74void add(aco_ptr<Instruction>& instr);75void join(const BlockCycleEstimator& other);7677private:78unsigned get_waitcnt_cost(wait_imm imm);79unsigned get_dependency_cost(aco_ptr<Instruction>& instr);8081void use_resources(aco_ptr<Instruction>& instr);82int32_t cycles_until_res_available(aco_ptr<Instruction>& instr);83};8485struct wait_counter_info {86wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_)87: vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)88{}8990unsigned vm;91unsigned exp;92unsigned lgkm;93unsigned vs;94};9596struct perf_info {97int latency;9899BlockCycleEstimator::resource rsrc0;100unsigned cost0;101102BlockCycleEstimator::resource rsrc1;103unsigned cost1;104};105106static perf_info107get_perf_info(Program* program, aco_ptr<Instruction>& instr)108{109instr_class cls = instr_info.classes[(int)instr->opcode];110111#define WAIT(res) BlockCycleEstimator::res, 0112#define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt113114if (program->chip_class >= GFX10) {115/* fp64 might be incorrect */116switch (cls) {117case instr_class::valu32:118case instr_class::valu_convert32:119case instr_class::valu_fma: return {5, WAIT_USE(valu, 1)};120case instr_class::valu64: return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)};121case instr_class::valu_quarter_rate32:122return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)};123case instr_class::valu_transcendental32:124return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)};125case instr_class::valu_double: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};126case instr_class::valu_double_add:127return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};128case instr_class::valu_double_convert:129return {22, WAIT_USE(valu, 
16), WAIT_USE(valu_complex, 16)};130case instr_class::valu_double_transcendental:131return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};132case instr_class::salu: return {2, WAIT_USE(scalar, 1)};133case instr_class::smem: return {0, WAIT_USE(scalar, 1)};134case instr_class::branch:135case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 1)};136case instr_class::ds:137return instr->ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)}138: perf_info{0, WAIT_USE(lds, 1)};139case instr_class::exp: return {0, WAIT_USE(export_gds, 1)};140case instr_class::vmem: return {0, WAIT_USE(vmem, 1)};141case instr_class::barrier:142case instr_class::waitcnt:143case instr_class::other:144default: return {0};145}146} else {147switch (cls) {148case instr_class::valu32: return {4, WAIT_USE(valu, 4)};149case instr_class::valu_convert32: return {16, WAIT_USE(valu, 16)};150case instr_class::valu64: return {8, WAIT_USE(valu, 8)};151case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)};152case instr_class::valu_fma:153return program->dev.has_fast_fma32 ? perf_info{4, WAIT_USE(valu, 4)}154: perf_info{16, WAIT_USE(valu, 16)};155case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)};156case instr_class::valu_double: return {64, WAIT_USE(valu, 64)};157case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)};158case instr_class::valu_double_convert: return {16, WAIT_USE(valu, 16)};159case instr_class::valu_double_transcendental: return {64, WAIT_USE(valu, 64)};160case instr_class::salu: return {4, WAIT_USE(scalar, 4)};161case instr_class::smem: return {4, WAIT_USE(scalar, 4)};162case instr_class::branch:163return {8, WAIT_USE(branch_sendmsg, 8)};164return {4, WAIT_USE(branch_sendmsg, 4)};165case instr_class::ds:166return instr->ds().gds ? 
perf_info{4, WAIT_USE(export_gds, 4)}167: perf_info{4, WAIT_USE(lds, 4)};168case instr_class::exp: return {16, WAIT_USE(export_gds, 16)};169case instr_class::vmem: return {4, WAIT_USE(vmem, 4)};170case instr_class::barrier:171case instr_class::waitcnt:172case instr_class::other:173default: return {4};174}175}176177#undef WAIT_USE178#undef WAIT179}180181void182BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)183{184perf_info perf = get_perf_info(program, instr);185186if (perf.rsrc0 != resource_count) {187res_available[(int)perf.rsrc0] = cur_cycle + perf.cost0;188res_usage[(int)perf.rsrc0] += perf.cost0;189}190191if (perf.rsrc1 != resource_count) {192res_available[(int)perf.rsrc1] = cur_cycle + perf.cost1;193res_usage[(int)perf.rsrc1] += perf.cost1;194}195}196197int32_t198BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)199{200perf_info perf = get_perf_info(program, instr);201202int32_t cost = 0;203if (perf.rsrc0 != resource_count)204cost = MAX2(cost, res_available[(int)perf.rsrc0] - cur_cycle);205if (perf.rsrc1 != resource_count)206cost = MAX2(cost, res_available[(int)perf.rsrc1] - cur_cycle);207208return cost;209}210211static wait_counter_info212get_wait_counter_info(aco_ptr<Instruction>& instr)213{214/* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance215* depends a lot on the situation. */216217if (instr->isEXP())218return wait_counter_info(0, 16, 0, 0);219220if (instr->isFlatLike()) {221unsigned lgkm = instr->isFlat() ? 20 : 0;222if (!instr->definitions.empty())223return wait_counter_info(230, 0, lgkm, 0);224else225return wait_counter_info(0, 0, lgkm, 230);226}227228if (instr->isSMEM()) {229if (instr->definitions.empty())230return wait_counter_info(0, 0, 200, 0);231if (instr->operands.empty()) /* s_memtime and s_memrealtime */232return wait_counter_info(0, 0, 1, 0);233234bool likely_desc_load = instr->operands[0].size() == 2;235bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 
3 : 4);236bool const_offset =237instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant());238239if (likely_desc_load || const_offset)240return wait_counter_info(0, 0, 30, 0); /* likely to hit L0 cache */241242return wait_counter_info(0, 0, 200, 0);243}244245if (instr->format == Format::DS)246return wait_counter_info(0, 0, 20, 0);247248if (instr->isVMEM() && !instr->definitions.empty())249return wait_counter_info(320, 0, 0, 0);250251if (instr->isVMEM() && instr->definitions.empty())252return wait_counter_info(0, 0, 0, 320);253254return wait_counter_info(0, 0, 0, 0);255}256257static wait_imm258get_wait_imm(Program* program, aco_ptr<Instruction>& instr)259{260if (instr->opcode == aco_opcode::s_endpgm) {261return wait_imm(0, 0, 0, 0);262} else if (instr->opcode == aco_opcode::s_waitcnt) {263return wait_imm(GFX10_3, instr->sopp().imm);264} else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {265return wait_imm(0, 0, 0, instr->sopk().imm);266} else {267unsigned max_lgkm_cnt = program->chip_class >= GFX10 ? 62 : 14;268unsigned max_exp_cnt = 6;269unsigned max_vm_cnt = program->chip_class >= GFX9 ? 62 : 14;270unsigned max_vs_cnt = 62;271272wait_counter_info wait_info = get_wait_counter_info(instr);273wait_imm imm;274imm.lgkm = wait_info.lgkm ? max_lgkm_cnt : wait_imm::unset_counter;275imm.exp = wait_info.exp ? max_exp_cnt : wait_imm::unset_counter;276imm.vm = wait_info.vm ? max_vm_cnt : wait_imm::unset_counter;277imm.vs = wait_info.vs ? 
max_vs_cnt : wait_imm::unset_counter;278return imm;279}280}281282unsigned283BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)284{285int deps_available = cur_cycle;286287wait_imm imm = get_wait_imm(program, instr);288if (imm.vm != wait_imm::unset_counter) {289for (int i = 0; i < (int)vm.size() - imm.vm; i++)290deps_available = MAX2(deps_available, vm[i]);291}292if (imm.exp != wait_imm::unset_counter) {293for (int i = 0; i < (int)exp.size() - imm.exp; i++)294deps_available = MAX2(deps_available, exp[i]);295}296if (imm.lgkm != wait_imm::unset_counter) {297for (int i = 0; i < (int)lgkm.size() - imm.lgkm; i++)298deps_available = MAX2(deps_available, lgkm[i]);299}300if (imm.vs != wait_imm::unset_counter) {301for (int i = 0; i < (int)vs.size() - imm.vs; i++)302deps_available = MAX2(deps_available, vs[i]);303}304305if (instr->opcode == aco_opcode::s_endpgm) {306for (unsigned i = 0; i < 512; i++)307deps_available = MAX2(deps_available, reg_available[i]);308} else if (program->chip_class >= GFX10) {309for (Operand& op : instr->operands) {310if (op.isConstant() || op.isUndefined())311continue;312for (unsigned i = 0; i < op.size(); i++)313deps_available = MAX2(deps_available, reg_available[op.physReg().reg() + i]);314}315}316317if (program->chip_class < GFX10)318deps_available = align(deps_available, 4);319320return deps_available - cur_cycle;321}322323unsigned324BlockCycleEstimator::predict_cost(aco_ptr<Instruction>& instr)325{326int32_t dep = get_dependency_cost(instr);327return dep + std::max(cycles_until_res_available(instr) - dep, 0);328}329330static bool331is_vector(aco_opcode op)332{333switch (instr_info.classes[(int)op]) {334case instr_class::valu32:335case instr_class::valu_convert32:336case instr_class::valu_fma:337case instr_class::valu_double:338case instr_class::valu_double_add:339case instr_class::valu_double_convert:340case instr_class::valu_double_transcendental:341case instr_class::vmem:342case instr_class::ds:343case 
instr_class::exp:344case instr_class::valu64:345case instr_class::valu_quarter_rate32:346case instr_class::valu_transcendental32: return true;347default: return false;348}349}350351void352BlockCycleEstimator::add(aco_ptr<Instruction>& instr)353{354perf_info perf = get_perf_info(program, instr);355356cur_cycle += get_dependency_cost(instr);357358unsigned start;359bool dual_issue = program->chip_class >= GFX10 && program->wave_size == 64 &&360is_vector(instr->opcode) && program->workgroup_size > 32;361for (unsigned i = 0; i < (dual_issue ? 2 : 1); i++) {362cur_cycle += cycles_until_res_available(instr);363364start = cur_cycle;365use_resources(instr);366367/* GCN is in-order and doesn't begin the next instruction until the current one finishes */368cur_cycle += program->chip_class >= GFX10 ? 1 : perf.latency;369}370371wait_imm imm = get_wait_imm(program, instr);372while (lgkm.size() > imm.lgkm)373lgkm.pop_front();374while (exp.size() > imm.exp)375exp.pop_front();376while (vm.size() > imm.vm)377vm.pop_front();378while (vs.size() > imm.vs)379vs.pop_front();380381wait_counter_info wait_info = get_wait_counter_info(instr);382if (wait_info.exp)383exp.push_back(cur_cycle + wait_info.exp);384if (wait_info.lgkm)385lgkm.push_back(cur_cycle + wait_info.lgkm);386if (wait_info.vm)387vm.push_back(cur_cycle + wait_info.vm);388if (wait_info.vs)389vs.push_back(cur_cycle + wait_info.vs);390391/* This is inaccurate but shouldn't affect anything after waitcnt insertion.392* Before waitcnt insertion, this is necessary to consider memory operations.393*/394int latency = MAX3(wait_info.exp, wait_info.lgkm, wait_info.vm);395int32_t result_available = start + MAX2(perf.latency, latency);396397for (Definition& def : instr->definitions) {398int32_t* available = ®_available[def.physReg().reg()];399for (unsigned i = 0; i < def.size(); i++)400available[i] = MAX2(available[i], result_available);401}402}403404static void405join_queue(std::deque<int32_t>& queue, const std::deque<int32_t>& pred, int 
cycle_diff)406{407for (unsigned i = 0; i < MIN2(queue.size(), pred.size()); i++)408queue.rbegin()[i] = MAX2(queue.rbegin()[i], pred.rbegin()[i] + cycle_diff);409for (int i = pred.size() - queue.size() - 1; i >= 0; i--)410queue.push_front(pred[i] + cycle_diff);411}412413void414BlockCycleEstimator::join(const BlockCycleEstimator& pred)415{416assert(cur_cycle == 0);417418for (unsigned i = 0; i < (unsigned)resource_count; i++) {419assert(res_usage[i] == 0);420res_available[i] = MAX2(res_available[i], pred.res_available[i] - pred.cur_cycle);421}422423for (unsigned i = 0; i < 512; i++)424reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle);425426join_queue(lgkm, pred.lgkm, -pred.cur_cycle);427join_queue(exp, pred.exp, -pred.cur_cycle);428join_queue(vm, pred.vm, -pred.cur_cycle);429join_queue(vs, pred.vs, -pred.cur_cycle);430}431432/* instructions/branches/vmem_clauses/smem_clauses/cycles */433void434collect_preasm_stats(Program* program)435{436for (Block& block : program->blocks) {437std::set<Instruction*> vmem_clause;438std::set<Instruction*> smem_clause;439440program->statistics[statistic_instructions] += block.instructions.size();441442for (aco_ptr<Instruction>& instr : block.instructions) {443if (instr->isSOPP() && instr->sopp().block != -1)444program->statistics[statistic_branches]++;445446if (instr->opcode == aco_opcode::p_constaddr)447program->statistics[statistic_instructions] += 2;448449if (instr->isVMEM() && !instr->operands.empty()) {450if (std::none_of(vmem_clause.begin(), vmem_clause.end(),451[&](Instruction* other)452{ return should_form_clause(instr.get(), other); }))453program->statistics[statistic_vmem_clauses]++;454vmem_clause.insert(instr.get());455} else {456vmem_clause.clear();457}458459if (instr->isSMEM() && !instr->operands.empty()) {460if (std::none_of(smem_clause.begin(), smem_clause.end(),461[&](Instruction* other)462{ return should_form_clause(instr.get(), other); 
}))463program->statistics[statistic_smem_clauses]++;464smem_clause.insert(instr.get());465} else {466smem_clause.clear();467}468}469}470471double latency = 0;472double usage[(int)BlockCycleEstimator::resource_count] = {0};473std::vector<BlockCycleEstimator> blocks(program->blocks.size(), program);474475for (Block& block : program->blocks) {476BlockCycleEstimator& block_est = blocks[block.index];477for (unsigned pred : block.linear_preds)478block_est.join(blocks[pred]);479480for (aco_ptr<Instruction>& instr : block.instructions) {481unsigned before = block_est.cur_cycle;482block_est.add(instr);483instr->pass_flags = block_est.cur_cycle - before;484}485486/* TODO: it would be nice to be able to consider estimated loop trip487* counts used for loop unrolling.488*/489490/* TODO: estimate the trip_count of divergent loops (those which break491* divergent) higher than of uniform loops492*/493494/* Assume loops execute 8-2 times, uniform branches are taken 50% the time,495* and any lane in the wave takes a side of a divergent branch 75% of the496* time.497*/498double iter = 1.0;499iter *= block.loop_nest_depth > 0 ? 8.0 : 1.0;500iter *= block.loop_nest_depth > 1 ? 4.0 : 1.0;501iter *= block.loop_nest_depth > 2 ? pow(2.0, block.loop_nest_depth - 2) : 1.0;502iter *= pow(0.5, block.uniform_if_depth);503iter *= pow(0.75, block.divergent_if_logical_depth);504505bool divergent_if_linear_else =506block.logical_preds.empty() && block.linear_preds.size() == 1 &&507block.linear_succs.size() == 1 &&508program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert);509if (divergent_if_linear_else)510iter *= 0.25;511512latency += block_est.cur_cycle * iter;513for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++)514usage[i] += block_est.res_usage[i] * iter;515}516517/* This likely exaggerates the effectiveness of parallelism because it518* ignores instruction ordering. 
It can assume there might be SALU/VALU/etc519* work to from other waves while one is idle but that might not be the case520* because those other waves have not reached such a point yet.521*/522523double parallelism = program->num_waves;524for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++) {525if (usage[i] > 0.0)526parallelism = MIN2(parallelism, latency / usage[i]);527}528double waves_per_cycle = 1.0 / latency * parallelism;529double wave64_per_cycle = waves_per_cycle * (program->wave_size / 64.0);530531double max_utilization = 1.0;532if (program->workgroup_size != UINT_MAX)533max_utilization =534program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);535wave64_per_cycle *= max_utilization;536537program->statistics[statistic_latency] = round(latency);538program->statistics[statistic_inv_throughput] = round(1.0 / wave64_per_cycle);539540if (debug_flags & DEBUG_PERF_INFO) {541aco_print_program(program, stderr, print_no_ssa | print_perf_info);542543fprintf(stderr, "num_waves: %u\n", program->num_waves);544fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]);545fprintf(stderr, "branch_sendmsg_usage: %f\n",546usage[(int)BlockCycleEstimator::branch_sendmsg]);547fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]);548fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]);549fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]);550fprintf(stderr, "export_gds_usage: %f\n", usage[(int)BlockCycleEstimator::export_gds]);551fprintf(stderr, "vmem_usage: %f\n", usage[(int)BlockCycleEstimator::vmem]);552fprintf(stderr, "latency: %f\n", latency);553fprintf(stderr, "parallelism: %f\n", parallelism);554fprintf(stderr, "max_utilization: %f\n", max_utilization);555fprintf(stderr, "wave64_per_cycle: %f\n", wave64_per_cycle);556fprintf(stderr, "\n");557}558}559560void561collect_postasm_stats(Program* program, const 
std::vector<uint32_t>& code)562{563program->statistics[aco::statistic_hash] = util_hash_crc32(code.data(), code.size() * 4);564}565566} // namespace aco567568569