Path: blob/21.2-virgl/src/amd/compiler/aco_live_var_analysis.cpp
4550 views
/*1* Copyright © 2018 Valve Corporation2* Copyright © 2018 Google3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* the rights to use, copy, modify, merge, publish, distribute, sublicense,8* and/or sell copies of the Software, and to permit persons to whom the9* Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice (including the next12* paragraph) shall be included in all copies or substantial portions of the13* Software.14*15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS21* IN THE SOFTWARE.22*23*/2425#include "aco_ir.h"2627#include "util/u_math.h"2829#include <set>30#include <vector>3132namespace aco {33RegisterDemand34get_live_changes(aco_ptr<Instruction>& instr)35{36RegisterDemand changes;37for (const Definition& def : instr->definitions) {38if (!def.isTemp() || def.isKill())39continue;40changes += def.getTemp();41}4243for (const Operand& op : instr->operands) {44if (!op.isTemp() || !op.isFirstKill())45continue;46changes -= op.getTemp();47}4849return changes;50}5152RegisterDemand53get_temp_registers(aco_ptr<Instruction>& instr)54{55RegisterDemand temp_registers;5657for (Definition def : instr->definitions) {58if (!def.isTemp())59continue;60if (def.isKill())61temp_registers += def.getTemp();62}6364for (Operand op : instr->operands) {65if (op.isTemp() && op.isLateKill() && op.isFirstKill())66temp_registers += op.getTemp();67}6869return temp_registers;70}7172RegisterDemand73get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,74aco_ptr<Instruction>& instr_before)75{76demand -= get_live_changes(instr);77demand -= get_temp_registers(instr);78if (instr_before)79demand += get_temp_registers(instr_before);80return demand;81}8283namespace {84void85process_live_temps_per_block(Program* program, live& lives, Block* block,86unsigned& worklist, std::vector<uint16_t>& phi_sgpr_ops)87{88std::vector<RegisterDemand>& register_demand = lives.register_demand[block->index];89RegisterDemand new_demand;9091register_demand.resize(block->instructions.size());92RegisterDemand block_register_demand;93IDSet live = lives.live_out[block->index];9495/* initialize register demand */96for (unsigned t : live)97new_demand += Temp(t, program->temp_rc[t]);98new_demand.sgpr -= phi_sgpr_ops[block->index];99100/* traverse the instructions backwards */101int idx;102for (idx = block->instructions.size() - 1; idx >= 0; idx--) {103Instruction* insn = block->instructions[idx].get();104if (is_phi(insn))105break;106107register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr);108109/* KILL */110for (Definition& definition : insn->definitions) {111if (!definition.isTemp()) {112continue;113}114if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc)115program->needs_vcc = true;116117const Temp temp = definition.getTemp();118const size_t n = live.erase(temp.id());119120if (n) {121new_demand -= temp;122definition.setKill(false);123} else {124register_demand[idx] += temp;125definition.setKill(true);126}127}128129/* GEN */130if (insn->opcode == aco_opcode::p_logical_end) {131new_demand.sgpr += phi_sgpr_ops[block->index];132} else {133/* we need to do this in a separate loop because the next one can134* setKill() for several operands at once and we don't want to135* overwrite that in a later iteration */136for (Operand& op : insn->operands)137op.setKill(false);138139for (unsigned i = 0; i < insn->operands.size(); ++i) {140Operand& operand = insn->operands[i];141if (!operand.isTemp())142continue;143if (operand.isFixed() && operand.physReg() == vcc)144program->needs_vcc = true;145const Temp temp = operand.getTemp();146const bool inserted = live.insert(temp.id()).second;147if (inserted) {148operand.setFirstKill(true);149for (unsigned j = i + 1; j < insn->operands.size(); ++j) {150if (insn->operands[j].isTemp() &&151insn->operands[j].tempId() == operand.tempId()) {152insn->operands[j].setFirstKill(false);153insn->operands[j].setKill(true);154}155}156if (operand.isLateKill())157register_demand[idx] += temp;158new_demand += temp;159}160}161}162163block_register_demand.update(register_demand[idx]);164}165166/* update block's register demand for a last time */167block_register_demand.update(new_demand);168if (program->progress < CompilationProgress::after_ra)169block->register_demand = block_register_demand;170171/* handle phi definitions */172int phi_idx = idx;173while (phi_idx >= 0) {174register_demand[phi_idx] = new_demand;175Instruction* insn = block->instructions[phi_idx].get();176177assert(is_phi(insn) && insn->definitions.size() == 1);178if (!insn->definitions[0].isTemp()) {179assert(insn->definitions[0].isFixed() && insn->definitions[0].physReg() == exec);180phi_idx--;181continue;182}183Definition& definition = insn->definitions[0];184if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc)185program->needs_vcc = true;186const Temp temp = definition.getTemp();187const size_t n = live.erase(temp.id());188189if (n)190definition.setKill(false);191else192definition.setKill(true);193194phi_idx--;195}196197/* now, we need to merge the live-ins into the live-out sets */198for (unsigned t : live) {199RegClass rc = program->temp_rc[t];200std::vector<unsigned>& preds = rc.is_linear() ? block->linear_preds : block->logical_preds;201202#ifndef NDEBUG203if (preds.empty())204aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t,205block->index);206#endif207208for (unsigned pred_idx : preds) {209auto it = lives.live_out[pred_idx].insert(t);210if (it.second)211worklist = std::max(worklist, pred_idx + 1);212}213}214215/* handle phi operands */216phi_idx = idx;217while (phi_idx >= 0) {218Instruction* insn = block->instructions[phi_idx].get();219assert(is_phi(insn));220/* directly insert into the predecessors live-out set */221std::vector<unsigned>& preds =222insn->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;223for (unsigned i = 0; i < preds.size(); ++i) {224Operand& operand = insn->operands[i];225if (!operand.isTemp())226continue;227if (operand.isFixed() && operand.physReg() == vcc)228program->needs_vcc = true;229/* check if we changed an already processed block */230const bool inserted = lives.live_out[preds[i]].insert(operand.tempId()).second;231if (inserted) {232worklist = std::max(worklist, preds[i] + 1);233if (insn->opcode == aco_opcode::p_phi && operand.getTemp().type() == RegType::sgpr)234phi_sgpr_ops[preds[i]] += operand.size();235}236237/* set if the operand is killed by this (or another) phi instruction */238operand.setKill(!live.count(operand.tempId()));239}240phi_idx--;241}242243assert(block->index != 0 || (new_demand == RegisterDemand() && live.empty()));244}245246unsigned247calc_waves_per_workgroup(Program* program)248{249/* When workgroup size is not known, just go with wave_size */250unsigned workgroup_size =251program->workgroup_size == UINT_MAX ? program->wave_size : program->workgroup_size;252253return align(workgroup_size, program->wave_size) / program->wave_size;254}255} /* end namespace */256257uint16_t258get_extra_sgprs(Program* program)259{260if (program->chip_class >= GFX10) {261assert(!program->needs_flat_scr);262assert(!program->dev.xnack_enabled);263return 0;264} else if (program->chip_class >= GFX8) {265if (program->needs_flat_scr)266return 6;267else if (program->dev.xnack_enabled)268return 4;269else if (program->needs_vcc)270return 2;271else272return 0;273} else {274assert(!program->dev.xnack_enabled);275if (program->needs_flat_scr)276return 4;277else if (program->needs_vcc)278return 2;279else280return 0;281}282}283284uint16_t285get_sgpr_alloc(Program* program, uint16_t addressable_sgprs)286{287uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program);288uint16_t granule = program->dev.sgpr_alloc_granule;289return ALIGN_NPOT(std::max(sgprs, granule), granule);290}291292uint16_t293get_vgpr_alloc(Program* program, uint16_t addressable_vgprs)294{295assert(addressable_vgprs <= program->dev.vgpr_limit);296uint16_t granule = program->dev.vgpr_alloc_granule;297return align(std::max(addressable_vgprs, granule), granule);298}299300unsigned301round_down(unsigned a, unsigned b)302{303return a - (a % b);304}305306uint16_t307get_addr_sgpr_from_waves(Program* program, uint16_t waves)308{309/* it's not possible to allocate more than 128 SGPRs */310uint16_t sgprs = std::min(program->dev.physical_sgprs / waves, 128);311sgprs = round_down(sgprs, program->dev.sgpr_alloc_granule);312sgprs -= get_extra_sgprs(program);313return std::min(sgprs, program->dev.sgpr_limit);314}315316uint16_t317get_addr_vgpr_from_waves(Program* program, uint16_t waves)318{319uint16_t vgprs = program->dev.physical_vgprs / waves & ~(program->dev.vgpr_alloc_granule - 1);320vgprs -= program->config->num_shared_vgprs / 2;321return std::min(vgprs, program->dev.vgpr_limit);322}323324void325calc_min_waves(Program* program)326{327unsigned waves_per_workgroup = calc_waves_per_workgroup(program);328unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);329program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp);330}331332void333update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)334{335unsigned max_waves_per_simd = program->dev.max_wave64_per_simd * (64 / program->wave_size);336unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);337unsigned lds_limit = program->wgp_mode ? program->dev.lds_limit * 2 : program->dev.lds_limit;338339assert(program->min_waves >= 1);340uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);341uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);342343/* this won't compile, register pressure reduction necessary */344if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) {345program->num_waves = 0;346program->max_reg_demand = new_demand;347} else {348program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);349uint16_t vgpr_demand =350get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;351program->num_waves =352std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand);353program->max_waves = max_waves_per_simd;354355/* adjust max_waves for workgroup and LDS limits */356unsigned waves_per_workgroup = calc_waves_per_workgroup(program);357unsigned workgroups_per_cu_wgp = max_waves_per_simd * simd_per_cu_wgp / waves_per_workgroup;358if (program->config->lds_size) {359unsigned lds = program->config->lds_size * program->dev.lds_encoding_granule;360lds = align(lds, program->dev.lds_alloc_granule);361workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds);362}363if (waves_per_workgroup > 1 && program->chip_class < GFX10)364workgroups_per_cu_wgp = std::min(365workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */366367/* in cases like waves_per_workgroup=3 or lds=65536 and368* waves_per_workgroup=1, we want the maximum possible number of waves per369* SIMD and not the minimum. so DIV_ROUND_UP is used */370program->max_waves = std::min<uint16_t>(371program->max_waves,372DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp));373374/* incorporate max_waves and calculate max_reg_demand */375program->num_waves = std::min<uint16_t>(program->num_waves, program->max_waves);376program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves);377program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves);378}379}380381live382live_var_analysis(Program* program)383{384live result;385result.live_out.resize(program->blocks.size());386result.register_demand.resize(program->blocks.size());387unsigned worklist = program->blocks.size();388std::vector<uint16_t> phi_sgpr_ops(program->blocks.size());389RegisterDemand new_demand;390391program->needs_vcc = false;392393/* this implementation assumes that the block idx corresponds to the block's position in394* program->blocks vector */395while (worklist) {396unsigned block_idx = --worklist;397process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist,398phi_sgpr_ops);399new_demand.update(program->blocks[block_idx].register_demand);400}401402/* calculate the program's register demand and number of waves */403if (program->progress < CompilationProgress::after_ra)404update_vgpr_sgpr_demand(program, new_demand);405406return result;407}408409} // namespace aco410411412