Path: blob/21.2-virgl/src/amd/compiler/aco_spill.cpp
4550 views
/*1* Copyright © 2018 Valve Corporation2* Copyright © 2018 Google3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* the rights to use, copy, modify, merge, publish, distribute, sublicense,8* and/or sell copies of the Software, and to permit persons to whom the9* Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice (including the next12* paragraph) shall be included in all copies or substantial portions of the13* Software.14*15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS21* IN THE SOFTWARE.22*23*/2425#include "aco_builder.h"26#include "aco_ir.h"2728#include "common/sid.h"2930#include <map>31#include <set>32#include <stack>33#include <unordered_set>34#include <vector>3536/*37* Implements the spilling algorithm on SSA-form from38* "Register Spilling and Live-Range Splitting for SSA-Form Programs"39* by Matthias Braun and Sebastian Hack.40*/4142namespace aco {4344namespace {4546struct remat_info {47Instruction* instr;48};4950struct spill_ctx {51RegisterDemand target_pressure;52Program* program;53std::vector<std::vector<RegisterDemand>> register_demand;54std::vector<std::map<Temp, Temp>> renames;55std::vector<std::map<Temp, uint32_t>> spills_entry;56std::vector<std::map<Temp, uint32_t>> spills_exit;57std::vector<bool> processed;58std::stack<Block*> loop_header;59std::vector<std::map<Temp, std::pair<uint32_t, uint32_t>>> next_use_distances_start;60std::vector<std::map<Temp, std::pair<uint32_t, uint32_t>>> next_use_distances_end;61std::vector<std::pair<RegClass, std::unordered_set<uint32_t>>> interferences;62std::vector<std::vector<uint32_t>> affinities;63std::vector<bool> is_reloaded;64std::map<Temp, remat_info> remat;65std::map<Instruction*, bool> remat_used;66unsigned wave_size;6768spill_ctx(const RegisterDemand target_pressure_, Program* program_,69std::vector<std::vector<RegisterDemand>> register_demand_)70: target_pressure(target_pressure_), program(program_),71register_demand(std::move(register_demand_)), renames(program->blocks.size()),72spills_entry(program->blocks.size()), spills_exit(program->blocks.size()),73processed(program->blocks.size(), false), wave_size(program->wave_size)74{}7576void add_affinity(uint32_t first, uint32_t second)77{78unsigned found_first = affinities.size();79unsigned found_second = affinities.size();80for (unsigned i = 0; i < affinities.size(); i++) {81std::vector<uint32_t>& vec = affinities[i];82for (uint32_t entry : vec) {83if (entry == first)84found_first = i;85else if (entry == second)86found_second = i;87}88}89if (found_first == affinities.size() && found_second == affinities.size()) {90affinities.emplace_back(std::vector<uint32_t>({first, second}));91} else if (found_first < affinities.size() && found_second == affinities.size()) {92affinities[found_first].push_back(second);93} else if (found_second < affinities.size() && found_first == affinities.size()) {94affinities[found_second].push_back(first);95} else if (found_first != found_second) {96/* merge second into first */97affinities[found_first].insert(affinities[found_first].end(),98affinities[found_second].begin(),99affinities[found_second].end());100affinities.erase(std::next(affinities.begin(), found_second));101} else {102assert(found_first == found_second);103}104}105106void add_interference(uint32_t first, uint32_t second)107{108if (interferences[first].first.type() != interferences[second].first.type())109return;110111bool inserted = interferences[first].second.insert(second).second;112if (inserted)113interferences[second].second.insert(first);114}115116uint32_t allocate_spill_id(RegClass rc)117{118interferences.emplace_back(rc, std::unordered_set<uint32_t>());119is_reloaded.push_back(false);120return next_spill_id++;121}122123uint32_t next_spill_id = 0;124};125126int32_t127get_dominator(int idx_a, int idx_b, Program* program, bool is_linear)128{129130if (idx_a == -1)131return idx_b;132if (idx_b == -1)133return idx_a;134if (is_linear) {135while (idx_a != idx_b) {136if (idx_a > idx_b)137idx_a = program->blocks[idx_a].linear_idom;138else139idx_b = program->blocks[idx_b].linear_idom;140}141} else {142while (idx_a != idx_b) {143if (idx_a > idx_b)144idx_a = program->blocks[idx_a].logical_idom;145else146idx_b = program->blocks[idx_b].logical_idom;147}148}149assert(idx_a != -1);150return idx_a;151}152153void154next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set<uint32_t>& worklist)155{156Block* block = &ctx.program->blocks[block_idx];157std::map<Temp, std::pair<uint32_t, uint32_t>> next_uses = ctx.next_use_distances_end[block_idx];158159/* to compute the next use distance at the beginning of the block, we have to add the block's160* size */161for (std::map<Temp, std::pair<uint32_t, uint32_t>>::iterator it = next_uses.begin();162it != next_uses.end(); ++it)163it->second.second = it->second.second + block->instructions.size();164165int idx = block->instructions.size() - 1;166while (idx >= 0) {167aco_ptr<Instruction>& instr = block->instructions[idx];168169if (instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi)170break;171172for (const Definition& def : instr->definitions) {173if (def.isTemp())174next_uses.erase(def.getTemp());175}176177for (const Operand& op : instr->operands) {178/* omit exec mask */179if (op.isFixed() && op.physReg() == exec)180continue;181if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear())182continue;183if (op.isTemp())184next_uses[op.getTemp()] = {block_idx, idx};185}186idx--;187}188189assert(block_idx != 0 || next_uses.empty());190ctx.next_use_distances_start[block_idx] = next_uses;191while (idx >= 0) {192aco_ptr<Instruction>& instr = block->instructions[idx];193assert(instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi);194195if (!instr->definitions[0].isTemp()) {196idx--;197continue;198}199200auto it = next_uses.find(instr->definitions[0].getTemp());201std::pair<uint32_t, uint32_t> distance =202it == next_uses.end() ? std::make_pair(block_idx, 0u) : it->second;203for (unsigned i = 0; i < instr->operands.size(); i++) {204unsigned pred_idx =205instr->opcode == aco_opcode::p_phi ? block->logical_preds[i] : block->linear_preds[i];206if (instr->operands[i].isTemp()) {207if (ctx.next_use_distances_end[pred_idx].find(instr->operands[i].getTemp()) ==208ctx.next_use_distances_end[pred_idx].end() ||209ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] != distance)210worklist.insert(pred_idx);211ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] = distance;212}213}214next_uses.erase(instr->definitions[0].getTemp());215idx--;216}217218/* all remaining live vars must be live-out at the predecessors */219for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : next_uses) {220Temp temp = pair.first;221uint32_t distance = pair.second.second;222uint32_t dom = pair.second.first;223std::vector<unsigned>& preds = temp.is_linear() ? block->linear_preds : block->logical_preds;224for (unsigned pred_idx : preds) {225if (ctx.program->blocks[pred_idx].loop_nest_depth > block->loop_nest_depth)226distance += 0xFFFF;227if (ctx.next_use_distances_end[pred_idx].find(temp) !=228ctx.next_use_distances_end[pred_idx].end()) {229dom = get_dominator(dom, ctx.next_use_distances_end[pred_idx][temp].first, ctx.program,230temp.is_linear());231distance = std::min(ctx.next_use_distances_end[pred_idx][temp].second, distance);232}233if (ctx.next_use_distances_end[pred_idx][temp] !=234std::pair<uint32_t, uint32_t>{dom, distance})235worklist.insert(pred_idx);236ctx.next_use_distances_end[pred_idx][temp] = {dom, distance};237}238}239}240241void242compute_global_next_uses(spill_ctx& ctx)243{244ctx.next_use_distances_start.resize(ctx.program->blocks.size());245ctx.next_use_distances_end.resize(ctx.program->blocks.size());246std::set<uint32_t> worklist;247for (Block& block : ctx.program->blocks)248worklist.insert(block.index);249250while (!worklist.empty()) {251std::set<unsigned>::reverse_iterator b_it = worklist.rbegin();252unsigned block_idx = *b_it;253worklist.erase(block_idx);254next_uses_per_block(ctx, block_idx, worklist);255}256}257258bool259should_rematerialize(aco_ptr<Instruction>& instr)260{261/* TODO: rematerialization is only supported for VOP1, SOP1 and PSEUDO */262if (instr->format != Format::VOP1 && instr->format != Format::SOP1 &&263instr->format != Format::PSEUDO && instr->format != Format::SOPK)264return false;265/* TODO: pseudo-instruction rematerialization is only supported for266* p_create_vector/p_parallelcopy */267if (instr->isPseudo() && instr->opcode != aco_opcode::p_create_vector &&268instr->opcode != aco_opcode::p_parallelcopy)269return false;270if (instr->isSOPK() && instr->opcode != aco_opcode::s_movk_i32)271return false;272273for (const Operand& op : instr->operands) {274/* TODO: rematerialization using temporaries isn't yet supported */275if (!op.isConstant())276return false;277}278279/* TODO: rematerialization with multiple definitions isn't yet supported */280if (instr->definitions.size() > 1)281return false;282283return true;284}285286aco_ptr<Instruction>287do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id)288{289std::map<Temp, remat_info>::iterator remat = ctx.remat.find(tmp);290if (remat != ctx.remat.end()) {291Instruction* instr = remat->second.instr;292assert((instr->isVOP1() || instr->isSOP1() || instr->isPseudo() || instr->isSOPK()) &&293"unsupported");294assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector ||295instr->opcode == aco_opcode::p_parallelcopy) &&296"unsupported");297assert(instr->definitions.size() == 1 && "unsupported");298299aco_ptr<Instruction> res;300if (instr->isVOP1()) {301res.reset(create_instruction<VOP1_instruction>(302instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));303} else if (instr->isSOP1()) {304res.reset(create_instruction<SOP1_instruction>(305instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));306} else if (instr->isPseudo()) {307res.reset(create_instruction<Pseudo_instruction>(308instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));309} else if (instr->isSOPK()) {310res.reset(create_instruction<SOPK_instruction>(311instr->opcode, instr->format, instr->operands.size(), instr->definitions.size()));312res->sopk().imm = instr->sopk().imm;313}314for (unsigned i = 0; i < instr->operands.size(); i++) {315res->operands[i] = instr->operands[i];316if (instr->operands[i].isTemp()) {317assert(false && "unsupported");318if (ctx.remat.count(instr->operands[i].getTemp()))319ctx.remat_used[ctx.remat[instr->operands[i].getTemp()].instr] = true;320}321}322res->definitions[0] = Definition(new_name);323return res;324} else {325aco_ptr<Pseudo_instruction> reload{326create_instruction<Pseudo_instruction>(aco_opcode::p_reload, Format::PSEUDO, 1, 1)};327reload->operands[0] = Operand::c32(spill_id);328reload->definitions[0] = Definition(new_name);329ctx.is_reloaded[spill_id] = true;330return reload;331}332}333334void335get_rematerialize_info(spill_ctx& ctx)336{337for (Block& block : ctx.program->blocks) {338bool logical = false;339for (aco_ptr<Instruction>& instr : block.instructions) {340if (instr->opcode == aco_opcode::p_logical_start)341logical = true;342else if (instr->opcode == aco_opcode::p_logical_end)343logical = false;344if (logical && should_rematerialize(instr)) {345for (const Definition& def : instr->definitions) {346if (def.isTemp()) {347ctx.remat[def.getTemp()] = remat_info{instr.get()};348ctx.remat_used[instr.get()] = false;349}350}351}352}353}354}355356std::vector<std::map<Temp, uint32_t>>357local_next_uses(spill_ctx& ctx, Block* block)358{359std::vector<std::map<Temp, uint32_t>> local_next_uses(block->instructions.size());360361std::map<Temp, uint32_t> next_uses;362for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair :363ctx.next_use_distances_end[block->index])364next_uses[pair.first] = pair.second.second + block->instructions.size();365366for (int idx = block->instructions.size() - 1; idx >= 0; idx--) {367aco_ptr<Instruction>& instr = block->instructions[idx];368if (!instr)369break;370if (instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi)371break;372373for (const Operand& op : instr->operands) {374if (op.isFixed() && op.physReg() == exec)375continue;376if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear())377continue;378if (op.isTemp())379next_uses[op.getTemp()] = idx;380}381for (const Definition& def : instr->definitions) {382if (def.isTemp())383next_uses.erase(def.getTemp());384}385local_next_uses[idx] = next_uses;386}387return local_next_uses;388}389390RegisterDemand391get_demand_before(spill_ctx& ctx, unsigned block_idx, unsigned idx)392{393if (idx == 0) {394RegisterDemand demand = ctx.register_demand[block_idx][idx];395aco_ptr<Instruction>& instr = ctx.program->blocks[block_idx].instructions[idx];396aco_ptr<Instruction> instr_before(nullptr);397return get_demand_before(demand, instr, instr_before);398} else {399return ctx.register_demand[block_idx][idx - 1];400}401}402403RegisterDemand404get_live_in_demand(spill_ctx& ctx, unsigned block_idx)405{406unsigned idx = 0;407RegisterDemand reg_pressure = RegisterDemand();408Block& block = ctx.program->blocks[block_idx];409for (aco_ptr<Instruction>& phi : block.instructions) {410if (!is_phi(phi))411break;412idx++;413414/* Killed phi definitions increase pressure in the predecessor but not415* the block they're in. Since the loops below are both to control416* pressure of the start of this block and the ends of it's417* predecessors, we need to count killed unspilled phi definitions here. */418if (phi->definitions[0].isTemp() && phi->definitions[0].isKill() &&419!ctx.spills_entry[block_idx].count(phi->definitions[0].getTemp()))420reg_pressure += phi->definitions[0].getTemp();421}422423reg_pressure += get_demand_before(ctx, block_idx, idx);424425/* Consider register pressure from linear predecessors. This can affect426* reg_pressure if the branch instructions define sgprs. */427for (unsigned pred : block.linear_preds)428reg_pressure.sgpr =429std::max<int16_t>(reg_pressure.sgpr, ctx.register_demand[pred].back().sgpr);430431return reg_pressure;432}433434RegisterDemand435init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx)436{437RegisterDemand spilled_registers;438439/* first block, nothing was spilled before */440if (block_idx == 0)441return {0, 0};442443/* next use distances at the beginning of the current block */444auto& next_use_distances = ctx.next_use_distances_start[block_idx];445446/* loop header block */447if (block->loop_nest_depth > ctx.program->blocks[block_idx - 1].loop_nest_depth) {448assert(block->linear_preds[0] == block_idx - 1);449assert(block->logical_preds[0] == block_idx - 1);450451/* create new loop_info */452ctx.loop_header.emplace(block);453454/* check how many live-through variables should be spilled */455RegisterDemand reg_pressure = get_live_in_demand(ctx, block_idx);456RegisterDemand loop_demand = reg_pressure;457unsigned i = block_idx;458while (ctx.program->blocks[i].loop_nest_depth >= block->loop_nest_depth) {459assert(ctx.program->blocks.size() > i);460loop_demand.update(ctx.program->blocks[i++].register_demand);461}462unsigned loop_end = i;463464for (auto spilled : ctx.spills_exit[block_idx - 1]) {465auto it = next_use_distances.find(spilled.first);466467/* variable is not live at loop entry: probably a phi operand */468if (it == next_use_distances.end())469continue;470471/* keep constants and live-through variables spilled */472if (it->second.first >= loop_end || ctx.remat.count(spilled.first)) {473ctx.spills_entry[block_idx][spilled.first] = spilled.second;474spilled_registers += spilled.first;475loop_demand -= spilled.first;476}477}478479/* select live-through variables and constants */480RegType type = RegType::vgpr;481while (loop_demand.exceeds(ctx.target_pressure)) {482/* if VGPR demand is low enough, select SGPRs */483if (type == RegType::vgpr && loop_demand.vgpr <= ctx.target_pressure.vgpr)484type = RegType::sgpr;485/* if SGPR demand is low enough, break */486if (type == RegType::sgpr && loop_demand.sgpr <= ctx.target_pressure.sgpr)487break;488489unsigned distance = 0;490Temp to_spill;491for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : next_use_distances) {492if (pair.first.type() == type &&493(pair.second.first >= loop_end ||494(ctx.remat.count(pair.first) && type == RegType::sgpr)) &&495pair.second.second > distance &&496ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) {497to_spill = pair.first;498distance = pair.second.second;499}500}501502/* select SGPRs or break */503if (distance == 0) {504if (type == RegType::sgpr)505break;506type = RegType::sgpr;507continue;508}509510uint32_t spill_id;511if (ctx.spills_exit[block_idx - 1].find(to_spill) ==512ctx.spills_exit[block_idx - 1].end()) {513spill_id = ctx.allocate_spill_id(to_spill.regClass());514} else {515spill_id = ctx.spills_exit[block_idx - 1][to_spill];516}517518ctx.spills_entry[block_idx][to_spill] = spill_id;519spilled_registers += to_spill;520loop_demand -= to_spill;521}522523/* shortcut */524if (!loop_demand.exceeds(ctx.target_pressure))525return spilled_registers;526527/* if reg pressure is too high at beginning of loop, add variables with furthest use */528reg_pressure -= spilled_registers;529530while (reg_pressure.exceeds(ctx.target_pressure)) {531unsigned distance = 0;532Temp to_spill;533type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr;534535for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : next_use_distances) {536if (pair.first.type() == type && pair.second.second > distance &&537ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) {538to_spill = pair.first;539distance = pair.second.second;540}541}542assert(distance != 0);543ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass());544spilled_registers += to_spill;545reg_pressure -= to_spill;546}547548return spilled_registers;549}550551/* branch block */552if (block->linear_preds.size() == 1 && !(block->kind & block_kind_loop_exit)) {553/* keep variables spilled if they are alive and not used in the current block */554unsigned pred_idx = block->linear_preds[0];555for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {556if (pair.first.type() == RegType::sgpr &&557next_use_distances.find(pair.first) != next_use_distances.end() &&558next_use_distances[pair.first].first != block_idx) {559ctx.spills_entry[block_idx].insert(pair);560spilled_registers.sgpr += pair.first.size();561}562}563if (block->logical_preds.size() == 1) {564pred_idx = block->logical_preds[0];565for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {566if (pair.first.type() == RegType::vgpr &&567next_use_distances.find(pair.first) != next_use_distances.end() &&568next_use_distances[pair.first].first != block_idx) {569ctx.spills_entry[block_idx].insert(pair);570spilled_registers.vgpr += pair.first.size();571}572}573}574575/* if register demand is still too high, we just keep all spilled live vars576* and process the block */577if (block->register_demand.sgpr - spilled_registers.sgpr > ctx.target_pressure.sgpr) {578pred_idx = block->linear_preds[0];579for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {580if (pair.first.type() == RegType::sgpr &&581next_use_distances.find(pair.first) != next_use_distances.end() &&582ctx.spills_entry[block_idx].insert(pair).second) {583spilled_registers.sgpr += pair.first.size();584}585}586}587if (block->register_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr &&588block->logical_preds.size() == 1) {589pred_idx = block->logical_preds[0];590for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {591if (pair.first.type() == RegType::vgpr &&592next_use_distances.find(pair.first) != next_use_distances.end() &&593ctx.spills_entry[block_idx].insert(pair).second) {594spilled_registers.vgpr += pair.first.size();595}596}597}598599return spilled_registers;600}601602/* else: merge block */603std::set<Temp> partial_spills;604605/* keep variables spilled on all incoming paths */606for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair : next_use_distances) {607std::vector<unsigned>& preds =608pair.first.is_linear() ? block->linear_preds : block->logical_preds;609/* If it can be rematerialized, keep the variable spilled if all predecessors do not reload610* it. Otherwise, if any predecessor reloads it, ensure it's reloaded on all other611* predecessors. The idea is that it's better in practice to rematerialize redundantly than to612* create lots of phis. */613/* TODO: test this idea with more than Dawn of War III shaders (the current pipeline-db614* doesn't seem to exercise this path much) */615bool remat = ctx.remat.count(pair.first);616bool spill = !remat;617uint32_t spill_id = 0;618for (unsigned pred_idx : preds) {619/* variable is not even live at the predecessor: probably from a phi */620if (ctx.next_use_distances_end[pred_idx].find(pair.first) ==621ctx.next_use_distances_end[pred_idx].end()) {622spill = false;623break;624}625if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end()) {626if (!remat)627spill = false;628} else {629partial_spills.insert(pair.first);630/* it might be that on one incoming path, the variable has a different spill_id, but631* add_couple_code() will take care of that. */632spill_id = ctx.spills_exit[pred_idx][pair.first];633if (remat)634spill = true;635}636}637if (spill) {638ctx.spills_entry[block_idx][pair.first] = spill_id;639partial_spills.erase(pair.first);640spilled_registers += pair.first;641}642}643644/* same for phis */645for (aco_ptr<Instruction>& phi : block->instructions) {646if (!is_phi(phi))647break;648if (!phi->definitions[0].isTemp())649continue;650651std::vector<unsigned>& preds =652phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;653bool spill = true;654655for (unsigned i = 0; i < phi->operands.size(); i++) {656/* non-temp operands can increase the register pressure */657if (!phi->operands[i].isTemp()) {658partial_spills.insert(phi->definitions[0].getTemp());659continue;660}661662if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) ==663ctx.spills_exit[preds[i]].end())664spill = false;665else666partial_spills.insert(phi->definitions[0].getTemp());667}668if (spill) {669ctx.spills_entry[block_idx][phi->definitions[0].getTemp()] =670ctx.allocate_spill_id(phi->definitions[0].regClass());671partial_spills.erase(phi->definitions[0].getTemp());672spilled_registers += phi->definitions[0].getTemp();673}674}675676/* if reg pressure at first instruction is still too high, add partially spilled variables */677RegisterDemand reg_pressure = get_live_in_demand(ctx, block_idx);678reg_pressure -= spilled_registers;679680while (reg_pressure.exceeds(ctx.target_pressure)) {681assert(!partial_spills.empty());682std::set<Temp>::iterator it = partial_spills.begin();683Temp to_spill = Temp();684unsigned distance = 0;685RegType type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr;686687while (it != partial_spills.end()) {688assert(ctx.spills_entry[block_idx].find(*it) == ctx.spills_entry[block_idx].end());689690if (it->type() == type && next_use_distances[*it].second > distance) {691distance = next_use_distances[*it].second;692to_spill = *it;693}694++it;695}696assert(distance != 0);697698ctx.spills_entry[block_idx][to_spill] = ctx.allocate_spill_id(to_spill.regClass());699partial_spills.erase(to_spill);700spilled_registers += to_spill;701reg_pressure -= to_spill;702}703704return spilled_registers;705}706707void708add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)709{710/* no coupling code necessary */711if (block->linear_preds.size() == 0)712return;713714std::vector<aco_ptr<Instruction>> instructions;715/* branch block: TODO take other branch into consideration */716if (block->linear_preds.size() == 1 &&717!(block->kind & (block_kind_loop_exit | block_kind_loop_header))) {718assert(ctx.processed[block->linear_preds[0]]);719assert(ctx.register_demand[block_idx].size() == block->instructions.size());720std::vector<RegisterDemand> reg_demand;721unsigned insert_idx = 0;722RegisterDemand demand_before = get_demand_before(ctx, block_idx, 0);723724for (std::pair<Temp, std::pair<uint32_t, uint32_t>> live :725ctx.next_use_distances_start[block_idx]) {726const unsigned pred_idx = block->linear_preds[0];727728if (!live.first.is_linear())729continue;730/* still spilled */731if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end())732continue;733734/* in register at end of predecessor */735if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) {736std::map<Temp, Temp>::iterator it = ctx.renames[pred_idx].find(live.first);737if (it != ctx.renames[pred_idx].end())738ctx.renames[block_idx].insert(*it);739continue;740}741742/* variable is spilled at predecessor and live at current block: create reload instruction */743Temp new_name = ctx.program->allocateTmp(live.first.regClass());744aco_ptr<Instruction> reload =745do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]);746instructions.emplace_back(std::move(reload));747reg_demand.push_back(demand_before);748ctx.renames[block_idx][live.first] = new_name;749}750751if (block->logical_preds.size() == 1) {752do {753assert(insert_idx < block->instructions.size());754instructions.emplace_back(std::move(block->instructions[insert_idx]));755reg_demand.push_back(ctx.register_demand[block_idx][insert_idx]);756insert_idx++;757} while (instructions.back()->opcode != aco_opcode::p_logical_start);758759unsigned pred_idx = block->logical_preds[0];760for (std::pair<Temp, std::pair<uint32_t, uint32_t>> live :761ctx.next_use_distances_start[block_idx]) {762if (live.first.is_linear())763continue;764/* still spilled */765if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end())766continue;767768/* in register at end of predecessor */769if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) {770std::map<Temp, Temp>::iterator it = ctx.renames[pred_idx].find(live.first);771if (it != ctx.renames[pred_idx].end())772ctx.renames[block_idx].insert(*it);773continue;774}775776/* variable is spilled at predecessor and live at current block:777* create reload instruction */778Temp new_name = ctx.program->allocateTmp(live.first.regClass());779aco_ptr<Instruction> reload =780do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]);781instructions.emplace_back(std::move(reload));782reg_demand.emplace_back(reg_demand.back());783ctx.renames[block_idx][live.first] = new_name;784}785}786787/* combine new reload instructions with original block */788if (!instructions.empty()) {789reg_demand.insert(reg_demand.end(),790std::next(ctx.register_demand[block->index].begin(), insert_idx),791ctx.register_demand[block->index].end());792ctx.register_demand[block_idx] = std::move(reg_demand);793instructions.insert(instructions.end(),794std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(795std::next(block->instructions.begin(), insert_idx)),796std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(797block->instructions.end()));798block->instructions = std::move(instructions);799}800return;801}802803/* loop header and merge blocks: check if all (linear) predecessors have been processed */804for (ASSERTED unsigned pred : block->linear_preds)805assert(ctx.processed[pred]);806807/* iterate the phi nodes for which operands to spill at the predecessor */808for (aco_ptr<Instruction>& phi : block->instructions) {809if (!is_phi(phi))810break;811812/* if the phi is not spilled, add to instructions */813if (!phi->definitions[0].isTemp() ||814ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) ==815ctx.spills_entry[block_idx].end()) {816instructions.emplace_back(std::move(phi));817continue;818}819820std::vector<unsigned>& preds =821phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;822uint32_t def_spill_id = ctx.spills_entry[block_idx][phi->definitions[0].getTemp()];823824for (unsigned i = 0; i < phi->operands.size(); i++) {825if (phi->operands[i].isUndefined())826continue;827828unsigned pred_idx = preds[i];829Operand spill_op = phi->operands[i];830831if (spill_op.isTemp()) {832assert(phi->operands[i].isKill());833Temp var = phi->operands[i].getTemp();834835std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);836/* prevent the definining instruction from being DCE'd if it could be rematerialized */837if (rename_it == ctx.renames[preds[i]].end() && ctx.remat.count(var))838ctx.remat_used[ctx.remat[var].instr] = true;839840/* check if variable is already spilled at predecessor */841std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(var);842if (spilled != ctx.spills_exit[pred_idx].end()) {843if (spilled->second != def_spill_id)844ctx.add_affinity(def_spill_id, spilled->second);845continue;846}847848/* rename if necessary */849if (rename_it != ctx.renames[pred_idx].end()) {850spill_op.setTemp(rename_it->second);851ctx.renames[pred_idx].erase(rename_it);852}853}854855uint32_t spill_id = ctx.allocate_spill_id(phi->definitions[0].regClass());856857/* add interferences and affinity */858for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx])859ctx.add_interference(spill_id, pair.second);860ctx.add_affinity(def_spill_id, spill_id);861862aco_ptr<Pseudo_instruction> spill{863create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};864spill->operands[0] = spill_op;865spill->operands[1] = Operand::c32(spill_id);866Block& pred = ctx.program->blocks[pred_idx];867unsigned idx = pred.instructions.size();868do {869assert(idx != 0);870idx--;871} while (phi->opcode == aco_opcode::p_phi &&872pred.instructions[idx]->opcode != aco_opcode::p_logical_end);873std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);874pred.instructions.insert(it, std::move(spill));875if (spill_op.isTemp())876ctx.spills_exit[pred_idx][spill_op.getTemp()] = spill_id;877}878879/* remove phi from instructions */880phi.reset();881}882883/* iterate all (other) spilled variables for which to spill at the predecessor */884// TODO: would be better to have them sorted: first vgprs and first with longest distance885for (std::pair<Temp, uint32_t> pair : ctx.spills_entry[block_idx]) {886std::vector<unsigned> preds =887pair.first.is_linear() ? block->linear_preds : block->logical_preds;888889for (unsigned pred_idx : preds) {890/* variable is already spilled at predecessor */891std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(pair.first);892if (spilled != ctx.spills_exit[pred_idx].end()) {893if (spilled->second != pair.second)894ctx.add_affinity(pair.second, spilled->second);895continue;896}897898/* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */899if (ctx.next_use_distances_end[pred_idx].find(pair.first) ==900ctx.next_use_distances_end[pred_idx].end())901continue;902903/* add interferences between spilled variable and predecessors exit spills */904for (std::pair<Temp, uint32_t> exit_spill : ctx.spills_exit[pred_idx]) {905if (exit_spill.first == pair.first)906continue;907ctx.add_interference(exit_spill.second, pair.second);908}909910/* variable is in register at predecessor and has to be spilled */911/* rename if necessary */912Temp var = pair.first;913std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);914if (rename_it != ctx.renames[pred_idx].end()) {915var = rename_it->second;916ctx.renames[pred_idx].erase(rename_it);917}918919aco_ptr<Pseudo_instruction> spill{920create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};921spill->operands[0] = Operand(var);922spill->operands[1] = Operand::c32(pair.second);923Block& pred = ctx.program->blocks[pred_idx];924unsigned idx = pred.instructions.size();925do {926assert(idx != 0);927idx--;928} while (pair.first.type() == RegType::vgpr &&929pred.instructions[idx]->opcode != aco_opcode::p_logical_end);930std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);931pred.instructions.insert(it, std::move(spill));932ctx.spills_exit[pred.index][pair.first] = pair.second;933}934}935936/* iterate phis for which operands to reload */937for (aco_ptr<Instruction>& phi : instructions) {938assert(phi->opcode == aco_opcode::p_phi || phi->opcode == aco_opcode::p_linear_phi);939assert(!phi->definitions[0].isTemp() ||940ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) ==941ctx.spills_entry[block_idx].end());942943std::vector<unsigned>& preds =944phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;945for (unsigned i = 0; i < phi->operands.size(); i++) {946if (!phi->operands[i].isTemp())947continue;948unsigned pred_idx = preds[i];949950/* if the operand was reloaded, rename */951if (ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp()) ==952ctx.spills_exit[pred_idx].end()) {953std::map<Temp, Temp>::iterator it =954ctx.renames[pred_idx].find(phi->operands[i].getTemp());955if (it != ctx.renames[pred_idx].end())956phi->operands[i].setTemp(it->second);957/* prevent the definining instruction from being DCE'd if it could be rematerialized */958else if (ctx.remat.count(phi->operands[i].getTemp()))959ctx.remat_used[ctx.remat[phi->operands[i].getTemp()].instr] = true;960continue;961}962963Temp tmp = phi->operands[i].getTemp();964965/* reload phi operand at end of predecessor block */966Temp new_name = ctx.program->allocateTmp(tmp.regClass());967Block& pred = ctx.program->blocks[pred_idx];968unsigned idx = pred.instructions.size();969do {970assert(idx != 0);971idx--;972} while (phi->opcode == aco_opcode::p_phi &&973pred.instructions[idx]->opcode != aco_opcode::p_logical_end);974std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);975aco_ptr<Instruction> reload =976do_reload(ctx, tmp, new_name, ctx.spills_exit[pred_idx][tmp]);977978/* reload spilled exec mask directly to exec */979if (!phi->definitions[0].isTemp()) {980assert(phi->definitions[0].isFixed() && phi->definitions[0].physReg() == exec);981reload->definitions[0] = phi->definitions[0];982phi->operands[i] = Operand(exec, ctx.program->lane_mask);983} else {984ctx.spills_exit[pred_idx].erase(tmp);985ctx.renames[pred_idx][tmp] = new_name;986phi->operands[i].setTemp(new_name);987}988989pred.instructions.insert(it, std::move(reload));990}991}992993/* iterate live variables for which to reload */994// TODO: reload at current block if variable is spilled on all predecessors995for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair :996ctx.next_use_distances_start[block_idx]) {997/* skip spilled variables */998if (ctx.spills_entry[block_idx].find(pair.first) != ctx.spills_entry[block_idx].end())999continue;1000std::vector<unsigned> preds =1001pair.first.is_linear() ? block->linear_preds : block->logical_preds;10021003/* variable is dead at predecessor, it must be from a phi */1004bool is_dead = false;1005for (unsigned pred_idx : preds) {1006if (ctx.next_use_distances_end[pred_idx].find(pair.first) ==1007ctx.next_use_distances_end[pred_idx].end())1008is_dead = true;1009}1010if (is_dead)1011continue;1012for (unsigned pred_idx : preds) {1013/* the variable is not spilled at the predecessor */1014if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end())1015continue;10161017/* variable is spilled at predecessor and has to be reloaded */1018Temp new_name = ctx.program->allocateTmp(pair.first.regClass());1019Block& pred = ctx.program->blocks[pred_idx];1020unsigned idx = pred.instructions.size();1021do {1022assert(idx != 0);1023idx--;1024} while (pair.first.type() == RegType::vgpr &&1025pred.instructions[idx]->opcode != aco_opcode::p_logical_end);1026std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);10271028aco_ptr<Instruction> reload =1029do_reload(ctx, pair.first, new_name, ctx.spills_exit[pred.index][pair.first]);1030pred.instructions.insert(it, std::move(reload));10311032ctx.spills_exit[pred.index].erase(pair.first);1033ctx.renames[pred.index][pair.first] = new_name;1034}10351036/* check if we have to create a new phi for this variable */1037Temp rename = Temp();1038bool is_same = true;1039for (unsigned pred_idx : preds) {1040if (ctx.renames[pred_idx].find(pair.first) == ctx.renames[pred_idx].end()) {1041if (rename == Temp())1042rename = pair.first;1043else1044is_same = rename == pair.first;1045} else {1046if (rename == Temp())1047rename = ctx.renames[pred_idx][pair.first];1048else1049is_same = rename == ctx.renames[pred_idx][pair.first];1050}10511052if (!is_same)1053break;1054}10551056if (!is_same) {1057/* the variable was renamed differently in the predecessors: we have to create a phi */1058aco_opcode opcode = pair.first.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;1059aco_ptr<Pseudo_instruction> phi{1060create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};1061rename = ctx.program->allocateTmp(pair.first.regClass());1062for (unsigned i = 0; i < phi->operands.size(); i++) {1063Temp tmp;1064if (ctx.renames[preds[i]].find(pair.first) != ctx.renames[preds[i]].end()) {1065tmp = ctx.renames[preds[i]][pair.first];1066} else if (preds[i] >= block_idx) {1067tmp = rename;1068} else {1069tmp = pair.first;1070/* prevent the definining instruction from being DCE'd if it could be rematerialized */1071if (ctx.remat.count(tmp))1072ctx.remat_used[ctx.remat[tmp].instr] = true;1073}1074phi->operands[i] = Operand(tmp);1075}1076phi->definitions[0] = Definition(rename);1077instructions.emplace_back(std::move(phi));1078}10791080/* the variable was renamed: add new name to renames */1081if (!(rename == Temp() || rename == pair.first))1082ctx.renames[block_idx][pair.first] = rename;1083}10841085/* combine phis with instructions */1086unsigned idx = 0;1087while (!block->instructions[idx]) {1088idx++;1089}10901091if (!ctx.processed[block_idx]) {1092assert(!(block->kind & block_kind_loop_header));1093RegisterDemand demand_before = get_demand_before(ctx, block_idx, idx);1094ctx.register_demand[block->index].erase(ctx.register_demand[block->index].begin(),1095ctx.register_demand[block->index].begin() + idx);1096ctx.register_demand[block->index].insert(ctx.register_demand[block->index].begin(),1097instructions.size(), demand_before);1098}10991100std::vector<aco_ptr<Instruction>>::iterator start = std::next(block->instructions.begin(), idx);1101instructions.insert(1102instructions.end(), std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(start),1103std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(block->instructions.end()));1104block->instructions = std::move(instructions);1105}11061107void1108process_block(spill_ctx& ctx, unsigned block_idx, Block* block,1109std::map<Temp, uint32_t>& current_spills, RegisterDemand spilled_registers)1110{1111assert(!ctx.processed[block_idx]);11121113std::vector<std::map<Temp, uint32_t>> local_next_use_distance;1114std::vector<aco_ptr<Instruction>> instructions;1115unsigned idx = 0;11161117/* phis are handled separetely */1118while (block->instructions[idx]->opcode == aco_opcode::p_phi ||1119block->instructions[idx]->opcode == aco_opcode::p_linear_phi) {1120instructions.emplace_back(std::move(block->instructions[idx++]));1121}11221123if (block->register_demand.exceeds(ctx.target_pressure))1124local_next_use_distance = local_next_uses(ctx, block);11251126while (idx < block->instructions.size()) {1127aco_ptr<Instruction>& instr = block->instructions[idx];11281129std::map<Temp, std::pair<Temp, uint32_t>> reloads;1130std::map<Temp, uint32_t> spills;1131/* rename and reload operands */1132for (Operand& op : instr->operands) {1133if (!op.isTemp())1134continue;1135if (current_spills.find(op.getTemp()) == current_spills.end()) {1136/* the Operand is in register: check if it was renamed */1137if (ctx.renames[block_idx].find(op.getTemp()) != ctx.renames[block_idx].end())1138op.setTemp(ctx.renames[block_idx][op.getTemp()]);1139/* prevent it's definining instruction from being DCE'd if it could be rematerialized */1140else if (ctx.remat.count(op.getTemp()))1141ctx.remat_used[ctx.remat[op.getTemp()].instr] = true;1142continue;1143}1144/* the Operand is spilled: add it to reloads */1145Temp new_tmp = ctx.program->allocateTmp(op.regClass());1146ctx.renames[block_idx][op.getTemp()] = new_tmp;1147reloads[new_tmp] = std::make_pair(op.getTemp(), current_spills[op.getTemp()]);1148current_spills.erase(op.getTemp());1149op.setTemp(new_tmp);1150spilled_registers -= new_tmp;1151}11521153/* check if register demand is low enough before and after the current instruction */1154if (block->register_demand.exceeds(ctx.target_pressure)) {11551156RegisterDemand new_demand = ctx.register_demand[block_idx][idx];1157new_demand.update(get_demand_before(ctx, block_idx, idx));11581159assert(!local_next_use_distance.empty());11601161/* if reg pressure is too high, spill variable with furthest next use */1162while ((new_demand - spilled_registers).exceeds(ctx.target_pressure)) {1163unsigned distance = 0;1164Temp to_spill;1165bool do_rematerialize = false;1166RegType type = RegType::sgpr;1167if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr)1168type = RegType::vgpr;11691170for (std::pair<Temp, uint32_t> pair : local_next_use_distance[idx]) {1171if (pair.first.type() != type)1172continue;1173bool can_rematerialize = ctx.remat.count(pair.first);1174if (((pair.second > distance && can_rematerialize == do_rematerialize) ||1175(can_rematerialize && !do_rematerialize && pair.second > idx)) &&1176current_spills.find(pair.first) == current_spills.end() &&1177ctx.spills_exit[block_idx].find(pair.first) ==1178ctx.spills_exit[block_idx].end()) {1179to_spill = pair.first;1180distance = pair.second;1181do_rematerialize = can_rematerialize;1182}1183}11841185assert(distance != 0 && distance > idx);1186uint32_t spill_id = ctx.allocate_spill_id(to_spill.regClass());11871188/* add interferences with currently spilled variables */1189for (std::pair<Temp, uint32_t> pair : current_spills)1190ctx.add_interference(spill_id, pair.second);1191for (std::pair<Temp, std::pair<Temp, uint32_t>> pair : reloads)1192ctx.add_interference(spill_id, pair.second.second);11931194current_spills[to_spill] = spill_id;1195spilled_registers += to_spill;11961197/* rename if necessary */1198if (ctx.renames[block_idx].find(to_spill) != ctx.renames[block_idx].end()) {1199to_spill = ctx.renames[block_idx][to_spill];1200}12011202/* add spill to new instructions */1203aco_ptr<Pseudo_instruction> spill{1204create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};1205spill->operands[0] = Operand(to_spill);1206spill->operands[1] = Operand::c32(spill_id);1207instructions.emplace_back(std::move(spill));1208}1209}12101211/* add reloads and instruction to new instructions */1212for (std::pair<Temp, std::pair<Temp, uint32_t>> pair : reloads) {1213aco_ptr<Instruction> reload =1214do_reload(ctx, pair.second.first, pair.first, pair.second.second);1215instructions.emplace_back(std::move(reload));1216}1217instructions.emplace_back(std::move(instr));1218idx++;1219}12201221block->instructions = std::move(instructions);1222ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end());1223}12241225void1226spill_block(spill_ctx& ctx, unsigned block_idx)1227{1228Block* block = &ctx.program->blocks[block_idx];12291230/* determine set of variables which are spilled at the beginning of the block */1231RegisterDemand spilled_registers = init_live_in_vars(ctx, block, block_idx);12321233/* add interferences for spilled variables */1234for (auto it = ctx.spills_entry[block_idx].begin(); it != ctx.spills_entry[block_idx].end();1235++it) {1236for (auto it2 = std::next(it); it2 != ctx.spills_entry[block_idx].end(); ++it2)1237ctx.add_interference(it->second, it2->second);1238}12391240bool is_loop_header = block->loop_nest_depth && ctx.loop_header.top()->index == block_idx;1241if (!is_loop_header) {1242/* add spill/reload code on incoming control flow edges */1243add_coupling_code(ctx, block, block_idx);1244}12451246std::map<Temp, uint32_t> current_spills = ctx.spills_entry[block_idx];12471248/* check conditions to process this block */1249bool process = (block->register_demand - spilled_registers).exceeds(ctx.target_pressure) ||1250!ctx.renames[block_idx].empty() || ctx.remat_used.size();12511252for (auto it = current_spills.begin(); !process && it != current_spills.end(); ++it) {1253if (ctx.next_use_distances_start[block_idx][it->first].first == block_idx)1254process = true;1255}12561257if (process)1258process_block(ctx, block_idx, block, current_spills, spilled_registers);1259else1260ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end());12611262ctx.processed[block_idx] = true;12631264/* check if the next block leaves the current loop */1265if (block->loop_nest_depth == 0 ||1266ctx.program->blocks[block_idx + 1].loop_nest_depth >= block->loop_nest_depth)1267return;12681269Block* loop_header = ctx.loop_header.top();12701271/* preserve original renames at end of loop header block */1272std::map<Temp, Temp> renames = std::move(ctx.renames[loop_header->index]);12731274/* add coupling code to all loop header predecessors */1275add_coupling_code(ctx, loop_header, loop_header->index);12761277/* propagate new renames through loop: i.e. repair the SSA */1278renames.swap(ctx.renames[loop_header->index]);1279for (std::pair<Temp, Temp> rename : renames) {1280for (unsigned idx = loop_header->index; idx <= block_idx; idx++) {1281Block& current = ctx.program->blocks[idx];1282std::vector<aco_ptr<Instruction>>::iterator instr_it = current.instructions.begin();12831284/* first rename phis */1285while (instr_it != current.instructions.end()) {1286aco_ptr<Instruction>& phi = *instr_it;1287if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)1288break;1289/* no need to rename the loop header phis once again. this happened in1290* add_coupling_code() */1291if (idx == loop_header->index) {1292instr_it++;1293continue;1294}12951296for (Operand& op : phi->operands) {1297if (!op.isTemp())1298continue;1299if (op.getTemp() == rename.first)1300op.setTemp(rename.second);1301}1302instr_it++;1303}13041305/* variable is not live at beginning of this block */1306if (ctx.next_use_distances_start[idx].count(rename.first) == 0)1307continue;13081309/* if the variable is live at the block's exit, add rename */1310if (ctx.next_use_distances_end[idx].count(rename.first) != 0)1311ctx.renames[idx].insert(rename);13121313/* rename all uses in this block */1314bool renamed = false;1315while (!renamed && instr_it != current.instructions.end()) {1316aco_ptr<Instruction>& instr = *instr_it;1317for (Operand& op : instr->operands) {1318if (!op.isTemp())1319continue;1320if (op.getTemp() == rename.first) {1321op.setTemp(rename.second);1322/* we can stop with this block as soon as the variable is spilled */1323if (instr->opcode == aco_opcode::p_spill)1324renamed = true;1325}1326}1327instr_it++;1328}1329}1330}13311332/* remove loop header info from stack */1333ctx.loop_header.pop();1334}13351336Temp1337load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset,1338std::vector<aco_ptr<Instruction>>& instructions, unsigned offset,1339bool is_top_level)1340{1341Builder bld(ctx.program);1342if (is_top_level) {1343bld.reset(&instructions);1344} else {1345/* find p_logical_end */1346unsigned idx = instructions.size() - 1;1347while (instructions[idx]->opcode != aco_opcode::p_logical_end)1348idx--;1349bld.reset(&instructions, std::next(instructions.begin(), idx));1350}13511352Temp private_segment_buffer = ctx.program->private_segment_buffer;1353if (ctx.program->stage != compute_cs)1354private_segment_buffer =1355bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand::zero());13561357if (offset)1358scratch_offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),1359scratch_offset, Operand::c32(offset));13601361uint32_t rsrc_conf =1362S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx.program->wave_size == 64 ? 3 : 2);13631364if (ctx.program->chip_class >= GFX10) {1365rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |1366S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);1367} else if (ctx.program->chip_class <= GFX7) {1368/* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */1369rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |1370S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);1371}1372/* older generations need element size = 4 bytes. element size removed in GFX9 */1373if (ctx.program->chip_class <= GFX8)1374rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);13751376return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer,1377Operand::c32(-1u), Operand::c32(rsrc_conf));1378}13791380void1381add_interferences(spill_ctx& ctx, std::vector<bool>& is_assigned, std::vector<uint32_t>& slots,1382std::vector<bool>& slots_used, unsigned id)1383{1384for (unsigned other : ctx.interferences[id].second) {1385if (!is_assigned[other])1386continue;13871388RegClass other_rc = ctx.interferences[other].first;1389unsigned slot = slots[other];1390std::fill(slots_used.begin() + slot, slots_used.begin() + slot + other_rc.size(), true);1391}1392}13931394unsigned1395find_available_slot(std::vector<bool>& used, unsigned wave_size, unsigned size, bool is_sgpr,1396unsigned* num_slots)1397{1398unsigned wave_size_minus_one = wave_size - 1;1399unsigned slot = 0;14001401while (true) {1402bool available = true;1403for (unsigned i = 0; i < size; i++) {1404if (slot + i < used.size() && used[slot + i]) {1405available = false;1406break;1407}1408}1409if (!available) {1410slot++;1411continue;1412}14131414if (is_sgpr && ((slot & wave_size_minus_one) > wave_size - size)) {1415slot = align(slot, wave_size);1416continue;1417}14181419std::fill(used.begin(), used.end(), false);14201421if (slot + size > used.size())1422used.resize(slot + size);14231424return slot;1425}1426}14271428void1429assign_spill_slots_helper(spill_ctx& ctx, RegType type, std::vector<bool>& is_assigned,1430std::vector<uint32_t>& slots, unsigned* num_slots)1431{1432std::vector<bool> slots_used(*num_slots);14331434/* assign slots for ids with affinities first */1435for (std::vector<uint32_t>& vec : ctx.affinities) {1436if (ctx.interferences[vec[0]].first.type() != type)1437continue;14381439for (unsigned id : vec) {1440if (!ctx.is_reloaded[id])1441continue;14421443add_interferences(ctx, is_assigned, slots, slots_used, id);1444}14451446unsigned slot =1447find_available_slot(slots_used, ctx.wave_size, ctx.interferences[vec[0]].first.size(),1448type == RegType::sgpr, num_slots);14491450for (unsigned id : vec) {1451assert(!is_assigned[id]);14521453if (ctx.is_reloaded[id]) {1454slots[id] = slot;1455is_assigned[id] = true;1456}1457}1458}14591460/* assign slots for ids without affinities */1461for (unsigned id = 0; id < ctx.interferences.size(); id++) {1462if (is_assigned[id] || !ctx.is_reloaded[id] || ctx.interferences[id].first.type() != type)1463continue;14641465add_interferences(ctx, is_assigned, slots, slots_used, id);14661467unsigned slot =1468find_available_slot(slots_used, ctx.wave_size, ctx.interferences[id].first.size(),1469type == RegType::sgpr, num_slots);14701471slots[id] = slot;1472is_assigned[id] = true;1473}14741475*num_slots = slots_used.size();1476}14771478void1479assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)1480{1481std::vector<uint32_t> slots(ctx.interferences.size());1482std::vector<bool> is_assigned(ctx.interferences.size());14831484/* first, handle affinities: just merge all interferences into both spill ids */1485for (std::vector<uint32_t>& vec : ctx.affinities) {1486for (unsigned i = 0; i < vec.size(); i++) {1487for (unsigned j = i + 1; j < vec.size(); j++) {1488assert(vec[i] != vec[j]);1489bool reloaded = ctx.is_reloaded[vec[i]] || ctx.is_reloaded[vec[j]];1490ctx.is_reloaded[vec[i]] = reloaded;1491ctx.is_reloaded[vec[j]] = reloaded;1492}1493}1494}1495for (ASSERTED uint32_t i = 0; i < ctx.interferences.size(); i++)1496for (ASSERTED uint32_t id : ctx.interferences[i].second)1497assert(i != id);14981499/* for each spill slot, assign as many spill ids as possible */1500unsigned sgpr_spill_slots = 0, vgpr_spill_slots = 0;1501assign_spill_slots_helper(ctx, RegType::sgpr, is_assigned, slots, &sgpr_spill_slots);1502assign_spill_slots_helper(ctx, RegType::vgpr, is_assigned, slots, &vgpr_spill_slots);15031504for (unsigned id = 0; id < is_assigned.size(); id++)1505assert(is_assigned[id] || !ctx.is_reloaded[id]);15061507for (std::vector<uint32_t>& vec : ctx.affinities) {1508for (unsigned i = 0; i < vec.size(); i++) {1509for (unsigned j = i + 1; j < vec.size(); j++) {1510assert(is_assigned[vec[i]] == is_assigned[vec[j]]);1511if (!is_assigned[vec[i]])1512continue;1513assert(ctx.is_reloaded[vec[i]] == ctx.is_reloaded[vec[j]]);1514assert(ctx.interferences[vec[i]].first.type() ==1515ctx.interferences[vec[j]].first.type());1516assert(slots[vec[i]] == slots[vec[j]]);1517}1518}1519}15201521/* hope, we didn't mess up */1522std::vector<Temp> vgpr_spill_temps((sgpr_spill_slots + ctx.wave_size - 1) / ctx.wave_size);1523assert(vgpr_spill_temps.size() <= spills_to_vgpr);15241525/* replace pseudo instructions with actual hardware instructions */1526Temp scratch_offset = ctx.program->scratch_offset, scratch_rsrc = Temp();1527unsigned last_top_level_block_idx = 0;1528std::vector<bool> reload_in_loop(vgpr_spill_temps.size());1529for (Block& block : ctx.program->blocks) {15301531/* after loops, we insert a user if there was a reload inside the loop */1532if (block.loop_nest_depth == 0) {1533int end_vgprs = 0;1534for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {1535if (reload_in_loop[i])1536end_vgprs++;1537}15381539if (end_vgprs > 0) {1540aco_ptr<Instruction> destr{create_instruction<Pseudo_instruction>(1541aco_opcode::p_end_linear_vgpr, Format::PSEUDO, end_vgprs, 0)};1542int k = 0;1543for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {1544if (reload_in_loop[i])1545destr->operands[k++] = Operand(vgpr_spill_temps[i]);1546reload_in_loop[i] = false;1547}1548/* find insertion point */1549std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();1550while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi)1551++it;1552block.instructions.insert(it, std::move(destr));1553}1554}15551556if (block.kind & block_kind_top_level && !block.linear_preds.empty()) {1557last_top_level_block_idx = block.index;15581559/* check if any spilled variables use a created linear vgpr, otherwise destroy them */1560for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {1561if (vgpr_spill_temps[i] == Temp())1562continue;15631564bool can_destroy = true;1565for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[block.linear_preds[0]]) {15661567if (ctx.interferences[pair.second].first.type() == RegType::sgpr &&1568slots[pair.second] / ctx.wave_size == i) {1569can_destroy = false;1570break;1571}1572}1573if (can_destroy)1574vgpr_spill_temps[i] = Temp();1575}1576}15771578std::vector<aco_ptr<Instruction>>::iterator it;1579std::vector<aco_ptr<Instruction>> instructions;1580instructions.reserve(block.instructions.size());1581Builder bld(ctx.program, &instructions);1582for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {15831584if ((*it)->opcode == aco_opcode::p_spill) {1585uint32_t spill_id = (*it)->operands[1].constantValue();15861587if (!ctx.is_reloaded[spill_id]) {1588/* never reloaded, so don't spill */1589} else if (!is_assigned[spill_id]) {1590unreachable("No spill slot assigned for spill id");1591} else if (ctx.interferences[spill_id].first.type() == RegType::vgpr) {1592/* spill vgpr */1593ctx.program->config->spilled_vgprs += (*it)->operands[0].size();1594uint32_t spill_slot = slots[spill_id];1595bool add_offset_to_sgpr =1596ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size +1597vgpr_spill_slots * 4 >15984096;1599unsigned base_offset =1600add_offset_to_sgpr1601? 01602: ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;16031604/* check if the scratch resource descriptor already exists */1605if (scratch_rsrc == Temp()) {1606unsigned offset =1607add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;1608scratch_rsrc = load_scratch_resource(1609ctx, scratch_offset,1610last_top_level_block_idx == block.index1611? instructions1612: ctx.program->blocks[last_top_level_block_idx].instructions,1613offset, last_top_level_block_idx == block.index);1614}16151616unsigned offset = base_offset + spill_slot * 4;1617aco_opcode opcode = aco_opcode::buffer_store_dword;1618assert((*it)->operands[0].isTemp());1619Temp temp = (*it)->operands[0].getTemp();1620assert(temp.type() == RegType::vgpr && !temp.is_linear());1621if (temp.size() > 1) {1622Instruction* split{create_instruction<Pseudo_instruction>(1623aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())};1624split->operands[0] = Operand(temp);1625for (unsigned i = 0; i < temp.size(); i++)1626split->definitions[i] = bld.def(v1);1627bld.insert(split);1628for (unsigned i = 0; i < temp.size(); i++) {1629Instruction* instr =1630bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset,1631split->definitions[i].getTemp(), offset + i * 4, false, true);1632instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);1633}1634} else {1635Instruction* instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset,1636temp, offset, false, true);1637instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);1638}1639} else {1640ctx.program->config->spilled_sgprs += (*it)->operands[0].size();16411642uint32_t spill_slot = slots[spill_id];16431644/* check if the linear vgpr already exists */1645if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) {1646Temp linear_vgpr = ctx.program->allocateTmp(v1.as_linear());1647vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr;1648aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(1649aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};1650create->definitions[0] = Definition(linear_vgpr);1651/* find the right place to insert this definition */1652if (last_top_level_block_idx == block.index) {1653/* insert right before the current instruction */1654instructions.emplace_back(std::move(create));1655} else {1656assert(last_top_level_block_idx < block.index);1657/* insert before the branch at last top level block */1658std::vector<aco_ptr<Instruction>>& block_instrs =1659ctx.program->blocks[last_top_level_block_idx].instructions;1660block_instrs.insert(std::prev(block_instrs.end()), std::move(create));1661}1662}16631664/* spill sgpr: just add the vgpr temp to operands */1665Pseudo_instruction* spill =1666create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 3, 0);1667spill->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);1668spill->operands[1] = Operand::c32(spill_slot % ctx.wave_size);1669spill->operands[2] = (*it)->operands[0];1670instructions.emplace_back(aco_ptr<Instruction>(spill));1671}16721673} else if ((*it)->opcode == aco_opcode::p_reload) {1674uint32_t spill_id = (*it)->operands[0].constantValue();1675assert(ctx.is_reloaded[spill_id]);16761677if (!is_assigned[spill_id]) {1678unreachable("No spill slot assigned for spill id");1679} else if (ctx.interferences[spill_id].first.type() == RegType::vgpr) {1680/* reload vgpr */1681uint32_t spill_slot = slots[spill_id];1682bool add_offset_to_sgpr =1683ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size +1684vgpr_spill_slots * 4 >16854096;1686unsigned base_offset =1687add_offset_to_sgpr1688? 01689: ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;16901691/* check if the scratch resource descriptor already exists */1692if (scratch_rsrc == Temp()) {1693unsigned offset =1694add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;1695scratch_rsrc = load_scratch_resource(1696ctx, scratch_offset,1697last_top_level_block_idx == block.index1698? instructions1699: ctx.program->blocks[last_top_level_block_idx].instructions,1700offset, last_top_level_block_idx == block.index);1701}17021703unsigned offset = base_offset + spill_slot * 4;1704aco_opcode opcode = aco_opcode::buffer_load_dword;1705Definition def = (*it)->definitions[0];1706if (def.size() > 1) {1707Instruction* vec{create_instruction<Pseudo_instruction>(1708aco_opcode::p_create_vector, Format::PSEUDO, def.size(), 1)};1709vec->definitions[0] = def;1710for (unsigned i = 0; i < def.size(); i++) {1711Temp tmp = bld.tmp(v1);1712vec->operands[i] = Operand(tmp);1713Instruction* instr =1714bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1),1715scratch_offset, offset + i * 4, false, true);1716instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);1717}1718bld.insert(vec);1719} else {1720Instruction* instr = bld.mubuf(opcode, def, scratch_rsrc, Operand(v1),1721scratch_offset, offset, false, true);1722instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);1723}1724} else {1725uint32_t spill_slot = slots[spill_id];1726reload_in_loop[spill_slot / ctx.wave_size] = block.loop_nest_depth > 0;17271728/* check if the linear vgpr already exists */1729if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) {1730Temp linear_vgpr = ctx.program->allocateTmp(v1.as_linear());1731vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr;1732aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(1733aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};1734create->definitions[0] = Definition(linear_vgpr);1735/* find the right place to insert this definition */1736if (last_top_level_block_idx == block.index) {1737/* insert right before the current instruction */1738instructions.emplace_back(std::move(create));1739} else {1740assert(last_top_level_block_idx < block.index);1741/* insert before the branch at last top level block */1742std::vector<aco_ptr<Instruction>>& block_instrs =1743ctx.program->blocks[last_top_level_block_idx].instructions;1744block_instrs.insert(std::prev(block_instrs.end()), std::move(create));1745}1746}17471748/* reload sgpr: just add the vgpr temp to operands */1749Pseudo_instruction* reload = create_instruction<Pseudo_instruction>(1750aco_opcode::p_reload, Format::PSEUDO, 2, 1);1751reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);1752reload->operands[1] = Operand::c32(spill_slot % ctx.wave_size);1753reload->definitions[0] = (*it)->definitions[0];1754instructions.emplace_back(aco_ptr<Instruction>(reload));1755}1756} else if (!ctx.remat_used.count(it->get()) || ctx.remat_used[it->get()]) {1757instructions.emplace_back(std::move(*it));1758}1759}1760block.instructions = std::move(instructions);1761}17621763/* update required scratch memory */1764ctx.program->config->scratch_bytes_per_wave +=1765align(vgpr_spill_slots * 4 * ctx.program->wave_size, 1024);17661767/* SSA elimination inserts copies for logical phis right before p_logical_end1768* So if a linear vgpr is used between that p_logical_end and the branch,1769* we need to ensure logical phis don't choose a definition which aliases1770* the linear vgpr.1771* TODO: Moving the spills and reloads to before p_logical_end might produce1772* slightly better code. */1773for (Block& block : ctx.program->blocks) {1774/* loops exits are already handled */1775if (block.logical_preds.size() <= 1)1776continue;17771778bool has_logical_phis = false;1779for (aco_ptr<Instruction>& instr : block.instructions) {1780if (instr->opcode == aco_opcode::p_phi) {1781has_logical_phis = true;1782break;1783} else if (instr->opcode != aco_opcode::p_linear_phi) {1784break;1785}1786}1787if (!has_logical_phis)1788continue;17891790std::set<Temp> vgprs;1791for (unsigned pred_idx : block.logical_preds) {1792Block& pred = ctx.program->blocks[pred_idx];1793for (int i = pred.instructions.size() - 1; i >= 0; i--) {1794aco_ptr<Instruction>& pred_instr = pred.instructions[i];1795if (pred_instr->opcode == aco_opcode::p_logical_end) {1796break;1797} else if (pred_instr->opcode == aco_opcode::p_spill ||1798pred_instr->opcode == aco_opcode::p_reload) {1799vgprs.insert(pred_instr->operands[0].getTemp());1800}1801}1802}1803if (!vgprs.size())1804continue;18051806aco_ptr<Instruction> destr{create_instruction<Pseudo_instruction>(1807aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vgprs.size(), 0)};1808int k = 0;1809for (Temp tmp : vgprs) {1810destr->operands[k++] = Operand(tmp);1811}1812/* find insertion point */1813std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();1814while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi)1815++it;1816block.instructions.insert(it, std::move(destr));1817}1818}18191820} /* end namespace */18211822void1823spill(Program* program, live& live_vars)1824{1825program->config->spilled_vgprs = 0;1826program->config->spilled_sgprs = 0;18271828program->progress = CompilationProgress::after_spilling;18291830/* no spilling when register pressure is low enough */1831if (program->num_waves > 0)1832return;18331834/* lower to CSSA before spilling to ensure correctness w.r.t. phis */1835lower_to_cssa(program, live_vars);18361837/* calculate target register demand */1838const RegisterDemand demand = program->max_reg_demand; /* current max */1839const uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);1840const uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);1841uint16_t extra_vgprs = 0;1842uint16_t extra_sgprs = 0;18431844/* calculate extra VGPRs required for spilling SGPRs */1845if (demand.sgpr > sgpr_limit) {1846unsigned sgpr_spills = demand.sgpr - sgpr_limit;1847extra_vgprs = DIV_ROUND_UP(sgpr_spills, program->wave_size) + 1;1848}1849/* add extra SGPRs required for spilling VGPRs */1850if (demand.vgpr + extra_vgprs > vgpr_limit) {1851extra_sgprs = 5; /* scratch_resource (s4) + scratch_offset (s1) */1852if (demand.sgpr + extra_sgprs > sgpr_limit) {1853/* re-calculate in case something has changed */1854unsigned sgpr_spills = demand.sgpr + extra_sgprs - sgpr_limit;1855extra_vgprs = DIV_ROUND_UP(sgpr_spills, program->wave_size) + 1;1856}1857}1858/* the spiller has to target the following register demand */1859const RegisterDemand target(vgpr_limit - extra_vgprs, sgpr_limit - extra_sgprs);18601861/* initialize ctx */1862spill_ctx ctx(target, program, live_vars.register_demand);1863compute_global_next_uses(ctx);1864get_rematerialize_info(ctx);18651866/* create spills and reloads */1867for (unsigned i = 0; i < program->blocks.size(); i++)1868spill_block(ctx, i);18691870/* assign spill slots and DCE rematerialized code */1871assign_spill_slots(ctx, extra_vgprs);18721873/* update live variable information */1874live_vars = live_var_analysis(program);18751876assert(program->num_waves > 0);1877}18781879} // namespace aco188018811882