Path: blob/21.2-virgl/src/amd/compiler/aco_lower_to_cssa.cpp
4550 views
/*1* Copyright © 2019 Valve Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*22*/2324#include "aco_builder.h"25#include "aco_ir.h"2627#include <algorithm>28#include <map>29#include <unordered_map>30#include <vector>3132/*33* Implements an algorithm to lower to Concentional SSA Form (CSSA).34* After "Revisiting Out-of-SSA Translation for Correctness, CodeQuality, and Efficiency"35* by B. Boissinot, A. Darte, F. Rastello, B. Dupont de Dinechin, C. Guillon,36*37* By lowering the IR to CSSA, the insertion of parallelcopies is separated from38* the register coalescing problem. Additionally, correctness is ensured w.r.t. spilling.39* The algorithm coalesces non-interfering phi-resources while taking value-equality40* into account. Re-indexes the SSA-defs.41*/4243namespace aco {44namespace {4546typedef std::vector<Temp> merge_set;4748struct copy {49Definition def;50Operand op;51};5253struct merge_node {54Operand value = Operand(); /* original value: can be an SSA-def or constant value */55uint32_t index = -1u; /* index into the vector of merge sets */56uint32_t defined_at = -1u; /* defining block */5758/* we also remember two dominating defs with the same value: */59Temp equal_anc_in = Temp(); /* within the same merge set */60Temp equal_anc_out = Temp(); /* from a different set */61};6263struct cssa_ctx {64Program* program;65std::vector<IDSet>& live_out; /* live-out sets per block */66std::vector<std::vector<copy>> parallelcopies; /* copies per block */67std::vector<merge_set> merge_sets; /* each vector is one (ordered) merge set */68std::unordered_map<uint32_t, merge_node> merge_node_table; /* tempid -> merge node */69};7071/* create (virtual) parallelcopies for each phi instruction and72* already merge copy-definitions with phi-defs into merge sets */73void74collect_parallelcopies(cssa_ctx& ctx)75{76ctx.parallelcopies.resize(ctx.program->blocks.size());77Builder bld(ctx.program);78for (Block& block : ctx.program->blocks) {79for (aco_ptr<Instruction>& phi : block.instructions) {80if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)81break;8283const Definition& def = phi->definitions[0];8485/* if the definition is not temp, it is the exec mask.86* We can reload the exec mask directly from the spill slot.87*/88if (!def.isTemp())89continue;9091std::vector<unsigned>& preds =92phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;93uint32_t index = ctx.merge_sets.size();94merge_set set;9596bool has_preheader_copy = false;97for (unsigned i = 0; i < phi->operands.size(); i++) {98Operand op = phi->operands[i];99if (op.isUndefined())100continue;101102if (def.regClass().type() == RegType::sgpr && !op.isTemp()) {103/* SGPR inline constants and literals on GFX10+ can be spilled104* and reloaded directly (without intermediate register) */105if (op.isConstant()) {106if (ctx.program->chip_class >= GFX10)107continue;108if (op.size() == 1 && !op.isLiteral())109continue;110} else {111assert(op.isFixed() && op.physReg() == exec);112continue;113}114}115116/* create new temporary and rename operands */117Temp tmp = bld.tmp(def.regClass());118ctx.parallelcopies[preds[i]].emplace_back(copy{Definition(tmp), op});119phi->operands[i] = Operand(tmp);120121/* place the new operands in the same merge set */122set.emplace_back(tmp);123ctx.merge_node_table[tmp.id()] = {op, index, preds[i]};124125/* update the liveness information */126if (op.isKill())127ctx.live_out[preds[i]].erase(op.tempId());128ctx.live_out[preds[i]].insert(tmp.id());129130has_preheader_copy |= i == 0 && block.kind & block_kind_loop_header;131}132133if (set.empty())134continue;135136/* place the definition in dominance-order */137if (def.isTemp()) {138if (has_preheader_copy)139set.emplace(std::next(set.begin()), def.getTemp());140else if (block.kind & block_kind_loop_header)141set.emplace(set.begin(), def.getTemp());142else143set.emplace_back(def.getTemp());144ctx.merge_node_table[def.tempId()] = {Operand(def.getTemp()), index, block.index};145}146ctx.merge_sets.emplace_back(set);147}148}149}150151/* check whether the definition of a comes after b. */152inline bool153defined_after(cssa_ctx& ctx, Temp a, Temp b)154{155merge_node& node_a = ctx.merge_node_table[a.id()];156merge_node& node_b = ctx.merge_node_table[b.id()];157if (node_a.defined_at == node_b.defined_at)158return a.id() > b.id();159160return node_a.defined_at > node_b.defined_at;161}162163/* check whether a dominates b where b is defined after a */164inline bool165dominates(cssa_ctx& ctx, Temp a, Temp b)166{167assert(defined_after(ctx, b, a));168merge_node& node_a = ctx.merge_node_table[a.id()];169merge_node& node_b = ctx.merge_node_table[b.id()];170unsigned idom = node_b.defined_at;171while (idom > node_a.defined_at)172idom = b.regClass().type() == RegType::vgpr ? ctx.program->blocks[idom].logical_idom173: ctx.program->blocks[idom].linear_idom;174175return idom == node_a.defined_at;176}177178/* check intersection between var and parent:179* We already know that parent dominates var. */180inline bool181intersects(cssa_ctx& ctx, Temp var, Temp parent)182{183merge_node& node_var = ctx.merge_node_table[var.id()];184merge_node& node_parent = ctx.merge_node_table[parent.id()];185assert(node_var.index != node_parent.index);186uint32_t block_idx = node_var.defined_at;187188/* if the parent is live-out at the definition block of var, they intersect */189bool parent_live = ctx.live_out[block_idx].count(parent.id());190if (parent_live)191return true;192193/* parent is defined in a different block than var */194if (node_parent.defined_at < node_var.defined_at) {195/* if the parent is not live-in, they don't interfere */196std::vector<uint32_t>& preds = var.type() == RegType::vgpr197? ctx.program->blocks[block_idx].logical_preds198: ctx.program->blocks[block_idx].linear_preds;199for (uint32_t pred : preds) {200if (!ctx.live_out[pred].count(parent.id()))201return false;202}203}204205for (const copy& cp : ctx.parallelcopies[block_idx]) {206/* if var is defined at the edge, they don't intersect */207if (cp.def.getTemp() == var)208return false;209if (cp.op.isTemp() && cp.op.getTemp() == parent)210parent_live = true;211}212/* if the parent is live at the edge, they intersect */213if (parent_live)214return true;215216/* both, parent and var, are present in the same block */217const Block& block = ctx.program->blocks[block_idx];218for (auto it = block.instructions.crbegin(); it != block.instructions.crend(); ++it) {219/* if the parent was not encountered yet, it can only be used by a phi */220if (is_phi(it->get()))221break;222223for (const Definition& def : (*it)->definitions) {224if (!def.isTemp())225continue;226/* if parent was not found yet, they don't intersect */227if (def.getTemp() == var)228return false;229}230231for (const Operand& op : (*it)->operands) {232if (!op.isTemp())233continue;234/* if the var was defined before this point, they intersect */235if (op.getTemp() == parent)236return true;237}238}239240return false;241}242243/* check interference between var and parent:244* i.e. they have different values and intersect.245* If parent and var share the same value, also updates the equal ancestor. */246inline bool247interference(cssa_ctx& ctx, Temp var, Temp parent)248{249assert(var != parent);250merge_node& node_var = ctx.merge_node_table[var.id()];251node_var.equal_anc_out = Temp();252253if (node_var.index == ctx.merge_node_table[parent.id()].index) {254/* check/update in other set */255parent = ctx.merge_node_table[parent.id()].equal_anc_out;256}257258Temp tmp = parent;259/* check if var intersects with parent or any equal-valued ancestor */260while (tmp != Temp() && !intersects(ctx, var, tmp)) {261merge_node& node_tmp = ctx.merge_node_table[tmp.id()];262tmp = node_tmp.equal_anc_in;263}264265/* no intersection found */266if (tmp == Temp())267return false;268269/* var and parent, same value, but in different sets */270if (node_var.value == ctx.merge_node_table[parent.id()].value) {271node_var.equal_anc_out = tmp;272return false;273}274275/* var and parent, different values and intersect */276return true;277}278279/* tries to merge set_b into set_a of given temporary and280* drops that temporary as it is being coalesced */281bool282try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b)283{284auto def_node_it = ctx.merge_node_table.find(dst.id());285uint32_t index = def_node_it->second.index;286merge_set& set_a = ctx.merge_sets[index];287std::vector<Temp> dom; /* stack of the traversal */288merge_set union_set; /* the new merged merge-set */289uint32_t i_a = 0;290uint32_t i_b = 0;291292while (i_a < set_a.size() || i_b < set_b.size()) {293Temp current;294if (i_a == set_a.size())295current = set_b[i_b++];296else if (i_b == set_b.size())297current = set_a[i_a++];298/* else pick the one defined first */299else if (defined_after(ctx, set_a[i_a], set_b[i_b]))300current = set_b[i_b++];301else302current = set_a[i_a++];303304while (!dom.empty() && !dominates(ctx, dom.back(), current))305dom.pop_back(); /* not the desired parent, remove */306307if (!dom.empty() && interference(ctx, current, dom.back()))308return false; /* intersection detected */309310dom.emplace_back(current); /* otherwise, keep checking */311if (current != dst)312union_set.emplace_back(current); /* maintain the new merge-set sorted */313}314315/* update hashmap */316for (Temp t : union_set) {317merge_node& node = ctx.merge_node_table[t.id()];318/* update the equal ancestors:319* i.e. the 'closest' dominating def with the same value */320Temp in = node.equal_anc_in;321Temp out = node.equal_anc_out;322if (in == Temp() || (out != Temp() && defined_after(ctx, out, in)))323node.equal_anc_in = out;324node.equal_anc_out = Temp();325/* update merge-set index */326node.index = index;327}328set_b = merge_set(); /* free the old set_b */329ctx.merge_sets[index] = union_set;330ctx.merge_node_table.erase(dst.id()); /* remove the temporary */331332return true;333}334335/* returns true if the copy can safely be omitted */336bool337try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx)338{339/* we can only coalesce temporaries */340if (!copy.op.isTemp())341return false;342343/* try emplace a merge_node for the copy operand */344merge_node& op_node = ctx.merge_node_table[copy.op.tempId()];345if (op_node.defined_at == -1u) {346/* find defining block of operand */347uint32_t pred = block_idx;348do {349block_idx = pred;350pred = copy.op.regClass().type() == RegType::vgpr ? ctx.program->blocks[pred].logical_idom351: ctx.program->blocks[pred].linear_idom;352} while (block_idx != pred && ctx.live_out[pred].count(copy.op.tempId()));353op_node.defined_at = block_idx;354op_node.value = copy.op;355}356357/* we can only coalesce copies of the same register class */358if (copy.op.regClass() != copy.def.regClass())359return false;360361/* check if this operand has not yet been coalesced */362if (op_node.index == -1u) {363merge_set op_set = merge_set{copy.op.getTemp()};364return try_merge_merge_set(ctx, copy.def.getTemp(), op_set);365}366367/* check if this operand has been coalesced into the same set */368assert(ctx.merge_node_table.count(copy.def.tempId()));369if (op_node.index == ctx.merge_node_table[copy.def.tempId()].index)370return true;371372/* otherwise, try to coalesce both merge sets */373return try_merge_merge_set(ctx, copy.def.getTemp(), ctx.merge_sets[op_node.index]);374}375376/* node in the location-transfer-graph */377struct ltg_node {378copy cp;379uint32_t read_idx;380uint32_t num_uses = 0;381};382383/* emit the copies in an order that does not384* create interferences within a merge-set */385void386emit_copies_block(Builder bld, std::map<uint32_t, ltg_node>& ltg, RegType type)387{388auto&& it = ltg.begin();389while (it != ltg.end()) {390const copy& cp = it->second.cp;391/* wrong regclass or still needed as operand */392if (cp.def.regClass().type() != type || it->second.num_uses > 0) {393++it;394continue;395}396397/* emit the copy */398bld.copy(cp.def, it->second.cp.op);399400/* update the location transfer graph */401if (it->second.read_idx != -1u) {402auto&& other = ltg.find(it->second.read_idx);403if (other != ltg.end())404other->second.num_uses--;405}406ltg.erase(it);407it = ltg.begin();408}409410/* count the number of remaining circular dependencies */411unsigned num = std::count_if(ltg.begin(), ltg.end(),412[&](auto& n) { return n.second.cp.def.regClass().type() == type; });413414/* if there are circular dependencies, we just emit them as single parallelcopy */415if (num) {416// TODO: this should be restricted to a feasible number of registers417// and otherwise use a temporary to avoid having to reload more (spilled)418// variables than we have registers.419aco_ptr<Pseudo_instruction> copy{create_instruction<Pseudo_instruction>(420aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)};421it = ltg.begin();422for (unsigned i = 0; i < num; i++) {423while (it->second.cp.def.regClass().type() != type)424++it;425426copy->definitions[i] = it->second.cp.def;427copy->operands[i] = it->second.cp.op;428it = ltg.erase(it);429}430bld.insert(std::move(copy));431}432}433434/* either emits or coalesces all parallelcopies and435* renames the phi-operands accordingly. */436void437emit_parallelcopies(cssa_ctx& ctx)438{439std::unordered_map<uint32_t, Operand> renames;440441/* we iterate backwards to prioritize coalescing in else-blocks */442for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) {443if (ctx.parallelcopies[i].empty())444continue;445446std::map<uint32_t, ltg_node> ltg;447/* first, try to coalesce all parallelcopies */448for (const copy& cp : ctx.parallelcopies[i]) {449if (try_coalesce_copy(ctx, cp, i)) {450renames.emplace(cp.def.tempId(), cp.op);451/* update liveness info */452ctx.live_out[i].erase(cp.def.tempId());453ctx.live_out[i].insert(cp.op.tempId());454} else {455uint32_t read_idx = -1u;456if (cp.op.isTemp())457read_idx = ctx.merge_node_table[cp.op.tempId()].index;458uint32_t write_idx = ctx.merge_node_table[cp.def.tempId()].index;459assert(write_idx != -1u);460ltg[write_idx] = {cp, read_idx};461}462}463464/* build location-transfer-graph */465for (auto& pair : ltg) {466if (pair.second.read_idx == -1u)467continue;468auto&& it = ltg.find(pair.second.read_idx);469if (it != ltg.end())470it->second.num_uses++;471}472473/* emit parallelcopies ordered */474Builder bld(ctx.program);475Block& block = ctx.program->blocks[i];476477/* emit VGPR copies */478auto IsLogicalEnd = [](const aco_ptr<Instruction>& inst) -> bool479{ return inst->opcode == aco_opcode::p_logical_end; };480auto it = std::find_if(block.instructions.rbegin(), block.instructions.rend(), IsLogicalEnd);481bld.reset(&block.instructions, std::prev(it.base()));482emit_copies_block(bld, ltg, RegType::vgpr);483484/* emit SGPR copies */485aco_ptr<Instruction> branch = std::move(block.instructions.back());486block.instructions.pop_back();487bld.reset(&block.instructions);488emit_copies_block(bld, ltg, RegType::sgpr);489bld.insert(std::move(branch));490}491492/* finally, rename coalesced phi operands */493for (Block& block : ctx.program->blocks) {494for (aco_ptr<Instruction>& phi : block.instructions) {495if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)496break;497498for (Operand& op : phi->operands) {499if (!op.isTemp())500continue;501auto&& it = renames.find(op.tempId());502if (it != renames.end()) {503op = it->second;504renames.erase(it);505}506}507}508}509assert(renames.empty());510}511512} /* end namespace */513514void515lower_to_cssa(Program* program, live& live_vars)516{517reindex_ssa(program, live_vars.live_out);518cssa_ctx ctx = {program, live_vars.live_out};519collect_parallelcopies(ctx);520emit_parallelcopies(ctx);521522/* update live variable information */523live_vars = live_var_analysis(program);524}525} // namespace aco526527528