Path: blob/21.2-virgl/src/amd/compiler/aco_register_allocation.cpp
7086 views
/*1* Copyright © 2018 Valve Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*22*/2324#include "aco_ir.h"2526#include <algorithm>27#include <array>28#include <bitset>29#include <map>30#include <set>31#include <unordered_map>32#include <vector>3334namespace aco {35namespace {3637struct ra_ctx;3839unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr,40unsigned idx, RegClass rc);41void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,42RegClass rc);43std::pair<unsigned, unsigned>44get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc);45void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, unsigned idx,46PhysReg reg);4748struct assignment {49PhysReg reg;50RegClass rc;51uint8_t assigned = 0;52assignment() = default;53assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_), assigned(-1) {}54};5556struct ra_ctx {5758Program* program;59std::vector<assignment> assignments;60std::vector<std::unordered_map<unsigned, Temp>> renames;61std::vector<uint32_t> loop_header;62std::unordered_map<unsigned, Temp> orig_names;63std::unordered_map<unsigned, unsigned> affinities;64std::unordered_map<unsigned, Instruction*> vectors;65std::unordered_map<unsigned, Instruction*> split_vectors;66aco_ptr<Instruction> pseudo_dummy;67uint16_t max_used_sgpr = 0;68uint16_t max_used_vgpr = 0;69uint16_t sgpr_limit;70uint16_t vgpr_limit;71std::bitset<512> war_hint;72std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */7374ra_test_policy policy;7576ra_ctx(Program* program_, ra_test_policy policy_)77: program(program_), assignments(program->peekAllocationId()),78renames(program->blocks.size()), policy(policy_)79{80pseudo_dummy.reset(81create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));82sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);83vgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);84}85};8687/* Iterator type for making PhysRegInterval compatible with range-based for */88struct PhysRegIterator {89using difference_type = int;90using value_type = unsigned;91using reference = const unsigned&;92using pointer = const unsigned*;93using iterator_category = std::bidirectional_iterator_tag;9495PhysReg reg;9697PhysReg operator*() const { return reg; }9899PhysRegIterator& operator++()100{101reg.reg_b += 4;102return *this;103}104105PhysRegIterator& operator--()106{107reg.reg_b -= 4;108return *this;109}110111bool operator==(PhysRegIterator oth) const { return reg == oth.reg; }112113bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; }114115bool operator<(PhysRegIterator oth) const { return reg < oth.reg; }116};117118/* Half-open register interval used in "sliding window"-style for-loops */119struct PhysRegInterval {120PhysReg lo_;121unsigned size;122123/* Inclusive lower bound */124PhysReg lo() const { return lo_; }125126/* Exclusive upper bound */127PhysReg hi() const { return PhysReg{lo() + size}; }128129PhysRegInterval& operator+=(uint32_t stride)130{131lo_ = PhysReg{lo_.reg() + stride};132return *this;133}134135bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; }136137/* Construct a half-open interval, excluding the end register */138static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; }139140bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); }141142bool contains(const PhysRegInterval& needle) const143{144return needle.lo() >= lo() && needle.hi() <= hi();145}146147PhysRegIterator begin() const { return {lo_}; }148149PhysRegIterator end() const { return {PhysReg{lo_ + size}}; }150};151152bool153intersects(const PhysRegInterval& a, const PhysRegInterval& b)154{155return ((a.lo() >= b.lo() && a.lo() < b.hi()) || (a.hi() > b.lo() && a.hi() <= b.hi()));156}157158/* Gets the stride for full (non-subdword) registers */159uint32_t160get_stride(RegClass rc)161{162if (rc.type() == RegType::vgpr) {163return 1;164} else {165uint32_t size = rc.size();166if (size == 2) {167return 2;168} else if (size >= 4) {169return 4;170} else {171return 1;172}173}174}175176PhysRegInterval177get_reg_bounds(Program* program, RegType type)178{179if (type == RegType::vgpr) {180return {PhysReg{256}, (unsigned)program->max_reg_demand.vgpr};181} else {182return {PhysReg{0}, (unsigned)program->max_reg_demand.sgpr};183}184}185186struct DefInfo {187PhysRegInterval bounds;188uint8_t size;189uint8_t stride;190RegClass rc;191192DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc_, int operand) : rc(rc_)193{194size = rc.size();195stride = get_stride(rc);196197bounds = get_reg_bounds(ctx.program, rc.type());198199if (rc.is_subdword() && operand >= 0) {200/* stride in bytes */201stride = get_subdword_operand_stride(ctx.program->chip_class, instr, operand, rc);202} else if (rc.is_subdword()) {203std::pair<unsigned, unsigned> info = get_subdword_definition_info(ctx.program, instr, rc);204stride = info.first;205if (info.second > rc.bytes()) {206rc = RegClass::get(rc.type(), info.second);207size = rc.size();208/* we might still be able to put the definition in the high half,209* but that's only useful for affinities and this information isn't210* used for them */211stride = align(stride, info.second);212if (!rc.is_subdword())213stride = DIV_ROUND_UP(stride, 4);214}215assert(stride > 0);216}217}218};219220class RegisterFile {221public:222RegisterFile() { regs.fill(0); }223224std::array<uint32_t, 512> regs;225std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;226227const uint32_t& operator[](PhysReg index) const { return regs[index]; }228229uint32_t& operator[](PhysReg index) { return regs[index]; }230231unsigned count_zero(PhysRegInterval reg_interval)232{233unsigned res = 0;234for (PhysReg reg : reg_interval)235res += !regs[reg];236return res;237}238239/* Returns true if any of the bytes in the given range are allocated or blocked */240bool test(PhysReg start, unsigned num_bytes)241{242for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {243assert(i <= 511);244if (regs[i] & 0x0FFFFFFF)245return true;246if (regs[i] == 0xF0000000) {247assert(subdword_regs.find(i) != subdword_regs.end());248for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {249if (subdword_regs[i][j])250return true;251}252}253}254return false;255}256257void block(PhysReg start, RegClass rc)258{259if (rc.is_subdword())260fill_subdword(start, rc.bytes(), 0xFFFFFFFF);261else262fill(start, rc.size(), 0xFFFFFFFF);263}264265bool is_blocked(PhysReg start)266{267if (regs[start] == 0xFFFFFFFF)268return true;269if (regs[start] == 0xF0000000) {270for (unsigned i = start.byte(); i < 4; i++)271if (subdword_regs[start][i] == 0xFFFFFFFF)272return true;273}274return false;275}276277bool is_empty_or_blocked(PhysReg start)278{279/* Empty is 0, blocked is 0xFFFFFFFF, so to check both we compare the280* incremented value to 1 */281if (regs[start] == 0xF0000000) {282return subdword_regs[start][start.byte()] + 1 <= 1;283}284return regs[start] + 1 <= 1;285}286287void clear(PhysReg start, RegClass rc)288{289if (rc.is_subdword())290fill_subdword(start, rc.bytes(), 0);291else292fill(start, rc.size(), 0);293}294295void fill(Operand op)296{297if (op.regClass().is_subdword())298fill_subdword(op.physReg(), op.bytes(), op.tempId());299else300fill(op.physReg(), op.size(), op.tempId());301}302303void clear(Operand op) { clear(op.physReg(), op.regClass()); }304305void fill(Definition def)306{307if (def.regClass().is_subdword())308fill_subdword(def.physReg(), def.bytes(), def.tempId());309else310fill(def.physReg(), def.size(), def.tempId());311}312313void clear(Definition def) { clear(def.physReg(), def.regClass()); }314315unsigned get_id(PhysReg reg)316{317return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg];318}319320private:321void fill(PhysReg start, unsigned size, uint32_t val)322{323for (unsigned i = 0; i < size; i++)324regs[start + i] = val;325}326327void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val)328{329fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);330for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {331/* emplace or get */332std::array<uint32_t, 4>& sub =333subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;334for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)335sub[j] = val;336337if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {338subdword_regs.erase(i);339regs[i] = 0;340}341}342}343};344345std::set<std::pair<unsigned, unsigned>> find_vars(ra_ctx& ctx, RegisterFile& reg_file,346const PhysRegInterval reg_interval);347348/* helper function for debugging */349UNUSED void350print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable)351{352if (reg_file[reg] == 0xFFFFFFFF) {353printf("☐");354} else if (reg_file[reg]) {355const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000);356if (show_subdword_alloc) {357const char* block_chars[] = {358// clang-format off359"?", "▘", "▝", "▀",360"▖", "▌", "▞", "▛",361"▗", "▚", "▐", "▜",362"▄", "▙", "▟", "▉"363// clang-format on364};365unsigned index = 0;366for (int i = 0; i < 4; ++i) {367if (reg_file.subdword_regs.at(reg)[i]) {368index |= 1 << i;369}370}371printf("%s", block_chars[index]);372} else {373/* Indicate filled register slot */374if (!has_adjacent_variable) {375printf("█");376} else {377/* Use a slightly shorter box to leave a small gap between adjacent variables */378printf("▉");379}380}381} else {382printf("·");383}384}385386/* helper function for debugging */387UNUSED void388print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)389{390PhysRegInterval regs = get_reg_bounds(ctx.program, vgprs ? RegType::vgpr : RegType::sgpr);391char reg_char = vgprs ? 'v' : 's';392const int max_regs_per_line = 64;393394/* print markers */395printf(" ");396for (int i = 0; i < std::min<int>(max_regs_per_line, ROUND_DOWN_TO(regs.size, 4)); i += 4) {397printf("%-3.2u ", i);398}399printf("\n");400401/* print usage */402auto line_begin_it = regs.begin();403while (line_begin_it != regs.end()) {404const int regs_in_line =405std::min<int>(max_regs_per_line, std::distance(line_begin_it, regs.end()));406407if (line_begin_it == regs.begin()) {408printf("%cgprs: ", reg_char);409} else {410printf(" %+4d ", std::distance(regs.begin(), line_begin_it));411}412const auto line_end_it = std::next(line_begin_it, regs_in_line);413414for (auto reg_it = line_begin_it; reg_it != line_end_it; ++reg_it) {415bool has_adjacent_variable =416(std::next(reg_it) != line_end_it &&417reg_file[*reg_it] != reg_file[*std::next(reg_it)] && reg_file[*std::next(reg_it)]);418print_reg(reg_file, *reg_it, has_adjacent_variable);419}420421line_begin_it = line_end_it;422printf("\n");423}424425const unsigned free_regs =426std::count_if(regs.begin(), regs.end(), [&](auto reg) { return !reg_file[reg]; });427printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size);428429/* print assignments ordered by registers */430std::map<PhysReg, std::pair<unsigned, unsigned>>431regs_to_vars; /* maps to byte size and temp id */432for (const auto& size_id : find_vars(ctx, reg_file, regs)) {433auto reg = ctx.assignments[size_id.second].reg;434ASSERTED auto inserted = regs_to_vars.emplace(reg, size_id);435assert(inserted.second);436}437438for (const auto& reg_and_var : regs_to_vars) {439const auto& first_reg = reg_and_var.first;440const auto& size_id = reg_and_var.second;441442printf("%%%u ", size_id.second);443if (ctx.orig_names.count(size_id.second) &&444ctx.orig_names[size_id.second].id() != size_id.second) {445printf("(was %%%d) ", ctx.orig_names[size_id.second].id());446}447printf("= %c[%d", reg_char, first_reg.reg() - regs.lo());448PhysReg last_reg = first_reg.advance(size_id.first - 1);449if (first_reg.reg() != last_reg.reg()) {450assert(first_reg.byte() == 0 && last_reg.byte() == 3);451printf("-%d", last_reg.reg() - regs.lo());452}453printf("]");454if (first_reg.byte() != 0 || last_reg.byte() != 3) {455printf("[%d:%d]", first_reg.byte() * 8, (last_reg.byte() + 1) * 8);456}457printf("\n");458}459}460461unsigned462get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, unsigned idx,463RegClass rc)464{465/* v_readfirstlane_b32 cannot use SDWA */466if (instr->opcode == aco_opcode::p_as_uniform)467return 4;468if (instr->isPseudo() && chip >= GFX8)469return rc.bytes() % 2 == 0 ? 2 : 1;470471if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {472return 1;473} else if (can_use_SDWA(chip, instr, false)) {474return rc.bytes() % 2 == 0 ? 2 : 1;475} else if (rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, idx, 1)) {476return 2;477} else if (instr->isVOP3P()) {478return 2;479}480481switch (instr->opcode) {482case aco_opcode::ds_write_b8:483case aco_opcode::ds_write_b16: return chip >= GFX8 ? 2 : 4;484case aco_opcode::buffer_store_byte:485case aco_opcode::buffer_store_short:486case aco_opcode::flat_store_byte:487case aco_opcode::flat_store_short:488case aco_opcode::scratch_store_byte:489case aco_opcode::scratch_store_short:490case aco_opcode::global_store_byte:491case aco_opcode::global_store_short: return chip >= GFX9 ? 2 : 4;492default: break;493}494495return 4;496}497498void499add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,500RegClass rc)501{502chip_class chip = ctx.program->chip_class;503if (instr->isPseudo() || byte == 0)504return;505506assert(rc.bytes() <= 2);507508if (!instr->usesModifiers() && instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {509switch (byte) {510case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;511case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;512case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;513case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;514}515return;516} else if (can_use_SDWA(chip, instr, false)) {517aco_ptr<Instruction> tmp = convert_to_SDWA(chip, instr);518return;519} else if (rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, idx, byte / 2)) {520instr->vop3().opsel |= (byte / 2) << idx;521return;522} else if (instr->isVOP3P() && byte == 2) {523VOP3P_instruction& vop3p = instr->vop3p();524assert(!(vop3p.opsel_lo & (1 << idx)));525vop3p.opsel_lo |= 1 << idx;526vop3p.opsel_hi |= 1 << idx;527return;528}529530if (chip >= GFX8 && instr->opcode == aco_opcode::ds_write_b8 && byte == 2) {531instr->opcode = aco_opcode::ds_write_b8_d16_hi;532return;533}534if (chip >= GFX8 && instr->opcode == aco_opcode::ds_write_b16 && byte == 2) {535instr->opcode = aco_opcode::ds_write_b16_d16_hi;536return;537}538539if (chip >= GFX9 && byte == 2) {540if (instr->opcode == aco_opcode::buffer_store_byte)541instr->opcode = aco_opcode::buffer_store_byte_d16_hi;542else if (instr->opcode == aco_opcode::buffer_store_short)543instr->opcode = aco_opcode::buffer_store_short_d16_hi;544else if (instr->opcode == aco_opcode::flat_store_byte)545instr->opcode = aco_opcode::flat_store_byte_d16_hi;546else if (instr->opcode == aco_opcode::flat_store_short)547instr->opcode = aco_opcode::flat_store_short_d16_hi;548else if (instr->opcode == aco_opcode::scratch_store_byte)549instr->opcode = aco_opcode::scratch_store_byte_d16_hi;550else if (instr->opcode == aco_opcode::scratch_store_short)551instr->opcode = aco_opcode::scratch_store_short_d16_hi;552else if (instr->opcode == aco_opcode::global_store_byte)553instr->opcode = aco_opcode::global_store_byte_d16_hi;554else if (instr->opcode == aco_opcode::global_store_short)555instr->opcode = aco_opcode::global_store_short_d16_hi;556else557unreachable("Something went wrong: Impossible register assignment.");558}559}560561/* minimum_stride, bytes_written */562std::pair<unsigned, unsigned>563get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc)564{565chip_class chip = program->chip_class;566567if (instr->isPseudo() && chip >= GFX8)568return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes());569else if (instr->isPseudo())570return std::make_pair(4, rc.size() * 4u);571572unsigned bytes_written = chip >= GFX10 ? rc.bytes() : 4u;573switch (instr->opcode) {574case aco_opcode::v_mad_f16:575case aco_opcode::v_mad_u16:576case aco_opcode::v_mad_i16:577case aco_opcode::v_fma_f16:578case aco_opcode::v_div_fixup_f16:579case aco_opcode::v_interp_p2_f16: bytes_written = chip >= GFX9 ? rc.bytes() : 4u; break;580default: break;581}582bytes_written = bytes_written > 4 ? align(bytes_written, 4) : bytes_written;583bytes_written = MAX2(bytes_written, instr_info.definition_size[(int)instr->opcode] / 8u);584585if (can_use_SDWA(chip, instr, false)) {586return std::make_pair(rc.bytes(), rc.bytes());587} else if (rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, -1, 1)) {588return std::make_pair(2u, bytes_written);589}590591switch (instr->opcode) {592case aco_opcode::buffer_load_ubyte_d16:593case aco_opcode::buffer_load_short_d16:594case aco_opcode::flat_load_ubyte_d16:595case aco_opcode::flat_load_short_d16:596case aco_opcode::scratch_load_ubyte_d16:597case aco_opcode::scratch_load_short_d16:598case aco_opcode::global_load_ubyte_d16:599case aco_opcode::global_load_short_d16:600case aco_opcode::ds_read_u8_d16:601case aco_opcode::ds_read_u16_d16:602if (chip >= GFX9 && !program->dev.sram_ecc_enabled)603return std::make_pair(2u, 2u);604else605return std::make_pair(2u, 4u);606case aco_opcode::v_fma_mixlo_f16: return std::make_pair(2u, 2u);607default: break;608}609610return std::make_pair(4u, bytes_written);611}612613void614add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg)615{616RegClass rc = instr->definitions[idx].regClass();617chip_class chip = program->chip_class;618619if (instr->isPseudo()) {620return;621} else if (can_use_SDWA(chip, instr, false)) {622unsigned def_size = instr_info.definition_size[(int)instr->opcode];623if (reg.byte() || chip < GFX10 || def_size > rc.bytes() * 8u)624convert_to_SDWA(chip, instr);625return;626} else if (reg.byte() && rc.bytes() == 2 &&627can_use_opsel(chip, instr->opcode, -1, reg.byte() / 2)) {628VOP3_instruction& vop3 = instr->vop3();629if (reg.byte() == 2)630vop3.opsel |= (1 << 3); /* dst in high half */631return;632}633634if (reg.byte() == 2) {635if (instr->opcode == aco_opcode::v_fma_mixlo_f16)636instr->opcode = aco_opcode::v_fma_mixhi_f16;637else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16)638instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi;639else if (instr->opcode == aco_opcode::buffer_load_short_d16)640instr->opcode = aco_opcode::buffer_load_short_d16_hi;641else if (instr->opcode == aco_opcode::flat_load_ubyte_d16)642instr->opcode = aco_opcode::flat_load_ubyte_d16_hi;643else if (instr->opcode == aco_opcode::flat_load_short_d16)644instr->opcode = aco_opcode::flat_load_short_d16_hi;645else if (instr->opcode == aco_opcode::scratch_load_ubyte_d16)646instr->opcode = aco_opcode::scratch_load_ubyte_d16_hi;647else if (instr->opcode == aco_opcode::scratch_load_short_d16)648instr->opcode = aco_opcode::scratch_load_short_d16_hi;649else if (instr->opcode == aco_opcode::global_load_ubyte_d16)650instr->opcode = aco_opcode::global_load_ubyte_d16_hi;651else if (instr->opcode == aco_opcode::global_load_short_d16)652instr->opcode = aco_opcode::global_load_short_d16_hi;653else if (instr->opcode == aco_opcode::ds_read_u8_d16)654instr->opcode = aco_opcode::ds_read_u8_d16_hi;655else if (instr->opcode == aco_opcode::ds_read_u16_d16)656instr->opcode = aco_opcode::ds_read_u16_d16_hi;657else658unreachable("Something went wrong: Impossible register assignment.");659}660}661662void663adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)664{665uint16_t max_addressible_sgpr = ctx.sgpr_limit;666unsigned size = rc.size();667if (rc.type() == RegType::vgpr) {668assert(reg >= 256);669uint16_t hi = reg - 256 + size - 1;670ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);671} else if (reg + rc.size() <= max_addressible_sgpr) {672uint16_t hi = reg + size - 1;673ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));674}675}676677enum UpdateRenames {678rename_not_killed_ops = 0x1,679fill_killed_ops = 0x2,680};681MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames);682683void684update_renames(ra_ctx& ctx, RegisterFile& reg_file,685std::vector<std::pair<Operand, Definition>>& parallelcopies,686aco_ptr<Instruction>& instr, UpdateRenames flags)687{688/* clear operands */689for (std::pair<Operand, Definition>& copy : parallelcopies) {690/* the definitions with id are not from this function and already handled */691if (copy.second.isTemp())692continue;693reg_file.clear(copy.first);694}695696/* allocate id's and rename operands: this is done transparently here */697auto it = parallelcopies.begin();698while (it != parallelcopies.end()) {699if (it->second.isTemp()) {700++it;701continue;702}703704/* check if we moved a definition: change the register and remove copy */705bool is_def = false;706for (Definition& def : instr->definitions) {707if (def.isTemp() && def.getTemp() == it->first.getTemp()) {708// FIXME: ensure that the definition can use this reg709def.setFixed(it->second.physReg());710reg_file.fill(def);711ctx.assignments[def.tempId()].reg = def.physReg();712it = parallelcopies.erase(it);713is_def = true;714break;715}716}717if (is_def)718continue;719720/* check if we moved another parallelcopy definition */721for (std::pair<Operand, Definition>& other : parallelcopies) {722if (!other.second.isTemp())723continue;724if (it->first.getTemp() == other.second.getTemp()) {725other.second.setFixed(it->second.physReg());726ctx.assignments[other.second.tempId()].reg = other.second.physReg();727it = parallelcopies.erase(it);728is_def = true;729/* check if we moved an operand, again */730bool fill = true;731for (Operand& op : instr->operands) {732if (op.isTemp() && op.tempId() == other.second.tempId()) {733// FIXME: ensure that the operand can use this reg734op.setFixed(other.second.physReg());735fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();736}737}738if (fill)739reg_file.fill(other.second);740break;741}742}743if (is_def)744continue;745746std::pair<Operand, Definition>& copy = *it;747copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass()));748ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());749assert(ctx.assignments.size() == ctx.program->peekAllocationId());750751/* check if we moved an operand */752bool first = true;753bool fill = true;754for (unsigned i = 0; i < instr->operands.size(); i++) {755Operand& op = instr->operands[i];756if (!op.isTemp())757continue;758if (op.tempId() == copy.first.tempId()) {759bool omit_renaming = !(flags & rename_not_killed_ops) && !op.isKillBeforeDef();760for (std::pair<Operand, Definition>& pc : parallelcopies) {761PhysReg def_reg = pc.second.physReg();762omit_renaming &= def_reg > copy.first.physReg()763? (copy.first.physReg() + copy.first.size() <= def_reg.reg())764: (def_reg + pc.second.size() <= copy.first.physReg().reg());765}766if (omit_renaming) {767if (first)768op.setFirstKill(true);769else770op.setKill(true);771first = false;772continue;773}774op.setTemp(copy.second.getTemp());775op.setFixed(copy.second.physReg());776777fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();778}779}780781if (fill)782reg_file.fill(copy.second);783784++it;785}786}787788std::pair<PhysReg, bool>789get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)790{791const PhysRegInterval& bounds = info.bounds;792uint32_t size = info.size;793uint32_t stride = info.rc.is_subdword() ? DIV_ROUND_UP(info.stride, 4) : info.stride;794RegClass rc = info.rc;795796DefInfo new_info = info;797new_info.rc = RegClass(rc.type(), size);798for (unsigned new_stride = 16; new_stride > stride; new_stride /= 2) {799if (size % new_stride)800continue;801new_info.stride = new_stride;802std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, new_info);803if (res.second)804return res;805}806807auto is_free = [&](PhysReg reg_index)808{ return reg_file[reg_index] == 0 && !ctx.war_hint[reg_index]; };809810if (stride == 1) {811/* best fit algorithm: find the smallest gap to fit in the variable */812PhysRegInterval best_gap{PhysReg{0}, UINT_MAX};813const unsigned max_gpr =814(rc.type() == RegType::vgpr) ? (256 + ctx.max_used_vgpr) : ctx.max_used_sgpr;815816PhysRegIterator reg_it = bounds.begin();817const PhysRegIterator end_it =818std::min(bounds.end(), std::max(PhysRegIterator{PhysReg{max_gpr + 1}}, reg_it));819while (reg_it != bounds.end()) {820/* Find the next chunk of available register slots */821reg_it = std::find_if(reg_it, end_it, is_free);822auto next_nonfree_it = std::find_if_not(reg_it, end_it, is_free);823if (reg_it == bounds.end()) {824break;825}826827if (next_nonfree_it == end_it) {828/* All registers past max_used_gpr are free */829next_nonfree_it = bounds.end();830}831832PhysRegInterval gap = PhysRegInterval::from_until(*reg_it, *next_nonfree_it);833834/* early return on exact matches */835if (size == gap.size) {836adjust_max_used_regs(ctx, rc, gap.lo());837return {gap.lo(), true};838}839840/* check if it fits and the gap size is smaller */841if (size < gap.size && gap.size < best_gap.size) {842best_gap = gap;843}844845/* Move past the processed chunk */846reg_it = next_nonfree_it;847}848849if (best_gap.size == UINT_MAX)850return {{}, false};851852/* find best position within gap by leaving a good stride for other variables*/853unsigned buffer = best_gap.size - size;854if (buffer > 1) {855if (((best_gap.lo() + size) % 8 != 0 && (best_gap.lo() + buffer) % 8 == 0) ||856((best_gap.lo() + size) % 4 != 0 && (best_gap.lo() + buffer) % 4 == 0) ||857((best_gap.lo() + size) % 2 != 0 && (best_gap.lo() + buffer) % 2 == 0))858best_gap = {PhysReg{best_gap.lo() + buffer}, best_gap.size - buffer};859}860861adjust_max_used_regs(ctx, rc, best_gap.lo());862return {best_gap.lo(), true};863}864865for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();866reg_win += stride) {867if (reg_file[reg_win.lo()] != 0) {868continue;869}870871bool is_valid = std::all_of(std::next(reg_win.begin()), reg_win.end(), is_free);872if (is_valid) {873adjust_max_used_regs(ctx, rc, reg_win.lo());874return {reg_win.lo(), true};875}876}877878/* do this late because using the upper bytes of a register can require879* larger instruction encodings or copies880* TODO: don't do this in situations where it doesn't benefit */881if (rc.is_subdword()) {882for (std::pair<const uint32_t, std::array<uint32_t, 4>>& entry : reg_file.subdword_regs) {883assert(reg_file[PhysReg{entry.first}] == 0xF0000000);884if (!bounds.contains(PhysReg{entry.first}))885continue;886887for (unsigned i = 0; i < 4; i += info.stride) {888/* check if there's a block of free bytes large enough to hold the register */889bool reg_found =890std::all_of(&entry.second[i], &entry.second[std::min(4u, i + rc.bytes())],891[](unsigned v) { return v == 0; });892893/* check if also the neighboring reg is free if needed */894if (reg_found && i + rc.bytes() > 4)895reg_found = (reg_file[PhysReg{entry.first + 1}] == 0);896897if (reg_found) {898PhysReg res{entry.first};899res.reg_b += i;900adjust_max_used_regs(ctx, rc, entry.first);901return {res, true};902}903}904}905}906907return {{}, false};908}909910/* collect variables from a register area and clear reg_file */911std::set<std::pair<unsigned, unsigned>>912find_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)913{914std::set<std::pair<unsigned, unsigned>> vars;915for (PhysReg j : reg_interval) {916if (reg_file.is_blocked(j))917continue;918if (reg_file[j] == 0xF0000000) {919for (unsigned k = 0; k < 4; k++) {920unsigned id = reg_file.subdword_regs[j][k];921if (id) {922assignment& var = ctx.assignments[id];923vars.emplace(var.rc.bytes(), id);924}925}926} else if (reg_file[j] != 0) {927unsigned id = reg_file[j];928assignment& var = ctx.assignments[id];929vars.emplace(var.rc.bytes(), id);930}931}932return vars;933}934935/* collect variables from a register area and clear reg_file */936std::set<std::pair<unsigned, unsigned>>937collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)938{939std::set<std::pair<unsigned, unsigned>> vars = find_vars(ctx, reg_file, reg_interval);940for (std::pair<unsigned, unsigned> size_id : vars) {941assignment& var = ctx.assignments[size_id.second];942reg_file.clear(var.reg, var.rc);943}944return vars;945}946947bool948get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,949std::vector<std::pair<Operand, Definition>>& parallelcopies,950const std::set<std::pair<unsigned, unsigned>>& vars,951const PhysRegInterval bounds, aco_ptr<Instruction>& instr,952const PhysRegInterval def_reg)953{954/* variables are sorted from small sized to large */955/* NOTE: variables are also sorted by ID. this only affects a very small number of shaders956* slightly though. */957for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin();958it != vars.rend(); ++it) {959unsigned id = it->second;960assignment& var = ctx.assignments[id];961DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1);962uint32_t size = info.size;963964/* check if this is a dead operand, then we can re-use the space from the definition965* also use the correct stride for sub-dword operands */966bool is_dead_operand = false;967for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {968if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {969if (instr->operands[i].isKillBeforeDef())970is_dead_operand = true;971info = DefInfo(ctx, instr, var.rc, i);972break;973}974}975976std::pair<PhysReg, bool> res;977if (is_dead_operand) {978if (instr->opcode == aco_opcode::p_create_vector) {979PhysReg reg(def_reg.lo());980for (unsigned i = 0; i < instr->operands.size(); i++) {981if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {982res = {reg, (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) &&983!reg_file.test(reg, var.rc.bytes())};984break;985}986reg.reg_b += instr->operands[i].bytes();987}988if (!res.second)989res = {var.reg, !reg_file.test(var.reg, var.rc.bytes())};990} else {991info.bounds = def_reg;992res = get_reg_simple(ctx, reg_file, info);993}994} else {995/* Try to find space within the bounds but outside of the definition */996info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi()));997res = get_reg_simple(ctx, reg_file, info);998if (!res.second && def_reg.hi() <= bounds.hi()) {999unsigned lo = (def_reg.hi() + info.stride - 1) & ~(info.stride - 1);1000info.bounds = PhysRegInterval::from_until(PhysReg{lo}, bounds.hi());1001res = get_reg_simple(ctx, reg_file, info);1002}1003}10041005if (res.second) {1006/* mark the area as blocked */1007reg_file.block(res.first, var.rc);10081009/* create parallelcopy pair (without definition id) */1010Temp tmp = Temp(id, var.rc);1011Operand pc_op = Operand(tmp);1012pc_op.setFixed(var.reg);1013Definition pc_def = Definition(res.first, pc_op.regClass());1014parallelcopies.emplace_back(pc_op, pc_def);1015continue;1016}10171018PhysReg best_pos = bounds.lo();1019unsigned num_moves = 0xFF;1020unsigned num_vars = 0;10211022/* we use a sliding window to find potential positions */1023unsigned stride = var.rc.is_subdword() ? 1 : info.stride;1024for (PhysRegInterval reg_win{bounds.lo(), size}; reg_win.hi() <= bounds.hi();1025reg_win += stride) {1026if (!is_dead_operand && intersects(reg_win, def_reg))1027continue;10281029/* second, check that we have at most k=num_moves elements in the window1030* and no element is larger than the currently processed one */1031unsigned k = 0;1032unsigned n = 0;1033unsigned last_var = 0;1034bool found = true;1035for (PhysReg j : reg_win) {1036if (reg_file[j] == 0 || reg_file[j] == last_var)1037continue;10381039if (reg_file.is_blocked(j) || k > num_moves) {1040found = false;1041break;1042}1043if (reg_file[j] == 0xF0000000) {1044k += 1;1045n++;1046continue;1047}1048/* we cannot split live ranges of linear vgprs */1049if (ctx.assignments[reg_file[j]].rc & (1 << 6)) {1050found = false;1051break;1052}1053bool is_kill = false;1054for (const Operand& op : instr->operands) {1055if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {1056is_kill = true;1057break;1058}1059}1060if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {1061found = false;1062break;1063}10641065k += ctx.assignments[reg_file[j]].rc.size();1066last_var = reg_file[j];1067n++;1068if (k > num_moves || (k == num_moves && n <= num_vars)) {1069found = false;1070break;1071}1072}10731074if (found) {1075best_pos = reg_win.lo();1076num_moves = k;1077num_vars = n;1078}1079}10801081/* FIXME: we messed up and couldn't find space for the variables to be copied */1082if (num_moves == 0xFF)1083return false;10841085PhysRegInterval reg_win{best_pos, size};10861087/* collect variables and block reg file */1088std::set<std::pair<unsigned, unsigned>> new_vars = collect_vars(ctx, reg_file, reg_win);10891090/* mark the area as blocked */1091reg_file.block(reg_win.lo(), var.rc);1092adjust_max_used_regs(ctx, var.rc, reg_win.lo());10931094if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, bounds, instr, def_reg))1095return false;10961097/* create parallelcopy pair (without definition id) */1098Temp tmp = Temp(id, var.rc);1099Operand pc_op = Operand(tmp);1100pc_op.setFixed(var.reg);1101Definition pc_def = Definition(reg_win.lo(), pc_op.regClass());1102parallelcopies.emplace_back(pc_op, pc_def);1103}11041105return true;1106}11071108std::pair<PhysReg, bool>1109get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,1110std::vector<std::pair<Operand, Definition>>& parallelcopies, const DefInfo& info,1111aco_ptr<Instruction>& instr)1112{1113const PhysRegInterval& bounds = info.bounds;1114uint32_t size = info.size;1115uint32_t stride = info.stride;1116RegClass rc = info.rc;11171118/* check how many free regs we have */1119unsigned regs_free = reg_file.count_zero(bounds);11201121/* mark and count killed operands */1122unsigned killed_ops = 0;1123std::bitset<256> is_killed_operand; /* per-register */1124for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {1125Operand& op = instr->operands[j];1126if (op.isTemp() && op.isFirstKillBeforeDef() && bounds.contains(op.physReg()) &&1127!reg_file.test(PhysReg{op.physReg().reg()}, align(op.bytes() + op.physReg().byte(), 4))) {1128assert(op.isFixed());11291130for (unsigned i = 0; i < op.size(); ++i) {1131is_killed_operand[(op.physReg() & 0xff) + i] = true;1132}11331134killed_ops += op.getTemp().size();1135}1136}11371138assert(regs_free >= size);1139/* we might have to move dead operands to dst in order to make space */1140unsigned op_moves = 0;11411142if (size > (regs_free - killed_ops))1143op_moves = size - (regs_free - killed_ops);11441145/* find the best position to place the definition */1146PhysRegInterval best_win = {bounds.lo(), size};1147unsigned num_moves = 0xFF;1148unsigned num_vars = 0;11491150/* we use a sliding window to check potential positions */1151for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();1152reg_win += stride) {1153/* first check if the register window starts in the middle of an1154* allocated variable: this is what we have to fix to allow for1155* num_moves > size */1156if (reg_win.lo() > bounds.lo() && !reg_file.is_empty_or_blocked(reg_win.lo()) &&1157reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))1158continue;1159if (reg_win.hi() < bounds.hi() && !reg_file.is_empty_or_blocked(reg_win.hi().advance(-1)) &&1160reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))1161continue;11621163/* second, check that we have at most k=num_moves elements in the window1164* and no element is larger than the currently processed one */1165unsigned k = op_moves;1166unsigned n = 0;1167unsigned remaining_op_moves = op_moves;1168unsigned last_var = 0;1169bool found = true;1170bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;1171for (const PhysReg j : reg_win) {1172/* dead operands effectively reduce the number of estimated moves */1173if (is_killed_operand[j & 0xFF]) {1174if (remaining_op_moves) {1175k--;1176remaining_op_moves--;1177}1178continue;1179}11801181if (reg_file[j] == 0 || reg_file[j] == last_var)1182continue;11831184if (reg_file[j] == 0xF0000000) {1185k += 1;1186n++;1187continue;1188}11891190if (ctx.assignments[reg_file[j]].rc.size() >= size) {1191found = false;1192break;1193}11941195/* we cannot split live ranges of linear vgprs */1196if (ctx.assignments[reg_file[j]].rc & (1 << 6)) {1197found = false;1198break;1199}12001201k += ctx.assignments[reg_file[j]].rc.size();1202n++;1203last_var = reg_file[j];1204}12051206if (!found || k > num_moves)1207continue;1208if (k == num_moves && n < num_vars)1209continue;1210if (!aligned && k == num_moves && n == num_vars)1211continue;12121213if (found) {1214best_win = reg_win;1215num_moves = k;1216num_vars = n;1217}1218}12191220if (num_moves == 0xFF)1221return {{}, false};12221223/* now, we figured the placement for our definition */1224RegisterFile tmp_file(reg_file);1225std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, tmp_file, best_win);12261227if (instr->opcode == aco_opcode::p_create_vector) {1228/* move killed operands which aren't yet at the correct position (GFX9+)1229* or which are in the definition space */1230PhysReg reg = best_win.lo();1231for (Operand& op : instr->operands) {1232if (op.isTemp() && op.isFirstKillBeforeDef() && op.getTemp().type() == rc.type()) {1233if (op.physReg() != reg && (ctx.program->chip_class >= GFX9 ||1234(op.physReg().advance(op.bytes()) > best_win.lo() &&1235op.physReg() < best_win.hi()))) {1236vars.emplace(op.bytes(), op.tempId());1237tmp_file.clear(op);1238} else {1239tmp_file.fill(op);1240}1241}1242reg.reg_b += op.bytes();1243}1244} else if (!is_phi(instr)) {1245/* re-enable killed operands */1246for (Operand& op : instr->operands) {1247if (op.isTemp() && op.isFirstKillBeforeDef())1248tmp_file.fill(op);1249}1250}12511252std::vector<std::pair<Operand, Definition>> pc;1253if (!get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, best_win))1254return {{}, false};12551256parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());12571258adjust_max_used_regs(ctx, rc, best_win.lo());1259return {best_win.lo(), true};1260}12611262bool1263get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Instruction>& instr,1264PhysReg reg)1265{1266/* catch out-of-range registers */1267if (reg >= PhysReg{512})1268return false;12691270std::pair<unsigned, unsigned> sdw_def_info;1271if (rc.is_subdword())1272sdw_def_info = get_subdword_definition_info(ctx.program, instr, rc);12731274if (rc.is_subdword() && reg.byte() % sdw_def_info.first)1275return false;1276if (!rc.is_subdword() && reg.byte())1277return false;12781279if (rc.type() == RegType::sgpr && reg % get_stride(rc) != 0)1280return false;12811282PhysRegInterval reg_win = {reg, rc.size()};1283PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());1284PhysRegInterval vcc_win = {vcc, 2};1285/* VCC is outside the bounds */1286bool is_vcc = rc.type() == RegType::sgpr && vcc_win.contains(reg_win);1287bool is_m0 = rc == s1 && reg == m0;1288if (!bounds.contains(reg_win) && !is_vcc && !is_m0)1289return false;12901291if (rc.is_subdword()) {1292PhysReg test_reg;1293test_reg.reg_b = reg.reg_b & ~(sdw_def_info.second - 1);1294if (reg_file.test(test_reg, sdw_def_info.second))1295return false;1296} else {1297if (reg_file.test(reg, rc.bytes()))1298return false;1299}13001301adjust_max_used_regs(ctx, rc, reg_win.lo());1302return true;1303}13041305bool1306increase_register_file(ra_ctx& ctx, RegType type)1307{1308if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) {1309update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1,1310ctx.program->max_reg_demand.sgpr));1311} else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {1312update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr,1313ctx.program->max_reg_demand.sgpr + 1));1314} else {1315return false;1316}1317return true;1318}13191320struct IDAndRegClass {1321IDAndRegClass(unsigned id_, RegClass rc_) : id(id_), rc(rc_) {}13221323unsigned id;1324RegClass rc;1325};13261327struct IDAndInfo {1328IDAndInfo(unsigned id_, DefInfo info_) : id(id_), info(info_) {}13291330unsigned id;1331DefInfo info;1332};13331334/* Reallocates vars by sorting them and placing each variable after the previous1335* one. If one of the variables has 0xffffffff as an ID, the register assigned1336* for that variable will be returned.1337*/1338PhysReg1339compact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars,1340std::vector<std::pair<Operand, Definition>>& parallelcopies, PhysReg start)1341{1342/* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword1343* temporary sizes to dwords.1344*/1345std::vector<IDAndInfo> sorted;1346for (IDAndRegClass var : vars) {1347DefInfo info(ctx, ctx.pseudo_dummy, var.rc, -1);1348sorted.emplace_back(var.id, info);1349}13501351std::sort(1352sorted.begin(), sorted.end(),1353[&ctx](const IDAndInfo& a, const IDAndInfo& b)1354{1355unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4);1356unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 1 : 4);1357if (a_stride > b_stride)1358return true;1359if (a_stride < b_stride)1360return false;1361if (a.id == 0xffffffff || b.id == 0xffffffff)1362return a.id ==13630xffffffff; /* place 0xffffffff before others if possible, not for any reason */1364return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg;1365});13661367PhysReg next_reg = start;1368PhysReg space_reg;1369for (IDAndInfo& var : sorted) {1370unsigned stride = var.info.rc.is_subdword() ? var.info.stride : var.info.stride * 4;1371next_reg.reg_b = align(next_reg.reg_b, MAX2(stride, 4));13721373/* 0xffffffff is a special variable ID used reserve a space for killed1374* operands and definitions.1375*/1376if (var.id != 0xffffffff) {1377if (next_reg != ctx.assignments[var.id].reg) {1378RegClass rc = ctx.assignments[var.id].rc;1379Temp tmp(var.id, rc);13801381Operand pc_op(tmp);1382pc_op.setFixed(ctx.assignments[var.id].reg);1383Definition pc_def(next_reg, rc);1384parallelcopies.emplace_back(pc_op, pc_def);1385}1386} else {1387space_reg = next_reg;1388}13891390adjust_max_used_regs(ctx, var.info.rc, next_reg);13911392next_reg = next_reg.advance(var.info.rc.size() * 4);1393}13941395return space_reg;1396}13971398bool1399is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr)1400{1401PhysReg first{512};1402for (unsigned i = 0; i < instr->operands.size() - 3u; i++) {1403Operand op = instr->operands[i + 3];14041405if (ctx.assignments[op.tempId()].assigned) {1406PhysReg reg = ctx.assignments[op.tempId()].reg;14071408if (first.reg() != 512 && reg != first.advance(i * 4))1409return false; /* not at the best position */14101411if ((reg.reg() - 256) < i)1412return false; /* no space for previous operands */14131414first = reg.advance(i * -4);1415} else if (first.reg() != 512) {1416/* If there's an unexpected temporary, this operand is unlikely to be1417* placed in the best position.1418*/1419unsigned id = reg_file.get_id(first.advance(i * 4));1420if (id && id != op.tempId())1421return false;1422}1423}14241425return true;1426}14271428std::pair<PhysReg, bool>1429get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr)1430{1431Instruction* vec = ctx.vectors[temp.id()];1432unsigned first_operand = vec->format == Format::MIMG ? 3 : 0;1433unsigned our_offset = 0;1434for (unsigned i = first_operand; i < vec->operands.size(); i++) {1435Operand& op = vec->operands[i];1436if (op.isTemp() && op.tempId() == temp.id())1437break;1438else1439our_offset += op.bytes();1440}14411442if (vec->format != Format::MIMG || is_mimg_vaddr_intact(ctx, reg_file, vec)) {1443unsigned their_offset = 0;1444/* check for every operand of the vector1445* - whether the operand is assigned and1446* - we can use the register relative to that operand1447*/1448for (unsigned i = first_operand; i < vec->operands.size(); i++) {1449Operand& op = vec->operands[i];1450if (op.isTemp() && op.tempId() != temp.id() && op.getTemp().type() == temp.type() &&1451ctx.assignments[op.tempId()].assigned) {1452PhysReg reg = ctx.assignments[op.tempId()].reg;1453reg.reg_b += (our_offset - their_offset);1454if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))1455return {reg, true};1456}1457their_offset += op.bytes();1458}14591460/* We didn't find a register relative to other vector operands.1461* Try to find new space which fits the whole vector.1462*/1463RegClass vec_rc = RegClass::get(temp.type(), their_offset);1464DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);1465std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);1466PhysReg reg = res.first;1467if (res.second) {1468reg.reg_b += our_offset;1469/* make sure to only use byte offset if the instruction supports it */1470if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))1471return {reg, true};1472}1473}1474return {{}, false};1475}14761477PhysReg1478get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,1479std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr,1480int operand_index = -1)1481{1482auto split_vec = ctx.split_vectors.find(temp.id());1483if (split_vec != ctx.split_vectors.end()) {1484unsigned offset = 0;1485for (Definition def : split_vec->second->definitions) {1486auto affinity_it = ctx.affinities.find(def.tempId());1487if (affinity_it != ctx.affinities.end() && ctx.assignments[affinity_it->second].assigned) {1488PhysReg reg = ctx.assignments[affinity_it->second].reg;1489reg.reg_b -= offset;1490if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))1491return reg;1492}1493offset += def.bytes();1494}1495}14961497if (ctx.affinities.find(temp.id()) != ctx.affinities.end() &&1498ctx.assignments[ctx.affinities[temp.id()]].assigned) {1499PhysReg reg = ctx.assignments[ctx.affinities[temp.id()]].reg;1500if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))1501return reg;1502}15031504std::pair<PhysReg, bool> res;15051506if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {1507res = get_reg_vector(ctx, reg_file, temp, instr);1508if (res.second)1509return res.first;1510}15111512DefInfo info(ctx, instr, temp.regClass(), operand_index);15131514if (!ctx.policy.skip_optimistic_path) {1515/* try to find space without live-range splits */1516res = get_reg_simple(ctx, reg_file, info);15171518if (res.second)1519return res.first;1520}15211522/* try to find space with live-range splits */1523res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);15241525if (res.second)1526return res.first;15271528/* try using more registers */15291530/* We should only fail here because keeping under the limit would require1531* too many moves. */1532assert(reg_file.count_zero(info.bounds) >= info.size);15331534if (!increase_register_file(ctx, info.rc.type())) {1535/* fallback algorithm: reallocate all variables at once */1536unsigned def_size = info.rc.size();1537for (Definition def : instr->definitions) {1538if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())1539def_size += def.regClass().size();1540}15411542unsigned killed_op_size = 0;1543for (Operand op : instr->operands) {1544if (op.isTemp() && op.isKillBeforeDef() && op.regClass().type() == info.rc.type())1545killed_op_size += op.regClass().size();1546}15471548const PhysRegInterval regs = get_reg_bounds(ctx.program, info.rc.type());15491550/* reallocate passthrough variables and non-killed operands */1551std::vector<IDAndRegClass> vars;1552for (const std::pair<unsigned, unsigned>& var : find_vars(ctx, reg_file, regs))1553vars.emplace_back(var.second, ctx.assignments[var.second].rc);1554vars.emplace_back(0xffffffff, RegClass(info.rc.type(), MAX2(def_size, killed_op_size)));15551556PhysReg space = compact_relocate_vars(ctx, vars, parallelcopies, regs.lo());15571558/* reallocate killed operands */1559std::vector<IDAndRegClass> killed_op_vars;1560for (Operand op : instr->operands) {1561if (op.isKillBeforeDef() && op.regClass().type() == info.rc.type())1562killed_op_vars.emplace_back(op.tempId(), op.regClass());1563}1564compact_relocate_vars(ctx, killed_op_vars, parallelcopies, space);15651566/* reallocate definitions */1567std::vector<IDAndRegClass> def_vars;1568for (Definition def : instr->definitions) {1569if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())1570def_vars.emplace_back(def.tempId(), def.regClass());1571}1572def_vars.emplace_back(0xffffffff, info.rc);1573return compact_relocate_vars(ctx, def_vars, parallelcopies, space);1574}15751576return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index);1577}15781579PhysReg1580get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,1581std::vector<std::pair<Operand, Definition>>& parallelcopies,1582aco_ptr<Instruction>& instr)1583{1584RegClass rc = temp.regClass();1585/* create_vector instructions have different costs w.r.t. register coalescing */1586uint32_t size = rc.size();1587uint32_t bytes = rc.bytes();1588uint32_t stride = get_stride(rc);1589PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());15901591// TODO: improve p_create_vector for sub-dword vectors15921593PhysReg best_pos{0xFFF};1594unsigned num_moves = 0xFF;1595bool best_war_hint = true;15961597/* test for each operand which definition placement causes the least shuffle instructions */1598for (unsigned i = 0, offset = 0; i < instr->operands.size();1599offset += instr->operands[i].bytes(), i++) {1600// TODO: think about, if we can alias live operands on the same register1601if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() ||1602instr->operands[i].getTemp().type() != rc.type())1603continue;16041605if (offset > instr->operands[i].physReg().reg_b)1606continue;16071608unsigned reg_lower = instr->operands[i].physReg().reg_b - offset;1609if (reg_lower % 4)1610continue;1611PhysRegInterval reg_win = {PhysReg{reg_lower / 4}, size};1612unsigned k = 0;16131614/* no need to check multiple times */1615if (reg_win.lo() == best_pos)1616continue;16171618/* check borders */1619// TODO: this can be improved */1620if (!bounds.contains(reg_win) || reg_win.lo() % stride != 0)1621continue;1622if (reg_win.lo() > bounds.lo() && reg_file[reg_win.lo()] != 0 &&1623reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))1624continue;1625if (reg_win.hi() < bounds.hi() && reg_file[reg_win.hi().advance(-4)] != 0 &&1626reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))1627continue;16281629/* count variables to be moved and check war_hint */1630bool war_hint = false;1631bool linear_vgpr = false;1632for (PhysReg j : reg_win) {1633if (linear_vgpr) {1634break;1635}16361637if (reg_file[j] != 0) {1638if (reg_file[j] == 0xF0000000) {1639PhysReg reg;1640reg.reg_b = j * 4;1641unsigned bytes_left = bytes - ((unsigned)j - reg_win.lo()) * 4;1642for (unsigned byte_idx = 0; byte_idx < MIN2(bytes_left, 4); byte_idx++, reg.reg_b++)1643k += reg_file.test(reg, 1);1644} else {1645k += 4;1646/* we cannot split live ranges of linear vgprs */1647if (ctx.assignments[reg_file[j]].rc & (1 << 6))1648linear_vgpr = true;1649}1650}1651war_hint |= ctx.war_hint[j];1652}1653if (linear_vgpr || (war_hint && !best_war_hint))1654continue;16551656/* count operands in wrong positions */1657for (unsigned j = 0, offset2 = 0; j < instr->operands.size();1658offset2 += instr->operands[j].bytes(), j++) {1659if (j == i || !instr->operands[j].isTemp() ||1660instr->operands[j].getTemp().type() != rc.type())1661continue;1662if (instr->operands[j].physReg().reg_b != reg_win.lo() * 4 + offset2)1663k += instr->operands[j].bytes();1664}1665bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;1666if (k > num_moves || (!aligned && k == num_moves))1667continue;16681669best_pos = reg_win.lo();1670num_moves = k;1671best_war_hint = war_hint;1672}16731674if (num_moves >= bytes)1675return get_reg(ctx, reg_file, temp, parallelcopies, instr);16761677/* re-enable killed operands which are in the wrong position */1678RegisterFile tmp_file(reg_file);1679for (unsigned i = 0, offset = 0; i < instr->operands.size();1680offset += instr->operands[i].bytes(), i++) {1681if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef() &&1682instr->operands[i].physReg().reg_b != best_pos.reg_b + offset)1683tmp_file.fill(instr->operands[i]);1684}16851686/* collect variables to be moved */1687std::set<std::pair<unsigned, unsigned>> vars =1688collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});16891690for (unsigned i = 0, offset = 0; i < instr->operands.size();1691offset += instr->operands[i].bytes(), i++) {1692if (!instr->operands[i].isTemp() || !instr->operands[i].isFirstKillBeforeDef() ||1693instr->operands[i].getTemp().type() != rc.type())1694continue;1695bool correct_pos = instr->operands[i].physReg().reg_b == best_pos.reg_b + offset;1696/* GFX9+: move killed operands which aren't yet at the correct position1697* Moving all killed operands generally leads to more register swaps.1698* This is only done on GFX9+ because of the cheap v_swap instruction.1699*/1700if (ctx.program->chip_class >= GFX9 && !correct_pos) {1701vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());1702tmp_file.clear(instr->operands[i]);1703/* fill operands which are in the correct position to avoid overwriting */1704} else if (correct_pos) {1705tmp_file.fill(instr->operands[i]);1706}1707}1708bool success = false;1709std::vector<std::pair<Operand, Definition>> pc;1710success =1711get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, PhysRegInterval{best_pos, size});17121713if (!success) {1714if (!increase_register_file(ctx, temp.type())) {1715/* use the fallback algorithm in get_reg() */1716return get_reg(ctx, reg_file, temp, parallelcopies, instr);1717}1718return get_reg_create_vector(ctx, reg_file, temp, parallelcopies, instr);1719}17201721parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());1722adjust_max_used_regs(ctx, rc, best_pos);17231724return best_pos;1725}17261727void1728handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)1729{1730if (instr->format != Format::PSEUDO)1731return;17321733/* all instructions which use handle_operands() need this information */1734switch (instr->opcode) {1735case aco_opcode::p_extract_vector:1736case aco_opcode::p_create_vector:1737case aco_opcode::p_split_vector:1738case aco_opcode::p_parallelcopy:1739case aco_opcode::p_wqm: break;1740default: return;1741}17421743/* if all definitions are vgpr, no need to care for SCC */1744bool writes_sgpr = false;1745for (Definition& def : instr->definitions) {1746if (def.getTemp().type() == RegType::sgpr) {1747writes_sgpr = true;1748break;1749}1750}1751/* if all operands are constant, no need to care either */1752bool reads_sgpr = false;1753bool reads_subdword = false;1754for (Operand& op : instr->operands) {1755if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {1756reads_sgpr = true;1757break;1758}1759if (op.isTemp() && op.regClass().is_subdword())1760reads_subdword = true;1761}1762bool needs_scratch_reg =1763(writes_sgpr && reads_sgpr) || (ctx.program->chip_class <= GFX7 && reads_subdword);1764if (!needs_scratch_reg)1765return;17661767if (reg_file[scc]) {1768instr->pseudo().tmp_in_scc = true;17691770int reg = ctx.max_used_sgpr;1771for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--)1772;1773if (reg < 0) {1774reg = ctx.max_used_sgpr + 1;1775for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++)1776;1777if (reg == ctx.program->max_reg_demand.sgpr) {1778assert(reads_subdword && reg_file[m0] == 0);1779reg = m0;1780}1781}17821783adjust_max_used_regs(ctx, s1, reg);1784instr->pseudo().scratch_sgpr = PhysReg{(unsigned)reg};1785} else {1786instr->pseudo().tmp_in_scc = false;1787}1788}17891790bool1791operand_can_use_reg(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,1792RegClass rc)1793{1794if (instr->operands[idx].isFixed())1795return instr->operands[idx].physReg() == reg;17961797bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||1798instr->opcode == aco_opcode::v_writelane_b32_e64;1799if (chip <= GFX9 && is_writelane && idx <= 1) {1800/* v_writelane_b32 can take two sgprs but only if one is m0. */1801bool is_other_sgpr =1802instr->operands[!idx].isTemp() &&1803(!instr->operands[!idx].isFixed() || instr->operands[!idx].physReg() != m0);1804if (is_other_sgpr && instr->operands[!idx].tempId() != instr->operands[idx].tempId()) {1805instr->operands[idx].setFixed(m0);1806return reg == m0;1807}1808}18091810if (reg.byte()) {1811unsigned stride = get_subdword_operand_stride(chip, instr, idx, rc);1812if (reg.byte() % stride)1813return false;1814}18151816switch (instr->format) {1817case Format::SMEM:1818return reg != scc && reg != exec &&1819(reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */1820(reg != vcc || (instr->definitions.empty() && idx == 2) ||1821chip >= GFX10); /* sdata can be vcc */1822default:1823// TODO: there are more instructions with restrictions on registers1824return true;1825}1826}18271828void1829get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,1830std::vector<std::pair<Operand, Definition>>& parallelcopy,1831aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)1832{1833/* check if the operand is fixed */1834PhysReg src = ctx.assignments[operand.tempId()].reg;1835PhysReg dst;1836if (operand.isFixed()) {1837assert(operand.physReg() != src);18381839/* check if target reg is blocked, and move away the blocking var */1840if (register_file.test(operand.physReg(), operand.bytes())) {1841PhysRegInterval target{operand.physReg(), operand.size()};18421843RegisterFile tmp_file(register_file);18441845std::set<std::pair<unsigned, unsigned>> blocking_vars =1846collect_vars(ctx, tmp_file, target);18471848tmp_file.clear(src, operand.regClass()); // TODO: try to avoid moving block vars to src1849tmp_file.block(operand.physReg(), operand.regClass());18501851DefInfo info(ctx, instr, operand.regClass(), -1);1852get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, info.bounds, instr,1853PhysRegInterval());1854}1855dst = operand.physReg();18561857} else {1858dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);1859update_renames(1860ctx, register_file, parallelcopy, instr,1861instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops : (UpdateRenames)0);1862}18631864Operand pc_op = operand;1865pc_op.setFixed(src);1866Definition pc_def = Definition(dst, pc_op.regClass());1867parallelcopy.emplace_back(pc_op, pc_def);1868update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops);1869}18701871Temp1872read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)1873{1874std::unordered_map<unsigned, Temp>::iterator it = ctx.renames[block_idx].find(val.id());1875if (it == ctx.renames[block_idx].end())1876return val;1877else1878return it->second;1879}18801881Temp1882handle_live_in(ra_ctx& ctx, Temp val, Block* block)1883{1884std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;1885if (preds.size() == 0 || val.regClass() == val.regClass().as_linear())1886return val;18871888if (preds.size() == 1) {1889/* if the block has only one predecessor, just look there for the name */1890return read_variable(ctx, val, preds[0]);1891}18921893/* there are multiple predecessors and the block is sealed */1894Temp* const ops = (Temp*)alloca(preds.size() * sizeof(Temp));18951896/* get the rename from each predecessor and check if they are the same */1897Temp new_val;1898bool needs_phi = false;1899for (unsigned i = 0; i < preds.size(); i++) {1900ops[i] = read_variable(ctx, val, preds[i]);1901if (i == 0)1902new_val = ops[i];1903else1904needs_phi |= !(new_val == ops[i]);1905}19061907if (needs_phi) {1908/* the variable has been renamed differently in the predecessors: we need to insert a phi */1909aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;1910aco_ptr<Instruction> phi{1911create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};1912new_val = ctx.program->allocateTmp(val.regClass());1913phi->definitions[0] = Definition(new_val);1914for (unsigned i = 0; i < preds.size(); i++) {1915/* update the operands so that it uses the new affinity */1916phi->operands[i] = Operand(ops[i]);1917assert(ctx.assignments[ops[i].id()].assigned);1918phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);1919if (ops[i].regClass() == new_val.regClass())1920ctx.affinities[new_val.id()] = ops[i].id();1921}1922ctx.assignments.emplace_back();1923assert(ctx.assignments.size() == ctx.program->peekAllocationId());1924block->instructions.insert(block->instructions.begin(), std::move(phi));1925}19261927return new_val;1928}19291930void1931handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx,1932uint32_t loop_exit_idx)1933{1934Block& loop_header = ctx.program->blocks[loop_header_idx];1935std::unordered_map<unsigned, Temp> renames;19361937/* create phis for variables renamed during the loop */1938for (unsigned t : live_in) {1939Temp val = Temp(t, ctx.program->temp_rc[t]);1940Temp prev = read_variable(ctx, val, loop_header_idx - 1);1941Temp renamed = handle_live_in(ctx, val, &loop_header);1942if (renamed == prev)1943continue;19441945/* insert additional renames at block end, but don't overwrite */1946renames[prev.id()] = renamed;1947ctx.orig_names[renamed.id()] = val;1948for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {1949auto it = ctx.renames[idx].emplace(val.id(), renamed);1950/* if insertion is unsuccessful, update if necessary */1951if (!it.second && it.first->second == prev)1952it.first->second = renamed;1953}19541955/* update loop-carried values of the phi created by handle_live_in() */1956for (unsigned i = 1; i < loop_header.instructions[0]->operands.size(); i++) {1957Operand& op = loop_header.instructions[0]->operands[i];1958if (op.getTemp() == prev)1959op.setTemp(renamed);1960}19611962/* use the assignment from the loop preheader and fix def reg */1963assignment& var = ctx.assignments[prev.id()];1964ctx.assignments[renamed.id()] = var;1965loop_header.instructions[0]->definitions[0].setFixed(var.reg);1966}19671968/* rename loop carried phi operands */1969for (unsigned i = renames.size(); i < loop_header.instructions.size(); i++) {1970aco_ptr<Instruction>& phi = loop_header.instructions[i];1971if (!is_phi(phi))1972break;1973const std::vector<unsigned>& preds =1974phi->opcode == aco_opcode::p_phi ? loop_header.logical_preds : loop_header.linear_preds;1975for (unsigned j = 1; j < phi->operands.size(); j++) {1976Operand& op = phi->operands[j];1977if (!op.isTemp())1978continue;19791980/* Find the original name, since this operand might not use the original name if the phi1981* was created after init_reg_file().1982*/1983std::unordered_map<unsigned, Temp>::iterator it = ctx.orig_names.find(op.tempId());1984Temp orig = it != ctx.orig_names.end() ? it->second : op.getTemp();19851986op.setTemp(read_variable(ctx, orig, preds[j]));1987op.setFixed(ctx.assignments[op.tempId()].reg);1988}1989}19901991/* return early if no new phi was created */1992if (renames.empty())1993return;19941995/* propagate new renames through loop */1996for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {1997Block& current = ctx.program->blocks[idx];1998/* rename all uses in this block */1999for (aco_ptr<Instruction>& instr : current.instructions) {2000/* phis are renamed after RA */2001if (idx == loop_header_idx && is_phi(instr))2002continue;20032004for (Operand& op : instr->operands) {2005if (!op.isTemp())2006continue;20072008auto rename = renames.find(op.tempId());2009if (rename != renames.end()) {2010assert(rename->second.id());2011op.setTemp(rename->second);2012}2013}2014}2015}2016}20172018/**2019* This function serves the purpose to correctly initialize the register file2020* at the beginning of a block (before any existing phis).2021* In order to do so, all live-in variables are entered into the RegisterFile.2022* Reg-to-reg moves (renames) from previous blocks are taken into account and2023* the SSA is repaired by inserting corresponding phi-nodes.2024*/2025RegisterFile2026init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block& block)2027{2028if (block.kind & block_kind_loop_exit) {2029uint32_t header = ctx.loop_header.back();2030ctx.loop_header.pop_back();2031handle_loop_phis(ctx, live_out_per_block[header], header, block.index);2032}20332034RegisterFile register_file;2035const IDSet& live_in = live_out_per_block[block.index];2036assert(block.index != 0 || live_in.empty());20372038if (block.kind & block_kind_loop_header) {2039ctx.loop_header.emplace_back(block.index);2040/* already rename phis incoming value */2041for (aco_ptr<Instruction>& instr : block.instructions) {2042if (!is_phi(instr))2043break;2044Operand& operand = instr->operands[0];2045if (operand.isTemp()) {2046operand.setTemp(read_variable(ctx, operand.getTemp(), block.index - 1));2047operand.setFixed(ctx.assignments[operand.tempId()].reg);2048}2049}2050for (unsigned t : live_in) {2051Temp val = Temp(t, ctx.program->temp_rc[t]);2052Temp renamed = read_variable(ctx, val, block.index - 1);2053if (renamed != val)2054ctx.renames[block.index][val.id()] = renamed;2055assignment& var = ctx.assignments[renamed.id()];2056assert(var.assigned);2057register_file.fill(Definition(renamed.id(), var.reg, var.rc));2058}2059} else {2060/* rename phi operands */2061for (aco_ptr<Instruction>& instr : block.instructions) {2062if (!is_phi(instr))2063break;2064const std::vector<unsigned>& preds =2065instr->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;20662067for (unsigned i = 0; i < instr->operands.size(); i++) {2068Operand& operand = instr->operands[i];2069if (!operand.isTemp())2070continue;2071operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));2072operand.setFixed(ctx.assignments[operand.tempId()].reg);2073}2074}2075for (unsigned t : live_in) {2076Temp val = Temp(t, ctx.program->temp_rc[t]);2077Temp renamed = handle_live_in(ctx, val, &block);2078assignment& var = ctx.assignments[renamed.id()];2079/* due to live-range splits, the live-in might be a phi, now */2080if (var.assigned) {2081register_file.fill(Definition(renamed.id(), var.reg, var.rc));2082}2083if (renamed != val) {2084ctx.renames[block.index].emplace(t, renamed);2085ctx.orig_names[renamed.id()] = val;2086}2087}2088}20892090return register_file;2091}20922093void2094get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)2095{2096std::vector<std::vector<Temp>> phi_ressources;2097std::unordered_map<unsigned, unsigned> temp_to_phi_ressources;20982099for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend();2100block_rit++) {2101Block& block = *block_rit;21022103/* first, compute the death points of all live vars within the block */2104IDSet& live = live_out_per_block[block.index];21052106std::vector<aco_ptr<Instruction>>::reverse_iterator rit;2107for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {2108aco_ptr<Instruction>& instr = *rit;2109if (is_phi(instr)) {2110if (instr->definitions[0].isKill() || instr->definitions[0].isFixed()) {2111live.erase(instr->definitions[0].tempId());2112continue;2113}2114/* collect information about affinity-related temporaries */2115std::vector<Temp> affinity_related;2116/* affinity_related[0] is the last seen affinity-related temp */2117affinity_related.emplace_back(instr->definitions[0].getTemp());2118affinity_related.emplace_back(instr->definitions[0].getTemp());2119for (const Operand& op : instr->operands) {2120if (op.isTemp() && op.isKill() &&2121op.regClass() == instr->definitions[0].regClass()) {2122affinity_related.emplace_back(op.getTemp());2123temp_to_phi_ressources[op.tempId()] = phi_ressources.size();2124}2125}2126phi_ressources.emplace_back(std::move(affinity_related));2127} else {2128/* add vector affinities */2129if (instr->opcode == aco_opcode::p_create_vector) {2130for (const Operand& op : instr->operands) {2131if (op.isTemp() && op.isFirstKill() &&2132op.getTemp().type() == instr->definitions[0].getTemp().type())2133ctx.vectors[op.tempId()] = instr.get();2134}2135} else if (instr->format == Format::MIMG && instr->operands.size() > 4) {2136for (unsigned i = 3; i < instr->operands.size(); i++)2137ctx.vectors[instr->operands[i].tempId()] = instr.get();2138}21392140if (instr->opcode == aco_opcode::p_split_vector &&2141instr->operands[0].isFirstKillBeforeDef())2142ctx.split_vectors[instr->operands[0].tempId()] = instr.get();21432144/* add operands to live variables */2145for (const Operand& op : instr->operands) {2146if (op.isTemp())2147live.insert(op.tempId());2148}2149}21502151/* erase definitions from live */2152for (unsigned i = 0; i < instr->definitions.size(); i++) {2153const Definition& def = instr->definitions[i];2154if (!def.isTemp())2155continue;2156live.erase(def.tempId());2157/* mark last-seen phi operand */2158std::unordered_map<unsigned, unsigned>::iterator it =2159temp_to_phi_ressources.find(def.tempId());2160if (it != temp_to_phi_ressources.end() &&2161def.regClass() == phi_ressources[it->second][0].regClass()) {2162phi_ressources[it->second][0] = def.getTemp();2163/* try to coalesce phi affinities with parallelcopies */2164Operand op = Operand();2165switch (instr->opcode) {2166case aco_opcode::p_parallelcopy: op = instr->operands[i]; break;21672168case aco_opcode::v_interp_p2_f32:2169case aco_opcode::v_writelane_b32:2170case aco_opcode::v_writelane_b32_e64: op = instr->operands[2]; break;21712172case aco_opcode::v_fma_f32:2173case aco_opcode::v_fma_f16:2174case aco_opcode::v_pk_fma_f16:2175if (ctx.program->chip_class < GFX10)2176continue;2177FALLTHROUGH;2178case aco_opcode::v_mad_f32:2179case aco_opcode::v_mad_f16:2180if (instr->usesModifiers())2181continue;2182op = instr->operands[2];2183break;21842185default: continue;2186}21872188if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {2189phi_ressources[it->second].emplace_back(op.getTemp());2190temp_to_phi_ressources[op.tempId()] = it->second;2191}2192}2193}2194}2195}2196/* create affinities */2197for (std::vector<Temp>& vec : phi_ressources) {2198assert(vec.size() > 1);2199for (unsigned i = 1; i < vec.size(); i++)2200if (vec[i].id() != vec[0].id())2201ctx.affinities[vec[i].id()] = vec[0].id();2202}2203}22042205} /* end namespace */22062207void2208register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra_test_policy policy)2209{2210ra_ctx ctx(program, policy);2211get_affinities(ctx, live_out_per_block);22122213/* state of register file after phis */2214std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());22152216for (Block& block : program->blocks) {2217/* initialize register file */2218RegisterFile register_file = init_reg_file(ctx, live_out_per_block, block);2219ctx.war_hint.reset();22202221std::vector<aco_ptr<Instruction>> instructions;2222std::vector<aco_ptr<Instruction>>::iterator instr_it;22232224/* this is a slight adjustment from the paper as we already have phi nodes:2225* We consider them incomplete phis and only handle the definition. */22262227/* look up the affinities */2228for (instr_it = block.instructions.begin(); instr_it != block.instructions.end();2229++instr_it) {2230aco_ptr<Instruction>& phi = *instr_it;2231if (!is_phi(phi))2232break;2233Definition& definition = phi->definitions[0];2234if (definition.isKill() || definition.isFixed())2235continue;22362237if (ctx.affinities.find(definition.tempId()) != ctx.affinities.end() &&2238ctx.assignments[ctx.affinities[definition.tempId()]].assigned) {2239assert(ctx.assignments[ctx.affinities[definition.tempId()]].rc ==2240definition.regClass());2241PhysReg reg = ctx.assignments[ctx.affinities[definition.tempId()]].reg;2242if (reg == scc) {2243/* only use scc if all operands are already placed there */2244bool use_scc =2245std::all_of(phi->operands.begin(), phi->operands.end(),2246[](const Operand& op)2247{ return op.isTemp() && op.isFixed() && op.physReg() == scc; });2248if (!use_scc)2249continue;2250}22512252/* only assign if register is still free */2253if (!register_file.test(reg, definition.bytes())) {2254definition.setFixed(reg);2255register_file.fill(definition);2256ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};2257}2258}2259}22602261/* find registers for phis without affinity or where the register was blocked */2262for (instr_it = block.instructions.begin(); instr_it != block.instructions.end();2263++instr_it) {2264aco_ptr<Instruction>& phi = *instr_it;2265if (!is_phi(phi))2266break;22672268Definition& definition = phi->definitions[0];2269if (definition.isKill())2270continue;22712272if (!definition.isFixed()) {2273std::vector<std::pair<Operand, Definition>> parallelcopy;2274/* try to find a register that is used by at least one operand */2275for (int i = phi->operands.size() - 1; i >= 0; i--) {2276/* by going backwards, we aim to avoid copies in else-blocks */2277const Operand& op = phi->operands[i];2278if (!op.isTemp() || !op.isFixed())2279continue;2280PhysReg reg = op.physReg();2281/* we tried this already on the previous loop */2282if (reg == scc)2283continue;2284if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) {2285definition.setFixed(reg);2286break;2287}2288}2289if (!definition.isFixed()) {2290definition.setFixed(2291get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi));2292update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops);2293}22942295/* process parallelcopy */2296for (std::pair<Operand, Definition> pc : parallelcopy) {2297/* see if it's a copy from a different phi */2298// TODO: prefer moving some previous phis over live-ins2299// TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a2300// problem in practice since they can only be fixed to exec)2301Instruction* prev_phi = NULL;2302std::vector<aco_ptr<Instruction>>::iterator phi_it;2303for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {2304if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())2305prev_phi = phi_it->get();2306}2307phi_it = instr_it;2308while (!prev_phi && is_phi(*++phi_it)) {2309if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())2310prev_phi = phi_it->get();2311}2312if (prev_phi) {2313/* if so, just update that phi's register */2314register_file.clear(prev_phi->definitions[0]);2315prev_phi->definitions[0].setFixed(pc.second.physReg());2316ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(),2317pc.second.regClass()};2318register_file.fill(prev_phi->definitions[0]);2319continue;2320}23212322/* rename */2323std::unordered_map<unsigned, Temp>::iterator orig_it =2324ctx.orig_names.find(pc.first.tempId());2325Temp orig = pc.first.getTemp();2326if (orig_it != ctx.orig_names.end())2327orig = orig_it->second;2328else2329ctx.orig_names[pc.second.tempId()] = orig;2330ctx.renames[block.index][orig.id()] = pc.second.getTemp();23312332/* otherwise, this is a live-in and we need to create a new phi2333* to move it in this block's predecessors */2334aco_opcode opcode =2335pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;2336std::vector<unsigned>& preds =2337pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;2338aco_ptr<Instruction> new_phi{2339create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};2340new_phi->definitions[0] = pc.second;2341for (unsigned i = 0; i < preds.size(); i++)2342new_phi->operands[i] = Operand(pc.first);2343instructions.emplace_back(std::move(new_phi));23442345/* Remove from live_out_per_block (now used for live-in), because handle_loop_phis()2346* would re-create this phi later if this is a loop header.2347*/2348live_out_per_block[block.index].erase(orig.id());2349}23502351register_file.fill(definition);2352ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};2353}23542355/* update phi affinities */2356for (const Operand& op : phi->operands) {2357if (op.isTemp() && op.regClass() == phi->definitions[0].regClass())2358ctx.affinities[op.tempId()] = definition.tempId();2359}23602361instructions.emplace_back(std::move(*instr_it));2362}23632364/* fill in sgpr_live_in */2365for (unsigned i = 0; i <= ctx.max_used_sgpr; i++)2366sgpr_live_in[block.index][i] = register_file[PhysReg{i}];2367sgpr_live_in[block.index][127] = register_file[scc];23682369/* Handle all other instructions of the block */2370for (; instr_it != block.instructions.end(); ++instr_it) {2371aco_ptr<Instruction>& instr = *instr_it;23722373/* parallelcopies from p_phi are inserted here which means2374* live ranges of killed operands end here as well */2375if (instr->opcode == aco_opcode::p_logical_end) {2376/* no need to process this instruction any further */2377if (block.logical_succs.size() != 1) {2378instructions.emplace_back(std::move(instr));2379continue;2380}23812382Block& succ = program->blocks[block.logical_succs[0]];2383unsigned idx = 0;2384for (; idx < succ.logical_preds.size(); idx++) {2385if (succ.logical_preds[idx] == block.index)2386break;2387}2388for (aco_ptr<Instruction>& phi : succ.instructions) {2389if (phi->opcode == aco_opcode::p_phi) {2390if (phi->operands[idx].isTemp() &&2391phi->operands[idx].getTemp().type() == RegType::sgpr &&2392phi->operands[idx].isFirstKillBeforeDef()) {2393Definition phi_op(2394read_variable(ctx, phi->operands[idx].getTemp(), block.index));2395phi_op.setFixed(ctx.assignments[phi_op.tempId()].reg);2396register_file.clear(phi_op);2397}2398} else if (phi->opcode != aco_opcode::p_linear_phi) {2399break;2400}2401}2402instructions.emplace_back(std::move(instr));2403continue;2404}24052406std::vector<std::pair<Operand, Definition>> parallelcopy;24072408assert(!is_phi(instr));24092410bool temp_in_scc = register_file[scc];24112412/* handle operands */2413for (unsigned i = 0; i < instr->operands.size(); ++i) {2414auto& operand = instr->operands[i];2415if (!operand.isTemp())2416continue;24172418/* rename operands */2419operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));2420assert(ctx.assignments[operand.tempId()].assigned);24212422PhysReg reg = ctx.assignments[operand.tempId()].reg;2423if (operand_can_use_reg(program->chip_class, instr, i, reg, operand.regClass()))2424operand.setFixed(reg);2425else2426get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i);24272428if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) ||2429(instr->isDS() && instr->ds().gds)) {2430for (unsigned j = 0; j < operand.size(); j++)2431ctx.war_hint.set(operand.physReg().reg() + j);2432}2433}24342435/* remove dead vars from register file */2436for (const Operand& op : instr->operands) {2437if (op.isTemp() && op.isFirstKillBeforeDef())2438register_file.clear(op);2439}24402441/* try to optimize v_mad_f32 -> v_mac_f32 */2442if ((instr->opcode == aco_opcode::v_mad_f32 ||2443(instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) ||2444instr->opcode == aco_opcode::v_mad_f16 ||2445instr->opcode == aco_opcode::v_mad_legacy_f16 ||2446(instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10) ||2447(instr->opcode == aco_opcode::v_pk_fma_f16 && program->chip_class >= GFX10)) &&2448instr->operands[2].isTemp() && instr->operands[2].isKillBeforeDef() &&2449instr->operands[2].getTemp().type() == RegType::vgpr && instr->operands[1].isTemp() &&2450instr->operands[1].getTemp().type() == RegType::vgpr && !instr->usesModifiers() &&2451instr->operands[0].physReg().byte() == 0 && instr->operands[1].physReg().byte() == 0 &&2452instr->operands[2].physReg().byte() == 0) {2453unsigned def_id = instr->definitions[0].tempId();2454auto it = ctx.affinities.find(def_id);2455if (it == ctx.affinities.end() || !ctx.assignments[it->second].assigned ||2456instr->operands[2].physReg() == ctx.assignments[it->second].reg ||2457register_file.test(ctx.assignments[it->second].reg, instr->operands[2].bytes())) {2458instr->format = Format::VOP2;2459switch (instr->opcode) {2460case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break;2461case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break;2462case aco_opcode::v_mad_f16:2463case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break;2464case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break;2465case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break;2466default: break;2467}2468}2469}24702471/* handle definitions which must have the same register as an operand */2472if (instr->opcode == aco_opcode::v_interp_p2_f32 ||2473instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_fmac_f32 ||2474instr->opcode == aco_opcode::v_mac_f16 || instr->opcode == aco_opcode::v_fmac_f16 ||2475instr->opcode == aco_opcode::v_pk_fmac_f16 ||2476instr->opcode == aco_opcode::v_writelane_b32 ||2477instr->opcode == aco_opcode::v_writelane_b32_e64) {2478instr->definitions[0].setFixed(instr->operands[2].physReg());2479} else if (instr->opcode == aco_opcode::s_addk_i32 ||2480instr->opcode == aco_opcode::s_mulk_i32) {2481instr->definitions[0].setFixed(instr->operands[0].physReg());2482} else if (instr->isMUBUF() && instr->definitions.size() == 1 &&2483instr->operands.size() == 4) {2484instr->definitions[0].setFixed(instr->operands[3].physReg());2485} else if (instr->isMIMG() && instr->definitions.size() == 1 &&2486!instr->operands[2].isUndefined()) {2487instr->definitions[0].setFixed(instr->operands[2].physReg());2488}24892490ctx.defs_done.reset();24912492/* handle fixed definitions first */2493for (unsigned i = 0; i < instr->definitions.size(); ++i) {2494auto& definition = instr->definitions[i];2495if (!definition.isFixed())2496continue;24972498adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());2499/* check if the target register is blocked */2500if (register_file.test(definition.physReg(), definition.bytes())) {2501const PhysRegInterval def_regs{definition.physReg(), definition.size()};25022503/* create parallelcopy pair to move blocking vars */2504std::set<std::pair<unsigned, unsigned>> vars =2505collect_vars(ctx, register_file, def_regs);25062507RegisterFile tmp_file(register_file);2508/* re-enable the killed operands, so that we don't move the blocking vars there */2509for (const Operand& op : instr->operands) {2510if (op.isTemp() && op.isFirstKillBeforeDef())2511tmp_file.fill(op);2512}25132514ASSERTED bool success = false;2515DefInfo info(ctx, instr, definition.regClass(), -1);2516success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, info.bounds, instr,2517def_regs);2518assert(success);25192520update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);2521}2522ctx.defs_done.set(i);25232524if (!definition.isTemp())2525continue;25262527ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};2528register_file.fill(definition);2529}25302531/* handle all other definitions */2532for (unsigned i = 0; i < instr->definitions.size(); ++i) {2533Definition* definition = &instr->definitions[i];25342535if (definition->isFixed() || !definition->isTemp())2536continue;25372538/* find free reg */2539if (definition->hasHint() &&2540get_reg_specified(ctx, register_file, definition->regClass(), instr,2541definition->physReg())) {2542definition->setFixed(definition->physReg());2543} else if (instr->opcode == aco_opcode::p_split_vector) {2544PhysReg reg = instr->operands[0].physReg();2545for (unsigned j = 0; j < i; j++)2546reg.reg_b += instr->definitions[j].bytes();2547if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg))2548definition->setFixed(reg);2549} else if (instr->opcode == aco_opcode::p_wqm ||2550instr->opcode == aco_opcode::p_parallelcopy) {2551PhysReg reg = instr->operands[i].physReg();2552if (instr->operands[i].isTemp() &&2553instr->operands[i].getTemp().type() == definition->getTemp().type() &&2554!register_file.test(reg, definition->bytes()))2555definition->setFixed(reg);2556} else if (instr->opcode == aco_opcode::p_extract_vector) {2557PhysReg reg = instr->operands[0].physReg();2558reg.reg_b += definition->bytes() * instr->operands[1].constantValue();2559if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg))2560definition->setFixed(reg);2561} else if (instr->opcode == aco_opcode::p_create_vector) {2562PhysReg reg = get_reg_create_vector(ctx, register_file, definition->getTemp(),2563parallelcopy, instr);2564update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);2565definition->setFixed(reg);2566}25672568if (!definition->isFixed()) {2569Temp tmp = definition->getTemp();2570if (definition->regClass().is_subdword() && definition->bytes() < 4) {2571PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);2572definition->setFixed(reg);2573if (reg.byte() || register_file.test(reg, 4)) {2574add_subdword_definition(program, instr, i, reg);2575definition = &instr->definitions[i]; /* add_subdword_definition can invalidate2576the reference */2577}2578} else {2579definition->setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));2580}2581update_renames(ctx, register_file, parallelcopy, instr,2582instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops2583: (UpdateRenames)0);2584}25852586assert(2587definition->isFixed() &&2588((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) ||2589(definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256)));2590ctx.defs_done.set(i);2591ctx.assignments[definition->tempId()] = {definition->physReg(), definition->regClass()};2592register_file.fill(*definition);2593}25942595handle_pseudo(ctx, register_file, instr.get());25962597/* kill definitions and late-kill operands and ensure that sub-dword operands can actually2598* be read */2599for (const Definition& def : instr->definitions) {2600if (def.isTemp() && def.isKill())2601register_file.clear(def);2602}2603for (unsigned i = 0; i < instr->operands.size(); i++) {2604const Operand& op = instr->operands[i];2605if (op.isTemp() && op.isFirstKill() && op.isLateKill())2606register_file.clear(op);2607if (op.isTemp() && op.physReg().byte() != 0)2608add_subdword_operand(ctx, instr, i, op.physReg().byte(), op.regClass());2609}26102611/* emit parallelcopy */2612if (!parallelcopy.empty()) {2613aco_ptr<Pseudo_instruction> pc;2614pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy,2615Format::PSEUDO, parallelcopy.size(),2616parallelcopy.size()));2617bool sgpr_operands_alias_defs = false;2618uint64_t sgpr_operands[4] = {0, 0, 0, 0};2619for (unsigned i = 0; i < parallelcopy.size(); i++) {2620if (temp_in_scc && parallelcopy[i].first.isTemp() &&2621parallelcopy[i].first.getTemp().type() == RegType::sgpr) {2622if (!sgpr_operands_alias_defs) {2623unsigned reg = parallelcopy[i].first.physReg().reg();2624unsigned size = parallelcopy[i].first.getTemp().size();2625sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);26262627reg = parallelcopy[i].second.physReg().reg();2628size = parallelcopy[i].second.getTemp().size();2629if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))2630sgpr_operands_alias_defs = true;2631}2632}26332634pc->operands[i] = parallelcopy[i].first;2635pc->definitions[i] = parallelcopy[i].second;2636assert(pc->operands[i].size() == pc->definitions[i].size());26372638/* it might happen that the operand is already renamed. we have to restore the2639* original name. */2640std::unordered_map<unsigned, Temp>::iterator it =2641ctx.orig_names.find(pc->operands[i].tempId());2642Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();2643ctx.orig_names[pc->definitions[i].tempId()] = orig;2644ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp();2645}26462647if (temp_in_scc && sgpr_operands_alias_defs) {2648/* disable definitions and re-enable operands */2649RegisterFile tmp_file(register_file);2650for (const Definition& def : instr->definitions) {2651if (def.isTemp() && !def.isKill())2652tmp_file.clear(def);2653}2654for (const Operand& op : instr->operands) {2655if (op.isTemp() && op.isFirstKill())2656tmp_file.block(op.physReg(), op.regClass());2657}26582659handle_pseudo(ctx, tmp_file, pc.get());2660} else {2661pc->tmp_in_scc = false;2662}26632664instructions.emplace_back(std::move(pc));2665}26662667/* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */2668bool instr_needs_vop3 =2669!instr->isVOP3() &&2670((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) ||2671(instr->opcode == aco_opcode::v_cndmask_b32 &&2672!(instr->operands[2].physReg() == vcc)) ||2673((instr->opcode == aco_opcode::v_add_co_u32 ||2674instr->opcode == aco_opcode::v_addc_co_u32 ||2675instr->opcode == aco_opcode::v_sub_co_u32 ||2676instr->opcode == aco_opcode::v_subb_co_u32 ||2677instr->opcode == aco_opcode::v_subrev_co_u32 ||2678instr->opcode == aco_opcode::v_subbrev_co_u32) &&2679!(instr->definitions[1].physReg() == vcc)) ||2680((instr->opcode == aco_opcode::v_addc_co_u32 ||2681instr->opcode == aco_opcode::v_subb_co_u32 ||2682instr->opcode == aco_opcode::v_subbrev_co_u32) &&2683!(instr->operands[2].physReg() == vcc)));2684if (instr_needs_vop3) {26852686/* if the first operand is a literal, we have to move it to a reg */2687if (instr->operands.size() && instr->operands[0].isLiteral() &&2688program->chip_class < GFX10) {2689bool can_sgpr = true;2690/* check, if we have to move to vgpr */2691for (const Operand& op : instr->operands) {2692if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {2693can_sgpr = false;2694break;2695}2696}2697/* disable definitions and re-enable operands */2698RegisterFile tmp_file(register_file);2699for (const Definition& def : instr->definitions)2700tmp_file.clear(def);2701for (const Operand& op : instr->operands) {2702if (op.isTemp() && op.isFirstKill())2703tmp_file.block(op.physReg(), op.regClass());2704}2705Temp tmp = program->allocateTmp(can_sgpr ? s1 : v1);2706ctx.assignments.emplace_back();2707PhysReg reg = get_reg(ctx, tmp_file, tmp, parallelcopy, instr);2708update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);27092710aco_ptr<Instruction> mov;2711if (can_sgpr)2712mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32,2713Format::SOP1, 1, 1));2714else2715mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32,2716Format::VOP1, 1, 1));2717mov->operands[0] = instr->operands[0];2718mov->definitions[0] = Definition(tmp);2719mov->definitions[0].setFixed(reg);27202721instr->operands[0] = Operand(tmp);2722instr->operands[0].setFixed(reg);2723instr->operands[0].setFirstKill(true);27242725instructions.emplace_back(std::move(mov));2726}27272728/* change the instruction to VOP3 to enable an arbitrary register pair as dst */2729aco_ptr<Instruction> tmp = std::move(instr);2730Format format = asVOP3(tmp->format);2731instr.reset(create_instruction<VOP3_instruction>(2732tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));2733std::copy(tmp->operands.begin(), tmp->operands.end(), instr->operands.begin());2734std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin());2735}27362737instructions.emplace_back(std::move(*instr_it));27382739} /* end for Instr */27402741block.instructions = std::move(instructions);2742} /* end for BB */27432744/* find scc spill registers which may be needed for parallelcopies created by phis */2745for (Block& block : program->blocks) {2746if (block.linear_preds.size() <= 1)2747continue;27482749std::bitset<128> regs = sgpr_live_in[block.index];2750if (!regs[127])2751continue;27522753/* choose a register */2754int16_t reg = 0;2755for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++)2756;2757assert(reg < ctx.program->max_reg_demand.sgpr);2758adjust_max_used_regs(ctx, s1, reg);27592760/* update predecessors */2761for (unsigned& pred_index : block.linear_preds) {2762Block& pred = program->blocks[pred_index];2763pred.scc_live_out = true;2764pred.scratch_sgpr = PhysReg{(uint16_t)reg};2765}2766}27672768/* num_gpr = rnd_up(max_used_gpr + 1) */2769program->config->num_vgprs = get_vgpr_alloc(program, ctx.max_used_vgpr + 1);2770program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1);27712772program->progress = CompilationProgress::after_ra;2773}27742775} // namespace aco277627772778