Path: blob/21.2-virgl/src/intel/compiler/brw_fs_bank_conflicts.cpp
/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_bank_conflicts.cpp
 *
 * This file contains a GRF bank conflict mitigation pass.  The pass is
 * intended to be run after register allocation and works by rearranging the
 * layout of the GRF space (without altering the semantics of the program) in
 * a way that minimizes the number of GRF bank conflicts incurred by ternary
 * instructions.
 *
 * Unfortunately there is close to no information about bank conflicts in the
 * hardware spec, but experimentally on Gfx7-Gfx9 ternary instructions seem to
 * incur an average bank conflict penalty of one cycle per SIMD8 op whenever
 * the second and third sources are stored in the same GRF bank (\sa bank_of()
 * for the exact bank layout), in which case they cannot be fetched during the
 * same cycle by the EU, unless the EU logic manages to optimize out the read
 * cycle of a duplicate source register (\sa is_conflict_optimized_out()).
 *
 * The asymptotic run-time of the algorithm is dominated by the
 * shader_conflict_weight_matrix() computation below, which is O(n) on the
 * number of instructions in the program; however, for small and medium-sized
 * programs the run-time is likely to be dominated by
 * optimize_reg_permutation(), which is O(m^3) on the number of GRF atoms of
 * the program (\sa partitioning).  The latter is bounded (since the program
 * uses a bounded number of registers post-regalloc) and of the order of 100.
 * For that reason optimize_reg_permutation() is vectorized in order to keep
 * the cubic term within reasonable bounds for m close to its theoretical
 * maximum.
 */

#include "brw_fs.h"
#include "brw_cfg.h"

#ifdef __SSE2__

#include <emmintrin.h>

/**
 * Thin layer around vector intrinsics so they can be easily replaced with
 * e.g. the fall-back scalar path, an implementation with different vector
 * width or using different SIMD architectures (AVX-512?!).
 *
 * This implementation operates on pairs of independent SSE2 integer vectors
 * à la SIMD16 for somewhat improved throughput.  SSE2 is supported by
 * virtually all platforms that care about bank conflicts, so this path
 * should almost always be available in practice.
 */
namespace {
   /**
    * SIMD integer vector data type.
    */
   struct vector_type {
      __m128i v[2];
   };

   /**
    * Scalar data type matching the representation of a single component of \p
    * vector_type.
    */
   typedef int16_t scalar_type;

   /**
    * Maximum integer value representable as a \p scalar_type.
    */
   const scalar_type max_scalar = INT16_MAX;

   /**
    * Number of components of a \p vector_type.
    */
   const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type);

   /**
    * Set the i-th component of vector \p v to \p x.
    */
   void
   set(vector_type &v, unsigned i, scalar_type x)
   {
      assert(i < vector_width);
      memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x));
   }

   /**
    * Get the i-th component of vector \p v.
    */
   scalar_type
   get(const vector_type &v, unsigned i)
   {
      assert(i < vector_width);
      scalar_type x;
      memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x));
      return x;
   }

   /**
    * Add two vectors with saturation.
    */
   vector_type
   adds(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_adds_epi16(v.v[0], w.v[0]),
         _mm_adds_epi16(v.v[1], w.v[1])
      }};
      return u;
   }

   /**
    * Subtract two vectors with saturation.
    */
   vector_type
   subs(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_subs_epi16(v.v[0], w.v[0]),
         _mm_subs_epi16(v.v[1], w.v[1])
      }};
      return u;
   }

   /**
    * Compute the bitwise conjunction of two vectors.
    */
   vector_type
   mask(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_and_si128(v.v[0], w.v[0]),
         _mm_and_si128(v.v[1], w.v[1])
      }};
      return u;
   }

   /**
    * Reduce the components of a vector using saturating addition.
    */
   scalar_type
   sums(const vector_type &v)
   {
      const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]);
      const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e));
      const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1));
      const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1));
      return _mm_extract_epi16(v1, 0);
   }
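
   /* A note on the reduction above (illustrative): each _mm_adds_epi16()
    * folds the vector in half with saturating adds.  The shuffle control
    * 0x4e swaps the two 64-bit halves of a register, 0xb1 swaps adjacent
    * 32-bit words, and _mm_shufflelo_epi16() with 0xb1 swaps adjacent
    * 16-bit words of the low half, so after the three folds lane 0 holds
    * the saturating sum of all sixteen components.
    */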
}

#else

/**
 * Thin layer around vector intrinsics so they can be easily replaced with
 * e.g. the fall-back scalar path, an implementation with different vector
 * width or using different SIMD architectures (AVX-512?!).
 *
 * This implementation operates on scalar values and doesn't rely on
 * any vector extensions.  This is mainly intended for debugging and
 * to keep this file building on exotic platforms.
 */
namespace {
   /**
    * SIMD integer vector data type.
    */
   typedef int16_t vector_type;

   /**
    * Scalar data type matching the representation of a single component of \p
    * vector_type.
    */
   typedef int16_t scalar_type;

   /**
    * Maximum integer value representable as a \p scalar_type.
    */
   const scalar_type max_scalar = INT16_MAX;

   /**
    * Number of components of a \p vector_type.
    */
   const unsigned vector_width = 1;

   /**
    * Set the i-th component of vector \p v to \p x.
    */
   void
   set(vector_type &v, unsigned i, scalar_type x)
   {
      assert(i < vector_width);
      v = x;
   }

   /**
    * Get the i-th component of vector \p v.
    */
   scalar_type
   get(const vector_type &v, unsigned i)
   {
      assert(i < vector_width);
      return v;
   }

   /**
    * Add two vectors with saturation.
    */
   vector_type
   adds(vector_type v, vector_type w)
   {
      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w));
   }

   /**
    * Subtract two vectors with saturation.
    */
   vector_type
   subs(vector_type v, vector_type w)
   {
      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w));
   }

   /**
    * Compute the bitwise conjunction of two vectors.
    */
   vector_type
   mask(vector_type v, vector_type w)
   {
      return v & w;
   }

   /**
    * Reduce the components of a vector using saturating addition.
    */
   scalar_type
   sums(vector_type v)
   {
      return v;
   }
}

#endif
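
/* Either path implements saturating arithmetic component-wise: e.g.
 * (illustrative) adds(30000, 10000) yields 32767 == INT16_MAX instead of
 * wrapping around, and subs(-30000, 10000) yields -32768, so large
 * accumulated conflict weights clamp at max_scalar rather than overflowing.
 */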

/**
 * Swap \p x and \p y.
 */
#define SWAP(x, y) do {                          \
      __typeof(y) _swap_tmp = y;                 \
      y = x;                                     \
      x = _swap_tmp;                             \
   } while (0)

namespace {
   /**
    * Variable-length vector type intended to represent cycle-count costs for
    * arbitrary atom-to-bank assignments.  It's indexed by a pair of integers
    * (i, p), where i is an atom index and p in {0, 1} indicates the parity of
    * the conflict (respectively, whether the cost is incurred whenever the
    * atoms are assigned the same bank b or opposite-parity banks b and b^1).
    * \sa shader_conflict_weight_matrix()
    */
   struct weight_vector_type {
      weight_vector_type() : v(NULL), size(0) {}

      weight_vector_type(unsigned n) : v(alloc(n)), size(n) {}

      weight_vector_type(const weight_vector_type &u) :
         v(alloc(u.size)), size(u.size)
      {
         memcpy(v, u.v,
                DIV_ROUND_UP(u.size, vector_width) * sizeof(vector_type));
      }

      ~weight_vector_type()
      {
         free(v);
      }

      weight_vector_type &
      operator=(weight_vector_type u)
      {
         SWAP(v, u.v);
         SWAP(size, u.size);
         return *this;
      }

      vector_type *v;
      unsigned size;

   private:
      static vector_type *
      alloc(unsigned n)
      {
         const unsigned align = MAX2(sizeof(void *), __alignof__(vector_type));
         const unsigned size = DIV_ROUND_UP(n, vector_width) * sizeof(vector_type);
         void *p;
         if (posix_memalign(&p, align, size))
            return NULL;
         memset(p, 0, size);
         return reinterpret_cast<vector_type *>(p);
      }
   };

   /**
    * Set the (i, p)-th component of weight vector \p v to \p x.
    */
   void
   set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x)
   {
      set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x);
   }

   /**
    * Get the (i, p)-th component of weight vector \p v.
    */
   scalar_type
   get(const weight_vector_type &v, unsigned i, unsigned p)
   {
      return get(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width);
   }

   /**
    * Swap the (i, p)-th and (j, q)-th components of weight vector \p v.
    */
   void
   swap(weight_vector_type &v,
        unsigned i, unsigned p,
        unsigned j, unsigned q)
   {
      const scalar_type tmp = get(v, i, p);
      set(v, i, p, get(v, j, q));
      set(v, j, q, tmp);
   }
}
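
/* Indexing note (illustrative): a weight-vector component (i, p) is stored
 * at flat index 2 * i + p, so covering m atoms takes 2 * m scalar slots in
 * DIV_ROUND_UP(2 * m, vector_width) vector_type elements.  E.g. with the
 * SSE2 path (vector_width == 16), component (3, 1) maps to flat index 7,
 * i.e. lane 7 of v.v[0].
 */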

namespace {
   /**
    * Object that represents the partitioning of an arbitrary register space
    * into indivisible units (referred to as atoms below) that can potentially
    * be rearranged independently from other registers.  The partitioning is
    * inferred from a number of contiguity requirements specified using
    * require_contiguous().  This allows efficient look-up of the atom index a
    * given register address belongs to, or conversely the range of register
    * addresses that belong to a given atom.
    */
   struct partitioning {
      /**
       * Create a (for the moment unrestricted) partitioning of a register
       * file of size \p n.  The units are arbitrary.
       */
      partitioning(unsigned n) :
         max_reg(n),
         offsets(new unsigned[n + num_terminator_atoms]),
         atoms(new unsigned[n + num_terminator_atoms])
      {
         for (unsigned i = 0; i < n + num_terminator_atoms; i++) {
            offsets[i] = i;
            atoms[i] = i;
         }
      }

      partitioning(const partitioning &p) :
         max_reg(p.max_reg),
         offsets(new unsigned[p.num_atoms() + num_terminator_atoms]),
         atoms(new unsigned[p.max_reg + num_terminator_atoms])
      {
         memcpy(offsets, p.offsets,
                sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms));
         memcpy(atoms, p.atoms,
                sizeof(unsigned) * (p.max_reg + num_terminator_atoms));
      }

      ~partitioning()
      {
         delete[] offsets;
         delete[] atoms;
      }

      partitioning &
      operator=(partitioning p)
      {
         SWAP(max_reg, p.max_reg);
         SWAP(offsets, p.offsets);
         SWAP(atoms, p.atoms);
         return *this;
      }

      /**
       * Require register range [reg, reg + n[ to be considered part of the
       * same atom.
       */
      void
      require_contiguous(unsigned reg, unsigned n)
      {
         unsigned r = atoms[reg];

         /* Renumber atoms[reg...] = { r... } and their offsets[r...] for the
          * case that the specified contiguity requirement leads to the fusion
          * (yay) of one or more existing atoms.
          */
         for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) {
            if (offsets[atoms[reg1]] < reg + n) {
               atoms[reg1] = r;
            } else {
               if (offsets[atoms[reg1 - 1]] != offsets[atoms[reg1]])
                  r++;

               offsets[r] = offsets[atoms[reg1]];
               atoms[reg1] = r;
            }
         }
      }

      /**
       * Get the atom index register address \p reg belongs to.
       */
      unsigned
      atom_of_reg(unsigned reg) const
      {
         return atoms[reg];
      }

      /**
       * Get the base register address that belongs to atom \p r.
       */
      unsigned
      reg_of_atom(unsigned r) const
      {
         return offsets[r];
      }

      /**
       * Get the size of atom \p r in register address units.
       */
      unsigned
      size_of_atom(unsigned r) const
      {
         assert(r < num_atoms());
         return reg_of_atom(r + 1) - reg_of_atom(r);
      }

      /**
       * Get the number of atoms the whole register space is partitioned into.
       */
      unsigned
      num_atoms() const
      {
         return atoms[max_reg];
      }

   private:
      /**
       * Number of trailing atoms inserted for convenience so among other
       * things we don't need to special-case the last element in
       * size_of_atom().
       */
      static const unsigned num_terminator_atoms = 1;
      unsigned max_reg;
      unsigned *offsets;
      unsigned *atoms;
   };
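
   /* Worked example (illustrative): starting from partitioning p(4), where
    * every register is initially its own atom, p.require_contiguous(1, 2)
    * fuses registers 1 and 2 into one atom, leaving atoms {0}, {1, 2} and
    * {3}: afterwards p.num_atoms() == 3, p.atom_of_reg(2) == 1 and
    * p.size_of_atom(1) == 2.
    */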

   /**
    * Only GRF sources (whether they have been register-allocated or not) can
    * possibly incur bank conflicts.
    */
   bool
   is_grf(const fs_reg &r)
   {
      return r.file == VGRF || r.file == FIXED_GRF;
   }

   /**
    * Register offset of \p r in GRF units.  Useful because the representation
    * of GRFs post-register allocation is somewhat inconsistent and depends on
    * whether the register already had a fixed GRF offset prior to register
    * allocation or whether it was part of a VGRF allocation.
    */
   unsigned
   reg_of(const fs_reg &r)
   {
      assert(is_grf(r));
      if (r.file == VGRF)
         return r.nr + r.offset / REG_SIZE;
      else
         return reg_offset(r) / REG_SIZE;
   }

   /**
    * Calculate the finest partitioning of the GRF space compatible with the
    * register contiguity requirements derived from all instructions part of
    * the program.
    */
   partitioning
   shader_reg_partitioning(const fs_visitor *v)
   {
      partitioning p(BRW_MAX_GRF);

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         if (is_grf(inst->dst))
            p.require_contiguous(reg_of(inst->dst), regs_written(inst));

         for (int i = 0; i < inst->sources; i++) {
            if (is_grf(inst->src[i]))
               p.require_contiguous(reg_of(inst->src[i]), regs_read(inst, i));
         }
      }

      return p;
   }

   /**
    * Return the set of GRF atoms that should be left untouched at their
    * original location to avoid violating hardware or software assumptions.
    */
   bool *
   shader_reg_constraints(const fs_visitor *v, const partitioning &p)
   {
      bool *constrained = new bool[p.num_atoms()]();

      /* These are read implicitly by some send-message instructions without
       * any indication at the IR level.  Assume they are unsafe to move
       * around.
       */
      for (unsigned reg = 0; reg < 2; reg++)
         constrained[p.atom_of_reg(reg)] = true;

      /* From the Intel Broadwell PRM, vol 07, section "Instruction Set
       * Reference", subsection "EUISA Instructions", Send Message (page 990):
       *
       *    "r127 must not be used for return address when there is a src and
       *     dest overlap in send instruction."
       *
       * Register allocation ensures that, so don't move r127 around to avoid
       * breaking that property.
       */
      if (v->devinfo->ver >= 8)
         constrained[p.atom_of_reg(127)] = true;

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         /* Assume that anything referenced via fixed GRFs is baked into the
          * hardware's fixed-function logic and may be unsafe to move around.
          * Also take into account the source GRF restrictions of EOT
          * send-message instructions.
          */
         if (inst->dst.file == FIXED_GRF)
            constrained[p.atom_of_reg(reg_of(inst->dst))] = true;

         for (int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file == FIXED_GRF ||
                (is_grf(inst->src[i]) && inst->eot))
               constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
         }

         /* Preserve the original allocation of VGRFs used by the barycentric
          * source of the LINTERP instruction on Gfx6, since pair-aligned
          * barycentrics allow the PLN instruction to be used.
          */
         if (v->devinfo->has_pln && v->devinfo->ver <= 6 &&
             inst->opcode == FS_OPCODE_LINTERP)
            constrained[p.atom_of_reg(reg_of(inst->src[0]))] = true;

         /* The location of the Gfx7 MRF hack registers is hard-coded in the
          * rest of the compiler back-end.  Don't attempt to move them around.
          */
         if (v->devinfo->ver >= 7) {
            assert(inst->dst.file != MRF);

            for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
               const unsigned reg = GFX7_MRF_HACK_START + inst->base_mrf + i;
               constrained[p.atom_of_reg(reg)] = true;
            }
         }
      }

      return constrained;
   }

   /**
    * Return whether the hardware will be able to prevent a bank conflict by
    * optimizing out the read cycle of a source register.  The formula was
    * found experimentally.
    */
   bool
   is_conflict_optimized_out(const intel_device_info *devinfo,
                             const fs_inst *inst)
   {
      return devinfo->ver >= 9 &&
             ((is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
                                        reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
              reg_of(inst->src[1]) == reg_of(inst->src[2]));
   }
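
   /* E.g. (illustrative) a Gfx9+ MAD whose second and third sources start
    * at the same GRF, like mad(8) dst, src0, r4, r4, only pays for a single
    * read of the duplicate register, so the pass can treat it as
    * conflict-free regardless of the bank assignment.
    */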

   /**
    * Return a matrix that allows reasonably efficient computation of the
    * cycle-count cost of bank conflicts incurred throughout the whole program
    * for any given atom-to-bank assignment.
    *
    * More precisely, if C_r_s_p is the result of this function, the total
    * cost of all bank conflicts involving any given atom r can be readily
    * recovered as follows:
    *
    *  S(B) = Sum_s_p(d_(p^B_r)_(B_s) * C_r_s_p)
    *
    * where d_i_j is the Kronecker delta, and B_r indicates the bank
    * assignment of r.  \sa delta_conflicts() for a vectorized implementation
    * of the expression above.
    *
    * FINISHME: Teach this about the Gfx10+ bank conflict rules, which are
    *           somewhat more relaxed than on previous generations.  In the
    *           meantime optimizing based on Gfx9 weights is likely to be more
    *           helpful than not optimizing at all.
    */
   weight_vector_type *
   shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p)
   {
      weight_vector_type *conflicts = new weight_vector_type[p.num_atoms()];
      for (unsigned r = 0; r < p.num_atoms(); r++)
         conflicts[r] = weight_vector_type(2 * p.num_atoms());

      /* Crude approximation of the number of times the current basic block
       * will be executed at run-time.
       */
      unsigned block_scale = 1;

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         if (inst->opcode == BRW_OPCODE_DO) {
            block_scale *= 10;

         } else if (inst->opcode == BRW_OPCODE_WHILE) {
            block_scale /= 10;

         } else if (inst->is_3src(v->devinfo) &&
                    is_grf(inst->src[1]) && is_grf(inst->src[2])) {
            const unsigned r = p.atom_of_reg(reg_of(inst->src[1]));
            const unsigned s = p.atom_of_reg(reg_of(inst->src[2]));

            /* Estimate of the cycle-count cost of incurring a bank conflict
             * for this instruction.  This is only true on the average, for a
             * sequence of back-to-back ternary instructions, since the EU
             * front-end only seems to be able to issue a new instruction at
             * an even cycle.  The cost of a bank conflict incurred by an
             * isolated ternary instruction may be higher.
             */
            const unsigned exec_size = inst->dst.component_size(inst->exec_size);
            const unsigned cycle_scale = block_scale * DIV_ROUND_UP(exec_size,
                                                                    REG_SIZE);

            /* Neglect same-atom conflicts (since they're either trivial or
             * impossible to avoid without splitting the atom), and conflicts
             * known to be optimized out by the hardware.
             */
            if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
               /* Calculate the parity of the sources relative to the start of
                * their respective atoms.  If their parity is the same (and
                * none of the atoms straddle the 2KB mark), the instruction
                * will incur a conflict iff both atoms are assigned the same
                * bank b.  If their parity is opposite, the instruction will
                * incur a conflict iff they are assigned opposite banks (b and
                * b^1).
                */
               const bool p_r = 1 & (reg_of(inst->src[1]) - p.reg_of_atom(r));
               const bool p_s = 1 & (reg_of(inst->src[2]) - p.reg_of_atom(s));
               const unsigned p = p_r ^ p_s;

               /* Calculate the updated cost of a hypothetical conflict
                * between atoms r and s.  Note that the weight matrix is
                * symmetric with respect to indices r and s by construction.
                */
               const scalar_type w = MIN2(unsigned(max_scalar),
                                          get(conflicts[r], s, p) + cycle_scale);
               set(conflicts[r], s, p, w);
               set(conflicts[s], r, p, w);
            }
         }
      }

      return conflicts;
   }
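
   /* Worked example (illustrative): a SIMD16 ternary instruction writing
    * 32-bit components has a 64-byte (two-GRF) destination, so cycle_scale
    * == 2 * block_scale, consistent with the one-cycle-per-SIMD8-op estimate
    * from the file header.  Inside a single loop (block_scale == 10) it adds
    * 20 cycles to C_r_s_p, and S(B) charges those cycles whenever
    * p ^ B_r == B_s, e.g. for p == 0 exactly when both atoms end up in the
    * same bank.
    */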

   /**
    * Return the set of GRF atoms that could potentially lead to bank
    * conflicts if laid out unfavorably in the GRF space according to
    * the specified \p conflicts matrix (\sa
    * shader_conflict_weight_matrix()).
    */
   bool *
   have_any_conflicts(const partitioning &p,
                      const weight_vector_type *conflicts)
   {
      bool *any_conflicts = new bool[p.num_atoms()]();

      for (unsigned r = 0; r < p.num_atoms(); r++) {
         const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width);
         for (unsigned s = 0; s < m; s++)
            any_conflicts[r] |= sums(conflicts[r].v[s]);
      }

      return any_conflicts;
   }

   /**
    * Calculate the difference between two S(B) cost estimates as defined
    * above (\sa shader_conflict_weight_matrix()).  This represents the
    * (partial) cycle-count benefit from moving an atom r from bank p to n.
    * The respective bank assignments Bp and Bn are encoded as the \p
    * bank_mask_p and \p bank_mask_n bitmasks for efficient computation,
    * according to the formula:
    *
    *  bank_mask(B)_s_p = -d_(p^B_r)_(B_s)
    *
    * Notice the similarity with the delta function in the S(B) expression
    * above, and how bank_mask(B) can be precomputed for every possible
    * selection of r since bank_mask(B) only depends on it via B_r that may
    * only assume one of four different values, so the caller can keep every
    * possible bank_mask(B) vector in memory without much hassle (\sa
    * bank_characteristics()).
    */
   int
   delta_conflicts(const weight_vector_type &bank_mask_p,
                   const weight_vector_type &bank_mask_n,
                   const weight_vector_type &conflicts)
   {
      const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width);
      vector_type s_p = {}, s_n = {};

      for (unsigned r = 0; r < m; r++) {
         s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r]));
         s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r]));
      }

      return sums(subs(s_p, s_n));
   }

   /**
    * Register atom permutation, represented as the start GRF offset each atom
    * is mapped into.
    */
   struct permutation {
      permutation() : v(NULL), size(0) {}

      permutation(unsigned n) :
         v(new unsigned[n]()), size(n) {}

      permutation(const permutation &p) :
         v(new unsigned[p.size]), size(p.size)
      {
         memcpy(v, p.v, p.size * sizeof(unsigned));
      }

      ~permutation()
      {
         delete[] v;
      }

      permutation &
      operator=(permutation p)
      {
         SWAP(v, p.v);
         SWAP(size, p.size);
         return *this;
      }

      unsigned *v;
      unsigned size;
   };

   /**
    * Return an identity permutation of GRF atoms.
    */
   permutation
   identity_reg_permutation(const partitioning &p)
   {
      permutation map(p.num_atoms());

      for (unsigned r = 0; r < map.size; r++)
         map.v[r] = p.reg_of_atom(r);

      return map;
   }

   /**
    * Return the bank index of GRF address \p reg, numbered according to the
    * table:
    *         Even  Odd
    *    Lo    0     1
    *    Hi    2     3
    */
   unsigned
   bank_of(unsigned reg)
   {
      return (reg & 0x40) >> 5 | (reg & 1);
   }
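
   /* For example (illustrative): bank_of(0) == 0 (Lo/Even), bank_of(1) == 1
    * (Lo/Odd), bank_of(64) == 2 (Hi/Even) and bank_of(65) == 3 (Hi/Odd).
    * Bit 0 gives the even/odd parity and bit 6 the Lo/Hi half, GRF r64 being
    * the 2KB mark since each GRF is 32 bytes wide.
    */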

   /**
    * Return bitmasks suitable for use as bank mask arguments for the
    * delta_conflicts() computation.  Note that this is just the (negative)
    * characteristic function of each bank, if you regard it as a set
    * containing all atoms assigned to it according to the \p map array.
    */
   weight_vector_type *
   bank_characteristics(const permutation &map)
   {
      weight_vector_type *banks = new weight_vector_type[4];

      for (unsigned b = 0; b < 4; b++) {
         banks[b] = weight_vector_type(2 * map.size);

         for (unsigned j = 0; j < map.size; j++) {
            for (unsigned p = 0; p < 2; p++)
               set(banks[b], j, p,
                   (b ^ p) == bank_of(map.v[j]) ? -1 : 0);
         }
      }

      return banks;
   }

   /**
    * Return an improved permutation of GRF atoms based on \p map attempting
    * to reduce the total cycle-count cost of bank conflicts greedily.
    *
    * Note that this doesn't attempt to merge multiple atoms into one, which
    * may allow it to do a better job in some cases -- it simply reorders
    * existing atoms in the GRF space without affecting their identity.
    */
   permutation
   optimize_reg_permutation(const partitioning &p,
                            const bool *constrained,
                            const weight_vector_type *conflicts,
                            permutation map)
   {
      const bool *any_conflicts = have_any_conflicts(p, conflicts);
      weight_vector_type *banks = bank_characteristics(map);

      for (unsigned r = 0; r < map.size; r++) {
         const unsigned bank_r = bank_of(map.v[r]);

         if (!constrained[r]) {
            unsigned best_s = r;
            int best_benefit = 0;

            for (unsigned s = 0; s < map.size; s++) {
               const unsigned bank_s = bank_of(map.v[s]);

               if (bank_r != bank_s && !constrained[s] &&
                   p.size_of_atom(r) == p.size_of_atom(s) &&
                   (any_conflicts[r] || any_conflicts[s])) {
                  const int benefit =
                     delta_conflicts(banks[bank_r], banks[bank_s], conflicts[r]) +
                     delta_conflicts(banks[bank_s], banks[bank_r], conflicts[s]);

                  if (benefit > best_benefit) {
                     best_s = s;
                     best_benefit = benefit;
                  }
               }
            }

            if (best_s != r) {
               for (unsigned b = 0; b < 4; b++) {
                  for (unsigned p = 0; p < 2; p++)
                     swap(banks[b], r, p, best_s, p);
               }

               SWAP(map.v[r], map.v[best_s]);
            }
         }
      }

      delete[] banks;
      delete[] any_conflicts;
      return map;
   }
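
   /* Complexity note (illustrative): the nested loops above visit O(m^2)
    * atom pairs and each delta_conflicts() call reduces a row of 2 * m
    * weights, which is the O(m^3) cubic term mentioned in the file header;
    * vectorizing the inner reduction divides it by vector_width, keeping it
    * tractable for m of the order of 100.
    */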

   /**
    * Apply the GRF atom permutation given by \p map to register \p r and
    * return the result.
    */
   fs_reg
   transform(const partitioning &p, const permutation &map, fs_reg r)
   {
      if (r.file == VGRF) {
         const unsigned reg = reg_of(r);
         const unsigned s = p.atom_of_reg(reg);
         r.nr = map.v[s] + reg - p.reg_of_atom(s);
         r.offset = r.offset % REG_SIZE;
      }

      return r;
   }
}

bool
fs_visitor::opt_bank_conflicts()
{
   assert(grf_used || !"Must be called after register allocation");

   /* No ternary instructions -- No bank conflicts. */
   if (devinfo->ver < 6)
      return false;

   const partitioning p = shader_reg_partitioning(this);
   const bool *constrained = shader_reg_constraints(this, p);
   const weight_vector_type *conflicts =
      shader_conflict_weight_matrix(this, p);
   const permutation map =
      optimize_reg_permutation(p, constrained, conflicts,
                               identity_reg_permutation(p));

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      inst->dst = transform(p, map, inst->dst);

      for (int i = 0; i < inst->sources; i++)
         inst->src[i] = transform(p, map, inst->src[i]);
   }

   delete[] conflicts;
   delete[] constrained;
   return true;
}

/**
 * Return whether the instruction incurs GRF bank conflict cycles.
 *
 * Note that this is only accurate after register allocation because otherwise
 * we don't know which bank each VGRF is going to end up aligned to.
 */
bool
has_bank_conflict(const intel_device_info *devinfo, const fs_inst *inst)
{
   return inst->is_3src(devinfo) &&
          is_grf(inst->src[1]) && is_grf(inst->src[2]) &&
          bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2])) &&
          !is_conflict_optimized_out(devinfo, inst);
}