GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/intel/compiler/brw_fs_bank_conflicts.cpp
/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_bank_conflicts.cpp
 *
 * This file contains a GRF bank conflict mitigation pass. The pass is
 * intended to be run after register allocation and works by rearranging the
 * layout of the GRF space (without altering the semantics of the program) in
 * a way that minimizes the number of GRF bank conflicts incurred by ternary
 * instructions.
 *
 * Unfortunately there is close to no information about bank conflicts in the
 * hardware spec, but experimentally on Gfx7-Gfx9 ternary instructions seem to
 * incur an average bank conflict penalty of one cycle per SIMD8 op whenever
 * the second and third sources are stored in the same GRF bank (\sa bank_of()
 * for the exact bank layout), which prevents them from being fetched during
 * the same cycle by the EU, unless the EU logic manages to optimize out the
 * read cycle of a duplicate source register (\sa is_conflict_optimized_out()).
 *
 * The asymptotic run-time of the algorithm is dominated by the
 * shader_conflict_weight_matrix() computation below, which is O(n) on the
 * number of instructions in the program; however, for small and medium-sized
 * programs the run-time is likely to be dominated by
 * optimize_reg_permutation(), which is O(m^3) on the number of GRF atoms of
 * the program (\sa partitioning), which is bounded (since the program uses a
 * bounded number of registers post-regalloc) and of the order of 100. For
 * that reason optimize_reg_permutation() is vectorized in order to keep the
 * cubic term within reasonable bounds for m close to its theoretical maximum.
 */
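
/*
 * Overview sketch (illustrative only, mirroring fs_visitor::opt_bank_conflicts()
 * at the bottom of this file; the shader pointer "v" is hypothetical):
 *
 * \code
 *    const partitioning p = shader_reg_partitioning(v);
 *    const bool *constrained = shader_reg_constraints(v, p);
 *    const weight_vector_type *conflicts =
 *       shader_conflict_weight_matrix(v, p);
 *    const permutation map =
 *       optimize_reg_permutation(p, constrained, conflicts,
 *                                identity_reg_permutation(p));
 *    // Each dst and src GRF of every instruction is then rewritten with
 *    // transform(p, map, reg).
 * \endcode
 */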

#include "brw_fs.h"
#include "brw_cfg.h"

#ifdef __SSE2__

#include <emmintrin.h>

/**
 * Thin layer around vector intrinsics so they can be easily replaced with
 * e.g. the fall-back scalar path, an implementation with different vector
 * width or using different SIMD architectures (AVX-512?!).
 *
 * This implementation operates on pairs of independent SSE2 integer vectors à
 * la SIMD16 for somewhat improved throughput. SSE2 is supported by virtually
 * all platforms that care about bank conflicts, so this path should almost
 * always be available in practice.
 */
namespace {
   /**
    * SIMD integer vector data type.
    */
   struct vector_type {
      __m128i v[2];
   };

   /**
    * Scalar data type matching the representation of a single component of \p
    * vector_type.
    */
   typedef int16_t scalar_type;

   /**
    * Maximum integer value representable as a \p scalar_type.
    */
   const scalar_type max_scalar = INT16_MAX;

   /**
    * Number of components of a \p vector_type.
    */
   const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type);

   /**
    * Set the i-th component of vector \p v to \p x.
    */
   void
   set(vector_type &v, unsigned i, scalar_type x)
   {
      assert(i < vector_width);
      memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x));
   }

   /**
    * Get the i-th component of vector \p v.
    */
   scalar_type
   get(const vector_type &v, unsigned i)
   {
      assert(i < vector_width);
      scalar_type x;
      memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x));
      return x;
   }

   /**
    * Add two vectors with saturation.
    */
   vector_type
   adds(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_adds_epi16(v.v[0], w.v[0]),
         _mm_adds_epi16(v.v[1], w.v[1])
      }};
      return u;
   }

   /**
    * Subtract two vectors with saturation.
    */
   vector_type
   subs(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_subs_epi16(v.v[0], w.v[0]),
         _mm_subs_epi16(v.v[1], w.v[1])
      }};
      return u;
   }

   /**
    * Compute the bitwise conjunction of two vectors.
    */
   vector_type
   mask(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_and_si128(v.v[0], w.v[0]),
         _mm_and_si128(v.v[1], w.v[1])
      }};
      return u;
   }

   /**
    * Reduce the components of a vector using saturating addition.
    */
   scalar_type
   sums(const vector_type &v)
   {
      const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]);
      const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e));
      const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1));
      const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1));
      return _mm_extract_epi16(v1, 0);
   }
}

#else

/**
 * Thin layer around vector intrinsics so they can be easily replaced with
 * e.g. the fall-back scalar path, an implementation with different vector
 * width or using different SIMD architectures (AVX-512?!).
 *
 * This implementation operates on scalar values and doesn't rely on
 * any vector extensions. This is mainly intended for debugging and
 * to keep this file building on exotic platforms.
 */
namespace {
   /**
    * SIMD integer vector data type.
    */
   typedef int16_t vector_type;

   /**
    * Scalar data type matching the representation of a single component of \p
    * vector_type.
    */
   typedef int16_t scalar_type;

   /**
    * Maximum integer value representable as a \p scalar_type.
    */
   const scalar_type max_scalar = INT16_MAX;

   /**
    * Number of components of a \p vector_type.
    */
   const unsigned vector_width = 1;

   /**
    * Set the i-th component of vector \p v to \p x.
    */
   void
   set(vector_type &v, unsigned i, scalar_type x)
   {
      assert(i < vector_width);
      v = x;
   }

   /**
    * Get the i-th component of vector \p v.
    */
   scalar_type
   get(const vector_type &v, unsigned i)
   {
      assert(i < vector_width);
      return v;
   }

   /**
    * Add two vectors with saturation.
    */
   vector_type
   adds(vector_type v, vector_type w)
   {
      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w));
   }

   /**
    * Subtract two vectors with saturation.
    */
   vector_type
   subs(vector_type v, vector_type w)
   {
      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w));
   }

   /**
    * Compute the bitwise conjunction of two vectors.
    */
   vector_type
   mask(vector_type v, vector_type w)
   {
      return v & w;
   }

   /**
    * Reduce the components of a vector using saturating addition.
    */
   scalar_type
   sums(vector_type v)
   {
      return v;
   }
}

#endif
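
/*
 * Usage sketch for the wrapper above (illustrative only; the values are
 * arbitrary and the example works for either implementation):
 *
 * \code
 *    vector_type a = {}, b = {};
 *    for (unsigned i = 0; i < vector_width; i++) {
 *       set(a, i, scalar_type(i));       // a = { 0, 1, 2, ... }
 *       set(b, i, max_scalar);           // force saturation on addition
 *    }
 *    const vector_type c = adds(a, b);   // every component clamps at INT16_MAX
 *    const scalar_type s = sums(c);      // saturating horizontal reduction
 *    assert(get(c, 0) == max_scalar && s == max_scalar);
 * \endcode
 */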

/**
 * Swap \p x and \p y.
 */
#define SWAP(x, y) do {                          \
      __typeof(y) _swap_tmp = y;                 \
      y = x;                                     \
      x = _swap_tmp;                             \
   } while (0)

namespace {
   /**
    * Variable-length vector type intended to represent cycle-count costs for
    * arbitrary atom-to-bank assignments. It's indexed by a pair of integers
    * (i, p), where i is an atom index and p in {0, 1} indicates the parity of
    * the conflict (respectively, whether the cost is incurred whenever the
    * atoms are assigned the same bank b or opposite-parity banks b and b^1).
    * \sa shader_conflict_weight_matrix()
    */
   struct weight_vector_type {
      weight_vector_type() : v(NULL), size(0) {}

      weight_vector_type(unsigned n) : v(alloc(n)), size(n) {}

      weight_vector_type(const weight_vector_type &u) :
         v(alloc(u.size)), size(u.size)
      {
         memcpy(v, u.v,
                DIV_ROUND_UP(u.size, vector_width) * sizeof(vector_type));
      }

      ~weight_vector_type()
      {
         free(v);
      }

      weight_vector_type &
      operator=(weight_vector_type u)
      {
         SWAP(v, u.v);
         SWAP(size, u.size);
         return *this;
      }

      vector_type *v;
      unsigned size;

   private:
      static vector_type *
      alloc(unsigned n)
      {
         const unsigned align = MAX2(sizeof(void *), __alignof__(vector_type));
         const unsigned size = DIV_ROUND_UP(n, vector_width) * sizeof(vector_type);
         void *p;
         if (posix_memalign(&p, align, size))
            return NULL;
         memset(p, 0, size);
         return reinterpret_cast<vector_type *>(p);
      }
   };

   /**
    * Set the (i, p)-th component of weight vector \p v to \p x.
    */
   void
   set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x)
   {
      set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x);
   }

   /**
    * Get the (i, p)-th component of weight vector \p v.
    */
   scalar_type
   get(const weight_vector_type &v, unsigned i, unsigned p)
   {
      return get(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width);
   }

   /**
    * Swap the (i, p)-th and (j, q)-th components of weight vector \p v.
    */
   void
   swap(weight_vector_type &v,
        unsigned i, unsigned p,
        unsigned j, unsigned q)
   {
      const scalar_type tmp = get(v, i, p);
      set(v, i, p, get(v, j, q));
      set(v, j, q, tmp);
   }
}
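
/*
 * Indexing sketch for the helpers above (sizes and values are hypothetical):
 * component (i, p) of a weight vector lives at flat position 2 * i + p inside
 * its underlying vector_type array.
 *
 * \code
 *    weight_vector_type w(2 * 3);   // room for atoms 0..2, parities 0..1
 *    set(w, 2, 1, 10);              // flat component 2 * 2 + 1 == 5
 *    assert(get(w, 2, 1) == 10);
 *    swap(w, 2, 1, 0, 0);           // now get(w, 0, 0) == 10
 * \endcode
 */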

namespace {
   /**
    * Object that represents the partitioning of an arbitrary register space
    * into indivisible units (referred to as atoms below) that can potentially
    * be rearranged independently from other registers. The partitioning is
    * inferred from a number of contiguity requirements specified using
    * require_contiguous(). This allows efficient look-up of the atom index a
    * given register address belongs to, or conversely the range of register
    * addresses that belong to a given atom.
    */
   struct partitioning {
      /**
       * Create a (for the moment unrestricted) partitioning of a register
       * file of size \p n. The units are arbitrary.
       */
      partitioning(unsigned n) :
         max_reg(n),
         offsets(new unsigned[n + num_terminator_atoms]),
         atoms(new unsigned[n + num_terminator_atoms])
      {
         for (unsigned i = 0; i < n + num_terminator_atoms; i++) {
            offsets[i] = i;
            atoms[i] = i;
         }
      }

      partitioning(const partitioning &p) :
         max_reg(p.max_reg),
         offsets(new unsigned[p.num_atoms() + num_terminator_atoms]),
         atoms(new unsigned[p.max_reg + num_terminator_atoms])
      {
         memcpy(offsets, p.offsets,
                sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms));
         memcpy(atoms, p.atoms,
                sizeof(unsigned) * (p.max_reg + num_terminator_atoms));
      }

      ~partitioning()
      {
         delete[] offsets;
         delete[] atoms;
      }

      partitioning &
      operator=(partitioning p)
      {
         SWAP(max_reg, p.max_reg);
         SWAP(offsets, p.offsets);
         SWAP(atoms, p.atoms);
         return *this;
      }

      /**
       * Require register range [reg, reg + n[ to be considered part of the
       * same atom.
       */
      void
      require_contiguous(unsigned reg, unsigned n)
      {
         unsigned r = atoms[reg];

         /* Renumber atoms[reg...] = { r... } and their offsets[r...] for the
          * case that the specified contiguity requirement leads to the fusion
          * (yay) of one or more existing atoms.
          */
         for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) {
            if (offsets[atoms[reg1]] < reg + n) {
               atoms[reg1] = r;
            } else {
               if (offsets[atoms[reg1 - 1]] != offsets[atoms[reg1]])
                  r++;

               offsets[r] = offsets[atoms[reg1]];
               atoms[reg1] = r;
            }
         }
      }

      /**
       * Get the atom index register address \p reg belongs to.
       */
      unsigned
      atom_of_reg(unsigned reg) const
      {
         return atoms[reg];
      }

      /**
       * Get the base register address that belongs to atom \p r.
       */
      unsigned
      reg_of_atom(unsigned r) const
      {
         return offsets[r];
      }

      /**
       * Get the size of atom \p r in register address units.
       */
      unsigned
      size_of_atom(unsigned r) const
      {
         assert(r < num_atoms());
         return reg_of_atom(r + 1) - reg_of_atom(r);
      }

      /**
       * Get the number of atoms the whole register space is partitioned into.
       */
      unsigned
      num_atoms() const
      {
         return atoms[max_reg];
      }

   private:
      /**
       * Number of trailing atoms inserted for convenience so among other
       * things we don't need to special-case the last element in
       * size_of_atom().
       */
      static const unsigned num_terminator_atoms = 1;
      unsigned max_reg;
      unsigned *offsets;
      unsigned *atoms;
   };
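
   /*
    * Worked example (a hypothetical 8-register file, not taken from any real
    * shader):
    *
    * \code
    *    partitioning p(8);            // eight single-register atoms initially
    *    p.require_contiguous(2, 3);   // registers 2..4 must stay together
    *
    *    assert(p.num_atoms() == 6);   // {0} {1} {2,3,4} {5} {6} {7}
    *    assert(p.atom_of_reg(4) == p.atom_of_reg(2));
    *    assert(p.reg_of_atom(p.atom_of_reg(2)) == 2);
    *    assert(p.size_of_atom(p.atom_of_reg(2)) == 3);
    * \endcode
    */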

   /**
    * Only GRF sources (whether they have been register-allocated or not) can
    * possibly incur bank conflicts.
    */
   bool
   is_grf(const fs_reg &r)
   {
      return r.file == VGRF || r.file == FIXED_GRF;
   }

   /**
    * Register offset of \p r in GRF units. Useful because the representation
    * of GRFs post-register allocation is somewhat inconsistent and depends on
    * whether the register already had a fixed GRF offset prior to register
    * allocation or whether it was part of a VGRF allocation.
    */
   unsigned
   reg_of(const fs_reg &r)
   {
      assert(is_grf(r));
      if (r.file == VGRF)
         return r.nr + r.offset / REG_SIZE;
      else
         return reg_offset(r) / REG_SIZE;
   }

   /**
    * Calculate the finest partitioning of the GRF space compatible with the
    * register contiguity requirements derived from all instructions part of
    * the program.
    */
   partitioning
   shader_reg_partitioning(const fs_visitor *v)
   {
      partitioning p(BRW_MAX_GRF);

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         if (is_grf(inst->dst))
            p.require_contiguous(reg_of(inst->dst), regs_written(inst));

         for (int i = 0; i < inst->sources; i++) {
            if (is_grf(inst->src[i]))
               p.require_contiguous(reg_of(inst->src[i]), regs_read(inst, i));
         }
      }

      return p;
   }

   /**
    * Return the set of GRF atoms that should be left untouched at their
    * original location to avoid violating hardware or software assumptions.
    */
   bool *
   shader_reg_constraints(const fs_visitor *v, const partitioning &p)
   {
      bool *constrained = new bool[p.num_atoms()]();

      /* These are read implicitly by some send-message instructions without
       * any indication at the IR level. Assume they are unsafe to move
       * around.
       */
      for (unsigned reg = 0; reg < 2; reg++)
         constrained[p.atom_of_reg(reg)] = true;

      /* In the Intel Broadwell PRM, vol 07, section "Instruction Set Reference",
       * subsection "EUISA Instructions", Send Message (page 990):
       *
       * "r127 must not be used for return address when there is a src and
       * dest overlap in send instruction."
       *
       * Register allocation ensures that, so don't move 127 around to avoid
       * breaking that property.
       */
      if (v->devinfo->ver >= 8)
         constrained[p.atom_of_reg(127)] = true;

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         /* Assume that anything referenced via fixed GRFs is baked into the
          * hardware's fixed-function logic and may be unsafe to move around.
          * Also take into account the source GRF restrictions of EOT
          * send-message instructions.
          */
         if (inst->dst.file == FIXED_GRF)
            constrained[p.atom_of_reg(reg_of(inst->dst))] = true;

         for (int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file == FIXED_GRF ||
                (is_grf(inst->src[i]) && inst->eot))
               constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
         }

         /* Preserve the original allocation of VGRFs used by the barycentric
          * source of the LINTERP instruction on Gfx6, since pair-aligned
          * barycentrics allow the PLN instruction to be used.
          */
         if (v->devinfo->has_pln && v->devinfo->ver <= 6 &&
             inst->opcode == FS_OPCODE_LINTERP)
            constrained[p.atom_of_reg(reg_of(inst->src[0]))] = true;

         /* The location of the Gfx7 MRF hack registers is hard-coded in the
          * rest of the compiler back-end. Don't attempt to move them around.
          */
         if (v->devinfo->ver >= 7) {
            assert(inst->dst.file != MRF);

            for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
               const unsigned reg = GFX7_MRF_HACK_START + inst->base_mrf + i;
               constrained[p.atom_of_reg(reg)] = true;
            }
         }
      }

      return constrained;
   }

   /**
    * Return whether the hardware will be able to prevent a bank conflict by
    * optimizing out the read cycle of a source register. The formula was
    * found experimentally.
    */
   bool
   is_conflict_optimized_out(const intel_device_info *devinfo,
                             const fs_inst *inst)
   {
      return devinfo->ver >= 9 &&
             ((is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
                                        reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
              reg_of(inst->src[1]) == reg_of(inst->src[2]));
   }

   /**
    * Return a matrix that allows reasonably efficient computation of the
    * cycle-count cost of bank conflicts incurred throughout the whole program
    * for any given atom-to-bank assignment.
    *
    * More precisely, if C_r_s_p is the result of this function, the total
    * cost of all bank conflicts involving any given atom r can be readily
    * recovered as follows:
    *
    *  S(B) = Sum_s_p(d_(p^B_r)_(B_s) * C_r_s_p)
    *
    * where d_i_j is the Kronecker delta, and B_r indicates the bank
    * assignment of r. \sa delta_conflicts() for a vectorized implementation
    * of the expression above.
    *
    * FINISHME: Teach this about the Gfx10+ bank conflict rules, which are
    *           somewhat more relaxed than on previous generations. In the
    *           meantime optimizing based on Gfx9 weights is likely to be more
    *           helpful than not optimizing at all.
    */
   weight_vector_type *
   shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p)
   {
      weight_vector_type *conflicts = new weight_vector_type[p.num_atoms()];
      for (unsigned r = 0; r < p.num_atoms(); r++)
         conflicts[r] = weight_vector_type(2 * p.num_atoms());

      /* Crude approximation of the number of times the current basic block
       * will be executed at run-time.
       */
      unsigned block_scale = 1;

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         if (inst->opcode == BRW_OPCODE_DO) {
            block_scale *= 10;

         } else if (inst->opcode == BRW_OPCODE_WHILE) {
            block_scale /= 10;

         } else if (inst->is_3src(v->devinfo) &&
                    is_grf(inst->src[1]) && is_grf(inst->src[2])) {
            const unsigned r = p.atom_of_reg(reg_of(inst->src[1]));
            const unsigned s = p.atom_of_reg(reg_of(inst->src[2]));

            /* Estimate of the cycle-count cost of incurring a bank conflict
             * for this instruction. This is only true on the average, for a
             * sequence of back-to-back ternary instructions, since the EU
             * front-end only seems to be able to issue a new instruction at
             * an even cycle. The cost of a bank conflict incurred by an
             * isolated ternary instruction may be higher.
             */
            const unsigned exec_size = inst->dst.component_size(inst->exec_size);
            const unsigned cycle_scale = block_scale * DIV_ROUND_UP(exec_size,
                                                                    REG_SIZE);

            /* Neglect same-atom conflicts (since they're either trivial or
             * impossible to avoid without splitting the atom), and conflicts
             * known to be optimized out by the hardware.
             */
            if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
               /* Calculate the parity of the sources relative to the start of
                * their respective atoms. If their parity is the same (and
                * none of the atoms straddle the 2KB mark), the instruction
                * will incur a conflict iff both atoms are assigned the same
                * bank b. If their parity is opposite, the instruction will
                * incur a conflict iff they are assigned opposite banks (b and
                * b^1).
                */
               const bool p_r = 1 & (reg_of(inst->src[1]) - p.reg_of_atom(r));
               const bool p_s = 1 & (reg_of(inst->src[2]) - p.reg_of_atom(s));
               const unsigned p = p_r ^ p_s;

               /* Calculate the updated cost of a hypothetical conflict
                * between atoms r and s. Note that the weight matrix is
                * symmetric with respect to indices r and s by construction.
                */
               const scalar_type w = MIN2(unsigned(max_scalar),
                                          get(conflicts[r], s, p) + cycle_scale);
               set(conflicts[r], s, p, w);
               set(conflicts[s], r, p, w);
            }
         }
      }

      return conflicts;
   }
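
   /*
    * Bookkeeping sketch for the matrix above (numbers are illustrative): a
    * single SIMD16 ternary instruction outside of any loop (block_scale == 1)
    * writing a 64-byte destination, with REG_SIZE == 32, contributes
    *
    *    cycle_scale = 1 * DIV_ROUND_UP(64, REG_SIZE) = 2
    *
    * cycles to C_r_s_p and, symmetrically, to C_s_r_p, where r and s are the
    * atoms of its second and third sources and p is the relative parity of
    * those sources within their atoms.
    */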

   /**
    * Return the set of GRF atoms that could potentially lead to bank
    * conflicts if laid out unfavorably in the GRF space according to
    * the specified \p conflicts matrix (\sa
    * shader_conflict_weight_matrix()).
    */
   bool *
   have_any_conflicts(const partitioning &p,
                      const weight_vector_type *conflicts)
   {
      bool *any_conflicts = new bool[p.num_atoms()]();

      for (unsigned r = 0; r < p.num_atoms(); r++) {
         const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width);
         for (unsigned s = 0; s < m; s++)
            any_conflicts[r] |= sums(conflicts[r].v[s]);
      }

      return any_conflicts;
   }

   /**
    * Calculate the difference between two S(B) cost estimates as defined
    * above (\sa shader_conflict_weight_matrix()). This represents the
    * (partial) cycle-count benefit from moving an atom r from bank p to n.
    * The respective bank assignments Bp and Bn are encoded as the \p
    * bank_mask_p and \p bank_mask_n bitmasks for efficient computation,
    * according to the formula:
    *
    *  bank_mask(B)_s_p = -d_(p^B_r)_(B_s)
    *
    * Notice the similarity with the delta function in the S(B) expression
    * above, and how bank_mask(B) can be precomputed for every possible
    * selection of r since bank_mask(B) only depends on it via B_r, which may
    * only assume one of four different values, so the caller can keep every
    * possible bank_mask(B) vector in memory without much hassle (\sa
    * bank_characteristics()).
    */
   int
   delta_conflicts(const weight_vector_type &bank_mask_p,
                   const weight_vector_type &bank_mask_n,
                   const weight_vector_type &conflicts)
   {
      const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width);
      vector_type s_p = {}, s_n = {};

      for (unsigned r = 0; r < m; r++) {
         s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r]));
         s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r]));
      }

      return sums(subs(s_p, s_n));
   }
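
   /*
    * How the masks act (illustrative): every component of a bank mask is
    * either 0 or ~0 (-1), so mask() behaves as a per-component select and the
    * return value is simply the summed weight of the conflicts active under
    * the old assignment minus the weight of those active under the new one,
    * i.e. the cycles saved by the move. For a single hypothetical weight of
    * 3 cycles that is active only under the old assignment, the benefit is
    * 3 - 0 = 3.
    */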

   /**
    * Register atom permutation, represented as the start GRF offset each atom
    * is mapped into.
    */
   struct permutation {
      permutation() : v(NULL), size(0) {}

      permutation(unsigned n) :
         v(new unsigned[n]()), size(n) {}

      permutation(const permutation &p) :
         v(new unsigned[p.size]), size(p.size)
      {
         memcpy(v, p.v, p.size * sizeof(unsigned));
      }

      ~permutation()
      {
         delete[] v;
      }

      permutation &
      operator=(permutation p)
      {
         SWAP(v, p.v);
         SWAP(size, p.size);
         return *this;
      }

      unsigned *v;
      unsigned size;
   };

   /**
    * Return an identity permutation of GRF atoms.
    */
   permutation
   identity_reg_permutation(const partitioning &p)
   {
      permutation map(p.num_atoms());

      for (unsigned r = 0; r < map.size; r++)
         map.v[r] = p.reg_of_atom(r);

      return map;
   }

   /**
    * Return the bank index of GRF address \p reg, numbered according to the
    * table:
    *          Even  Odd
    *    Lo      0    1
    *    Hi      2    3
    */
   unsigned
   bank_of(unsigned reg)
   {
      return (reg & 0x40) >> 5 | (reg & 1);
   }
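
   /*
    * A few illustrative values (register numbers chosen arbitrarily):
    *
    * \code
    *    assert(bank_of(0) == 0);     // low half, even
    *    assert(bank_of(1) == 1);     // low half, odd
    *    assert(bank_of(64) == 2);    // high half, even
    *    assert(bank_of(127) == 3);   // high half, odd
    * \endcode
    */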

   /**
    * Return bitmasks suitable for use as bank mask arguments for the
    * delta_conflicts() computation. Note that this is just the (negative)
    * characteristic function of each bank, if you regard it as a set
    * containing all atoms assigned to it according to the \p map array.
    */
   weight_vector_type *
   bank_characteristics(const permutation &map)
   {
      weight_vector_type *banks = new weight_vector_type[4];

      for (unsigned b = 0; b < 4; b++) {
         banks[b] = weight_vector_type(2 * map.size);

         for (unsigned j = 0; j < map.size; j++) {
            for (unsigned p = 0; p < 2; p++)
               set(banks[b], j, p,
                   (b ^ p) == bank_of(map.v[j]) ? -1 : 0);
         }
      }

      return banks;
   }

   /**
    * Return an improved permutation of GRF atoms based on \p map attempting
    * to reduce the total cycle-count cost of bank conflicts greedily.
    *
    * Note that this doesn't attempt to merge multiple atoms into one, which
    * may allow it to do a better job in some cases -- It simply reorders
    * existing atoms in the GRF space without affecting their identity.
    */
   permutation
   optimize_reg_permutation(const partitioning &p,
                            const bool *constrained,
                            const weight_vector_type *conflicts,
                            permutation map)
   {
      const bool *any_conflicts = have_any_conflicts(p, conflicts);
      weight_vector_type *banks = bank_characteristics(map);

      for (unsigned r = 0; r < map.size; r++) {
         const unsigned bank_r = bank_of(map.v[r]);

         if (!constrained[r]) {
            unsigned best_s = r;
            int best_benefit = 0;

            for (unsigned s = 0; s < map.size; s++) {
               const unsigned bank_s = bank_of(map.v[s]);

               if (bank_r != bank_s && !constrained[s] &&
                   p.size_of_atom(r) == p.size_of_atom(s) &&
                   (any_conflicts[r] || any_conflicts[s])) {
                  const int benefit =
                     delta_conflicts(banks[bank_r], banks[bank_s], conflicts[r]) +
                     delta_conflicts(banks[bank_s], banks[bank_r], conflicts[s]);

                  if (benefit > best_benefit) {
                     best_s = s;
                     best_benefit = benefit;
                  }
               }
            }

            if (best_s != r) {
               for (unsigned b = 0; b < 4; b++) {
                  for (unsigned p = 0; p < 2; p++)
                     swap(banks[b], r, p, best_s, p);
               }

               SWAP(map.v[r], map.v[best_s]);
            }
         }
      }

      delete[] banks;
      delete[] any_conflicts;
      return map;
   }

   /**
    * Apply the GRF atom permutation given by \p map to register \p r and
    * return the result.
    */
   fs_reg
   transform(const partitioning &p, const permutation &map, fs_reg r)
   {
      if (r.file == VGRF) {
         const unsigned reg = reg_of(r);
         const unsigned s = p.atom_of_reg(reg);
         r.nr = map.v[s] + reg - p.reg_of_atom(s);
         r.offset = r.offset % REG_SIZE;
      }

      return r;
   }
}

bool
fs_visitor::opt_bank_conflicts()
{
   assert(grf_used || !"Must be called after register allocation");

   /* No ternary instructions -- No bank conflicts. */
   if (devinfo->ver < 6)
      return false;

   const partitioning p = shader_reg_partitioning(this);
   const bool *constrained = shader_reg_constraints(this, p);
   const weight_vector_type *conflicts =
      shader_conflict_weight_matrix(this, p);
   const permutation map =
      optimize_reg_permutation(p, constrained, conflicts,
                               identity_reg_permutation(p));

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      inst->dst = transform(p, map, inst->dst);

      for (int i = 0; i < inst->sources; i++)
         inst->src[i] = transform(p, map, inst->src[i]);
   }

   delete[] conflicts;
   delete[] constrained;
   return true;
}

/**
 * Return whether the instruction incurs GRF bank conflict cycles.
 *
 * Note that this is only accurate after register allocation because otherwise
 * we don't know which bank each VGRF is going to end up aligned to.
 */
bool
has_bank_conflict(const intel_device_info *devinfo, const fs_inst *inst)
{
   return inst->is_3src(devinfo) &&
          is_grf(inst->src[1]) && is_grf(inst->src[2]) &&
          bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2])) &&
          !is_conflict_optimized_out(devinfo, inst);
}
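
/*
 * Example (hypothetical post-RA allocation): a ternary instruction whose
 * second source lives in r10 and whose third source lives in r20 hits the
 * same bank, since bank_of(10) == bank_of(20) == 0, so has_bank_conflict()
 * returns true for it unless one of its sources duplicates another and the
 * conflict is optimized out on Gfx9+ (\sa is_conflict_optimized_out()).
 */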