GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/intel/compiler/brw_fs_scoreboard.cpp
/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_scoreboard.cpp
 *
 * Gfx12+ hardware lacks the register scoreboard logic that used to guarantee
 * data coherency between register reads and writes in previous generations.
 * This lowering pass runs after register allocation in order to make up for
 * it.
 *
 * It works by performing global dataflow analysis in order to determine the
 * set of potential dependencies of every instruction in the shader, and then
 * inserts any required SWSB annotations and additional SYNC instructions in
 * order to guarantee data coherency.
 *
 * WARNING - Access to the following (rarely used) ARF registers is not
 *           tracked here and requires the RegDist SWSB annotation to be set
 *           to 1 by the generator in order to avoid data races:
 *
 *  - sp          stack pointer
 *  - sr0         state register
 *  - cr0         control register
 *  - ip          instruction pointer
 *  - tm0         timestamp register
 *  - dbg0        debug register
 *  - acc2-9      special accumulator registers on TGL
 *  - mme0-7      math macro extended accumulator registers
 *
 * The following ARF registers don't need to be tracked here because data
 * coherency is still provided transparently by the hardware:
 *
 *  - f0-1        flag registers
 *  - n0          notification register
 *  - tdr0        thread dependency register
 */

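/*
 * Overview of the SWSB mechanism used below (a rough summary rather than an
 * exhaustive description): an in-order dependency is expressed as a RegDist
 * annotation, which makes an instruction wait until the in-order instruction
 * a given number of positions earlier in the specified ALU pipeline has
 * completed, while an out-of-order dependency is expressed through one of
 * the 16 SBID tokens, which a producer allocates in SET mode and a consumer
 * waits on in DST mode (result written) or SRC mode (sources read).
 */
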
#include "brw_fs.h"
58
#include "brw_cfg.h"
59
60
using namespace brw;
61
62
namespace {
63
   /**
    * In-order instruction accounting.
    * @{
    */

   /**
    * Return the RegDist pipeline the hardware will synchronize with if no
    * pipeline information is provided in the SWSB annotation of an
    * instruction (e.g. when TGL_PIPE_NONE is specified in tgl_swsb).
    */
   tgl_pipe
   inferred_sync_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
   {
      if (devinfo->verx10 >= 125) {
         bool has_int_src = false, has_long_src = false;

         if (is_send(inst))
            return TGL_PIPE_NONE;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (inst->src[i].file != BAD_FILE &&
                !inst->is_control_source(i)) {
               const brw_reg_type t = inst->src[i].type;
               has_int_src |= !brw_reg_type_is_floating_point(t);
               has_long_src |= type_sz(t) >= 8;
            }
         }

         return has_long_src ? TGL_PIPE_LONG :
                has_int_src ? TGL_PIPE_INT :
                TGL_PIPE_FLOAT;

      } else {
         return TGL_PIPE_FLOAT;
      }
   }

   /**
    * Return the RegDist pipeline that will execute an instruction, or
    * TGL_PIPE_NONE if the instruction is out-of-order and doesn't use the
    * RegDist synchronization mechanism.
    */
   tgl_pipe
   inferred_exec_pipe(const struct intel_device_info *devinfo, const fs_inst *inst)
   {
      const brw_reg_type t = get_exec_type(inst);
      const bool is_dword_multiply = !brw_reg_type_is_floating_point(t) &&
         ((inst->opcode == BRW_OPCODE_MUL &&
           MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) ||
          (inst->opcode == BRW_OPCODE_MAD &&
           MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4));

      if (is_unordered(inst))
         return TGL_PIPE_NONE;
      else if (devinfo->verx10 < 125)
         return TGL_PIPE_FLOAT;
      else if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
               type_sz(t) >= 8)
         return TGL_PIPE_INT;
      else if (inst->opcode == SHADER_OPCODE_BROADCAST &&
               !devinfo->has_64bit_float && type_sz(t) >= 8)
         return TGL_PIPE_INT;
      else if (type_sz(inst->dst.type) >= 8 || type_sz(t) >= 8 ||
               is_dword_multiply)
         return TGL_PIPE_LONG;
      else if (brw_reg_type_is_floating_point(inst->dst.type))
         return TGL_PIPE_FLOAT;
      else
         return TGL_PIPE_INT;
   }

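   /* For example, under the rules above on Gfx12.5: an integer ADD with a
    * dword destination is inferred to execute on the integer pipe, a
    * single-precision MAD on the float pipe, and a dword integer multiply or
    * a typical qword-destination instruction on the long pipe (modulo the
    * MOV_INDIRECT/BROADCAST special cases handled above).  Before Gfx12.5
    * every in-order instruction is treated as executing on the single float
    * pipe.
    */
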
   /**
    * Index of the \p p pipeline counter in the ordered_address vector defined
    * below.
    */
#define IDX(p) (p >= TGL_PIPE_FLOAT ? unsigned(p - TGL_PIPE_FLOAT) : \
                (abort(), ~0u))

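   /* Concretely (assuming the tgl_pipe enumerators are declared in the order
    * FLOAT, INT, LONG, ALL): IDX(TGL_PIPE_FLOAT) == 0, IDX(TGL_PIPE_INT) == 1,
    * IDX(TGL_PIPE_LONG) == 2 and IDX(TGL_PIPE_ALL) == 3, which is also the
    * number of per-pipeline counters tracked below.
    */
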
   /**
    * Number of in-order hardware instructions for pipeline index \p p
    * contained in this IR instruction. This determines the increment applied
    * to the RegDist counter calculated for any ordered dependency that
    * crosses this instruction.
    */
   unsigned
   ordered_unit(const struct intel_device_info *devinfo, const fs_inst *inst,
                unsigned p)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_SYNC:
      case BRW_OPCODE_DO:
      case SHADER_OPCODE_UNDEF:
      case SHADER_OPCODE_HALT_TARGET:
      case FS_OPCODE_SCHEDULING_FENCE:
         return 0;
      default:
         /* Note that the following is inaccurate for virtual instructions
          * that expand to more in-order instructions than assumed here, but
          * that can only lead to suboptimal execution ordering; data
          * coherency won't be impacted. Providing exact RegDist counts for
          * each virtual instruction would allow better ALU performance, but
          * it would require keeping this switch statement in perfect sync
          * with the generator in order to avoid data corruption. The lesson
          * is (again): don't use virtual instructions if you want optimal
          * scheduling.
          */
         if (!is_unordered(inst) && (p == IDX(inferred_exec_pipe(devinfo, inst)) ||
                                     p == IDX(TGL_PIPE_ALL)))
            return 1;
         else
            return 0;
      }
   }

   /**
    * Type for an instruction counter that increments for in-order
    * instructions only, arbitrarily denoted 'jp' throughout this lowering
    * pass in order to distinguish it from the regular instruction counter.
    * This is represented as a vector with an independent counter for each
    * asynchronous ALU pipeline in the EU.
    */
   struct ordered_address {
      /**
       * Construct the ordered address of a dependency known to execute on
       * the single specified pipeline \p p: every component of the vector
       * counter is initialized to INT_MIN (always satisfied) except for
       * component IDX(p), which is set to \p jp0. If TGL_PIPE_NONE or
       * TGL_PIPE_ALL is provided instead, all components are set to INT_MIN
       * or \p jp0 respectively.
       */
      ordered_address(tgl_pipe p = TGL_PIPE_NONE, int jp0 = INT_MIN) {
         for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++)
            jp[q] = (p == TGL_PIPE_NONE || (IDX(p) != q && p != TGL_PIPE_ALL) ?
                     INT_MIN : jp0);
      }

      int jp[IDX(TGL_PIPE_ALL)];

      friend bool
      operator==(const ordered_address &jp0, const ordered_address &jp1)
      {
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
            if (jp0.jp[p] != jp1.jp[p])
               return false;
         }

         return true;
      }
   };

   /**
    * Return true if the specified ordered address is trivially satisfied for
    * all pipelines except potentially for the specified pipeline \p p.
    */
   bool
   is_single_pipe(const ordered_address &jp, tgl_pipe p)
   {
      for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
         if ((p == TGL_PIPE_NONE || IDX(p) != q) && jp.jp[q] > INT_MIN)
            return false;
      }

      return true;
   }

   /**
    * Return the number of instructions in the program.
    */
   unsigned
   num_instructions(const backend_shader *shader)
   {
      return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
   }

   /**
    * Calculate the local ordered_address instruction counter at every
    * instruction of the shader for subsequent constant-time look-up.
    */
   ordered_address *
   ordered_inst_addresses(const fs_visitor *shader)
   {
      ordered_address *jps = new ordered_address[num_instructions(shader)];
      ordered_address jp(TGL_PIPE_ALL, 0);
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         jps[ip] = jp;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            jp.jp[p] += ordered_unit(shader->devinfo, inst, p);
         ip++;
      }

      return jps;
   }

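   /* For illustration (a hypothetical straight-line program): if every
    * instruction is an in-order float-pipe instruction, only the
    * IDX(TGL_PIPE_FLOAT) component of jp advances, by one per instruction,
    * so jps[ip].jp[IDX(TGL_PIPE_FLOAT)] is simply the number of in-order
    * instructions preceding instruction ip.
    */
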
   /**
    * Synchronization mode required for data manipulated by in-order
    * instructions.
    *
    * Similar to tgl_sbid_mode, but without SET mode. Defined as a separate
    * enum for additional type safety. The hardware doesn't provide control
    * over the synchronization mode for RegDist annotations; this is only
    * used internally in this pass in order to optimize out redundant read
    * dependencies where possible.
    */
   enum tgl_regdist_mode {
      TGL_REGDIST_NULL = 0,
      TGL_REGDIST_SRC = 1,
      TGL_REGDIST_DST = 2
   };

   /**
    * Allow bitwise arithmetic of tgl_regdist_mode enums.
    */
   tgl_regdist_mode
   operator|(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) | unsigned(y));
   }

   tgl_regdist_mode
   operator&(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) & unsigned(y));
   }

   tgl_regdist_mode &
   operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x | y;
   }

   tgl_regdist_mode &
   operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x & y;
   }

   /** @} */

   /**
    * Representation of an equivalence relation among the set of unsigned
    * integers.
    *
    * Its initial state is the identity relation '~' such that i ~ j if and
    * only if i == j for every pair of unsigned integers i and j.
    */
   struct equivalence_relation {
      equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
      {
         for (unsigned i = 0; i < n; i++)
            is[i] = i;
      }

      ~equivalence_relation()
      {
         delete[] is;
      }

      /**
       * Return equivalence class index of the specified element. Effectively
       * this is the numeric value of an arbitrary representative from the
       * equivalence class.
       *
       * Allows the evaluation of the equivalence relation according to the
       * rule that i ~ j if and only if lookup(i) == lookup(j).
       */
      unsigned
      lookup(unsigned i) const
      {
         if (i < n && is[i] != i)
            return lookup(is[i]);
         else
            return i;
      }

      /**
       * Create an array with the results of the lookup() method for
       * constant-time evaluation.
       */
      unsigned *
      flatten() const
      {
         unsigned *ids = new unsigned[n];

         for (unsigned i = 0; i < n; i++)
            ids[i] = lookup(i);

         return ids;
      }

      /**
       * Mutate the existing equivalence relation minimally by imposing the
       * additional requirement that i ~ j.
       *
       * The algorithm updates the internal representation recursively in
       * order to guarantee transitivity while preserving the previously
       * specified equivalence requirements.
       */
      unsigned
      link(unsigned i, unsigned j)
      {
         const unsigned k = lookup(i);
         assign(i, k);
         assign(j, k);
         return k;
      }

   private:
      equivalence_relation(const equivalence_relation &);

      equivalence_relation &
      operator=(const equivalence_relation &);

      /**
       * Assign the representative of \p from to be equivalent to \p to.
       *
       * At the same time the data structure is partially flattened as much
       * as possible without increasing the number of recursive calls.
       */
      void
      assign(unsigned from, unsigned to)
      {
         if (from != to) {
            assert(from < n);

            if (is[from] != from)
               assign(is[from], to);

            is[from] = to;
         }
      }

      unsigned *is;
      unsigned n;
   };

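   /* For illustration: starting from the identity relation on { 0, 1, 2 },
    * link(0, 1) followed by link(2, 0) leaves
    * lookup(0) == lookup(1) == lookup(2), i.e. all three elements end up in
    * the same equivalence class, and flatten() would return the same
    * representative for each of them.
    */
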
   /**
    * Representation of a data dependency between two instructions in the
    * program.
    * @{
    */
   struct dependency {
      /**
       * No dependency information.
       */
      dependency() : ordered(TGL_REGDIST_NULL), jp(),
                     unordered(TGL_SBID_NULL), id(0),
                     exec_all(false) {}

      /**
       * Construct a dependency on the in-order instruction with the provided
       * ordered_address instruction counter.
       */
      dependency(tgl_regdist_mode mode, const ordered_address &jp,
                 bool exec_all) :
         ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
         exec_all(exec_all) {}

      /**
       * Construct a dependency on the out-of-order instruction with the
       * specified synchronization token.
       */
      dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
         ordered(TGL_REGDIST_NULL), jp(), unordered(mode), id(id),
         exec_all(exec_all) {}

      /**
       * Synchronization mode of in-order dependency, or zero if no in-order
       * dependency is present.
       */
      tgl_regdist_mode ordered;

      /**
       * Instruction counter of in-order dependency.
       *
       * For a dependency part of a different block in the program, this is
       * relative to the specific control flow path taken between the
       * dependency and the current block: It is the ordered_address such
       * that the difference between it and the ordered_address of the first
       * instruction of the current block is exactly the number of in-order
       * instructions across that control flow path. It is not guaranteed to
       * be equal to the local ordered_address of the generating instruction
       * [as returned by ordered_inst_addresses()], except for block-local
       * dependencies.
       */
      ordered_address jp;

      /**
       * Synchronization mode of unordered dependency, or zero if no
       * unordered dependency is present.
       */
      tgl_sbid_mode unordered;

      /** Synchronization token of out-of-order dependency. */
      unsigned id;

      /**
       * Whether the dependency could be run with execution masking disabled,
       * which might lead to the unwanted execution of the generating
       * instruction in cases where a BB is executed with all channels
       * disabled due to hardware bug Wa_1407528679.
       */
      bool exec_all;

      /**
       * Trivial in-order dependency that's always satisfied.
       *
       * Note that unlike a default-constructed dependency() which is also
       * trivially satisfied, this is considered to provide dependency
       * information and can be used to clear a previously pending dependency
       * via shadow().
       */
      static const dependency done;

      friend bool
      operator==(const dependency &dep0, const dependency &dep1)
      {
         return dep0.ordered == dep1.ordered &&
                dep0.jp == dep1.jp &&
                dep0.unordered == dep1.unordered &&
                dep0.id == dep1.id &&
                dep0.exec_all == dep1.exec_all;
      }

      friend bool
      operator!=(const dependency &dep0, const dependency &dep1)
      {
         return !(dep0 == dep1);
      }
   };

   const dependency dependency::done =
      dependency(TGL_REGDIST_SRC, ordered_address(), false);

   /**
    * Return whether \p dep contains any dependency information.
    */
   bool
   is_valid(const dependency &dep)
   {
      return dep.ordered || dep.unordered;
   }

   /**
    * Combine \p dep0 and \p dep1 into a single dependency object that is
    * only satisfied when both original dependencies are satisfied. This
    * might involve updating the equivalence relation \p eq in order to make
    * sure that both out-of-order dependencies are assigned the same hardware
    * SBID as synchronization token.
    */
   dependency
   merge(equivalence_relation &eq,
         const dependency &dep0, const dependency &dep1)
   {
      dependency dep;

      if (dep0.ordered || dep1.ordered) {
         dep.ordered = dep0.ordered | dep1.ordered;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            dep.jp.jp[p] = MAX2(dep0.jp.jp[p], dep1.jp.jp[p]);
      }

      if (dep0.unordered || dep1.unordered) {
         dep.unordered = dep0.unordered | dep1.unordered;
         dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
                          dep1.unordered ? dep1.id : dep0.id);
      }

      dep.exec_all = dep0.exec_all || dep1.exec_all;

      return dep;
   }

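   /* For illustration: merge() is what gets applied at control-flow join
    * points. If a GRF was last written by instruction A along one
    * predecessor and by instruction B along the other, the merged dependency
    * waits for both, by taking the per-pipeline maximum of the RegDist
    * counters and by link()ing the SBIDs of A and B into the same
    * equivalence class so that a single hardware token can stand for either.
    */
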
   /**
    * Override dependency information of \p dep0 with that of \p dep1.
    */
   dependency
   shadow(const dependency &dep0, const dependency &dep1)
   {
      return is_valid(dep1) ? dep1 : dep0;
   }

   /**
    * Translate dependency information across the program.
    *
    * This returns a dependency on the same instruction translated to the
    * ordered_address space of a different block. The correct shift for
    * transporting a dependency across an edge of the CFG is the difference
    * between the local ordered_address of the first instruction of the
    * target block and the local ordered_address of the instruction
    * immediately after the end of the origin block.
    */
   dependency
   transport(dependency dep, int delta[IDX(TGL_PIPE_ALL)])
   {
      if (dep.ordered) {
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
            if (dep.jp.jp[p] > INT_MIN)
               dep.jp.jp[p] += delta[p];
         }
      }

      return dep;
   }

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction reading the same register location.
    */
   dependency
   dependency_for_read(dependency dep)
   {
      dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction \p inst writing the same register
    * location.
    *
    * This clears any WaR dependency for writes performed from the same
    * pipeline as the read, since there is no possibility for a data hazard.
    */
   dependency
   dependency_for_write(const struct intel_device_info *devinfo,
                        const fs_inst *inst, dependency dep)
   {
      if (!is_unordered(inst) &&
          is_single_pipe(dep.jp, inferred_exec_pipe(devinfo, inst)))
         dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /** @} */

   /**
    * Scoreboard representation. This keeps track of the data dependencies
    * of registers with GRF granularity.
    */
   class scoreboard {
   public:
      /**
       * Look up the most current data dependency for register \p r.
       */
      dependency
      get(const fs_reg &r) const
      {
         if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
            return *p;
         else
            return dependency();
      }

      /**
       * Specify the most current data dependency for register \p r.
       */
      void
      set(const fs_reg &r, const dependency &d)
      {
         if (dependency *p = dep(r))
            *p = d;
      }

      /**
       * Component-wise merge() of corresponding dependencies from two
       * scoreboard objects. \sa merge().
       */
      friend scoreboard
      merge(equivalence_relation &eq,
            const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = merge(eq, sb0.accum_dep, sb1.accum_dep);

         return sb;
      }

      /**
       * Component-wise shadow() of corresponding dependencies from two
       * scoreboard objects. \sa shadow().
       */
      friend scoreboard
      shadow(const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = shadow(sb0.accum_dep, sb1.accum_dep);

         return sb;
      }

      /**
       * Component-wise transport() of dependencies from a scoreboard
       * object. \sa transport().
       */
      friend scoreboard
      transport(const scoreboard &sb0, int delta[IDX(TGL_PIPE_ALL)])
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

         sb.addr_dep = transport(sb0.addr_dep, delta);
         sb.accum_dep = transport(sb0.accum_dep, delta);

         return sb;
      }

      friend bool
      operator==(const scoreboard &sb0, const scoreboard &sb1)
      {
         for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
            if (sb0.grf_deps[i] != sb1.grf_deps[i])
               return false;
         }

         if (sb0.addr_dep != sb1.addr_dep)
            return false;

         if (sb0.accum_dep != sb1.accum_dep)
            return false;

         return true;
      }

      friend bool
      operator!=(const scoreboard &sb0, const scoreboard &sb1)
      {
         return !(sb0 == sb1);
      }

   private:
      dependency grf_deps[BRW_MAX_GRF];
      dependency addr_dep;
      dependency accum_dep;

      dependency *
      dep(const fs_reg &r)
      {
         const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                               reg_offset(r) / REG_SIZE);

         return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
                 r.file == MRF ? &grf_deps[GFX7_MRF_HACK_START + reg] :
                 r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                    reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
                 r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                    reg < BRW_ARF_FLAG ? &accum_dep :
                 NULL);
      }
   };

   /**
    * Dependency list handling.
    * @{
    */
   struct dependency_list {
      dependency_list() : deps(NULL), n(0) {}

      ~dependency_list()
      {
         free(deps);
      }

      void
      push_back(const dependency &dep)
      {
         deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
         deps[n++] = dep;
      }

      unsigned
      size() const
      {
         return n;
      }

      const dependency &
      operator[](unsigned i) const
      {
         assert(i < n);
         return deps[i];
      }

      dependency &
      operator[](unsigned i)
      {
         assert(i < n);
         return deps[i];
      }

   private:
      dependency_list(const dependency_list &);
      dependency_list &
      operator=(const dependency_list &);

      dependency *deps;
      unsigned n;
   };

   /**
    * Add dependency \p dep to the list of dependencies \p deps of an
    * instruction.
    */
   void
   add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
   {
      if (is_valid(dep)) {
         /* Translate the unordered dependency token first in order to keep
          * the list minimally redundant.
          */
         if (dep.unordered)
            dep.id = ids[dep.id];

         /* Try to combine the specified dependency with any existing ones. */
         for (unsigned i = 0; i < deps.size(); i++) {
            /* Don't combine otherwise matching dependencies if there is an
             * exec_all mismatch which would cause a SET dependency to gain an
             * exec_all flag, since that would prevent it from being baked
             * into the instruction we want to allocate an SBID for.
             */
            if (deps[i].exec_all != dep.exec_all &&
                (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
                (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
               continue;

            if (dep.ordered && deps[i].ordered) {
               for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                  deps[i].jp.jp[p] = MAX2(deps[i].jp.jp[p], dep.jp.jp[p]);

               deps[i].ordered |= dep.ordered;
               deps[i].exec_all |= dep.exec_all;
               dep.ordered = TGL_REGDIST_NULL;
            }

            if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
               deps[i].unordered |= dep.unordered;
               deps[i].exec_all |= dep.exec_all;
               dep.unordered = TGL_SBID_NULL;
            }
         }

         /* Add it to the end of the list if necessary. */
         if (is_valid(dep))
            deps.push_back(dep);
      }
   }

   /**
    * Construct a tgl_swsb annotation encoding any ordered dependencies from
    * the dependency list \p deps of an instruction with ordered_address \p
    * jp. If \p exec_all is false only dependencies known to be executed with
    * channel masking applied will be considered in the calculation.
    */
   tgl_swsb
   ordered_dependency_swsb(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      tgl_pipe p = TGL_PIPE_NONE;
      unsigned min_dist = ~0u;

      for (unsigned i = 0; i < deps.size(); i++) {
         if (deps[i].ordered && exec_all >= deps[i].exec_all) {
            for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
               const unsigned dist = jp.jp[q] - int64_t(deps[i].jp.jp[q]);
               const unsigned max_dist = (q == IDX(TGL_PIPE_LONG) ? 14 : 10);
               assert(jp.jp[q] > deps[i].jp.jp[q]);
               if (dist <= max_dist) {
                  p = (p && IDX(p) != q ? TGL_PIPE_ALL :
                       tgl_pipe(TGL_PIPE_FLOAT + q));
                  min_dist = MIN3(min_dist, dist, 7);
               }
            }
         }
      }

      return { p ? min_dist : 0, p };
   }

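   /* For illustration: if an instruction at float-pipe counter 20 depends on
    * results produced at float-pipe counters 18 and 15, the distances are 2
    * and 5 and the returned regdist is 2 (distances are clamped to at most
    * 7). Waiting on the closer producer is sufficient because in-order
    * instructions of the same pipeline complete in program order, so the
    * producer at distance 5 is guaranteed to have completed as well.
    */
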
   /**
    * Return whether the dependency list \p deps of an instruction with
    * ordered_address \p jp has any non-trivial ordered dependencies. If \p
    * exec_all is false only dependencies known to be executed with channel
    * masking applied will be considered in the calculation.
    */
   bool
   find_ordered_dependency(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      return ordered_dependency_swsb(deps, jp, exec_all).regdist;
   }

   /**
    * Return the full tgl_sbid_mode bitset for the first unordered dependency
    * on the list \p deps that matches the specified tgl_sbid_mode, or zero if
    * no such dependency is present. If \p exec_all is false only
    * dependencies known to be executed with channel masking applied will be
    * considered in the calculation.
    */
   tgl_sbid_mode
   find_unordered_dependency(const dependency_list &deps,
                             tgl_sbid_mode unordered,
                             bool exec_all)
   {
      if (unordered) {
         for (unsigned i = 0; i < deps.size(); i++) {
            if ((unordered & deps[i].unordered) &&
                exec_all >= deps[i].exec_all)
               return deps[i].unordered;
         }
      }

      return TGL_SBID_NULL;
   }

   /**
    * Return the tgl_sbid_mode bitset of an unordered dependency from the list
    * \p deps that can be represented directly in the SWSB annotation of the
    * instruction without additional SYNC instructions, or zero if no such
    * dependency is present.
    */
   tgl_sbid_mode
   baked_unordered_dependency_mode(const struct intel_device_info *devinfo,
                                   const fs_inst *inst,
                                   const dependency_list &deps,
                                   const ordered_address &jp)
   {
      const bool exec_all = inst->force_writemask_all;
      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
      const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
                                                            exec_all).pipe;

      if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
         return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
      else if (has_ordered && is_unordered(inst))
         return TGL_SBID_NULL;
      else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
               (!has_ordered || ordered_pipe == inferred_sync_pipe(devinfo, inst)))
         return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
      else if (!has_ordered)
         return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
      else
         return TGL_SBID_NULL;
   }

   /**
    * Return whether an ordered dependency from the list \p deps can be
    * represented directly in the SWSB annotation of the instruction without
    * additional SYNC instructions.
    */
   bool
   baked_ordered_dependency_mode(const struct intel_device_info *devinfo,
                                 const fs_inst *inst,
                                 const dependency_list &deps,
                                 const ordered_address &jp)
   {
      const bool exec_all = inst->force_writemask_all;
      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
      const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
                                                            exec_all).pipe;
      const tgl_sbid_mode unordered_mode =
         baked_unordered_dependency_mode(devinfo, inst, deps, jp);

      if (!has_ordered)
         return false;
      else if (!unordered_mode)
         return true;
      else
         return ordered_pipe == inferred_sync_pipe(devinfo, inst) &&
                unordered_mode == (is_unordered(inst) ? TGL_SBID_SET :
                                   TGL_SBID_DST);
   }

   /** @} */

   /**
    * Shader instruction dependency calculation.
    * @{
    */

   /**
    * Update scoreboard object \p sb to account for the execution of
    * instruction \p inst.
    */
   void
   update_inst_scoreboard(const fs_visitor *shader, const ordered_address *jps,
                          const fs_inst *inst, unsigned ip, scoreboard &sb)
   {
      const bool exec_all = inst->force_writemask_all;
      const struct intel_device_info *devinfo = shader->devinfo;
      const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
      const ordered_address jp = p ? ordered_address(p, jps[ip].jp[IDX(p)]) :
                                     ordered_address();

      /* Track any source registers that may be fetched asynchronously by
       * this instruction; otherwise clear the dependency in order to avoid
       * subsequent redundant synchronization.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         const dependency rd_dep =
            (inst->is_payload(i) ||
             inst->is_math()) ? dependency(TGL_SBID_SRC, ip, exec_all) :
            ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL)) ?
               dependency(TGL_REGDIST_SRC, jp, exec_all) :
               dependency::done;

         for (unsigned j = 0; j < regs_read(inst, i); j++)
            sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep);
      }

      if (inst->reads_accumulator_implicitly())
         sb.set(brw_acc_reg(8), dependency(TGL_REGDIST_SRC, jp, exec_all));

      if (is_send(inst) && inst->base_mrf != -1) {
         const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);

         for (unsigned j = 0; j < inst->mlen; j++)
            sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
      }

      /* Track any destination registers of this instruction. */
      const dependency wr_dep =
         is_unordered(inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
         ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL)) ?
            dependency(TGL_REGDIST_DST, jp, exec_all) :
            dependency();

      if (inst->writes_accumulator_implicitly(devinfo))
         sb.set(brw_acc_reg(8), wr_dep);

      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
          !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
      }
   }

   /**
    * Calculate scoreboard objects locally that represent any pending (and
    * unconditionally resolved) dependencies at the end of each block of the
    * program.
    */
   scoreboard *
   gather_block_scoreboards(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
         update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);

      return sbs;
   }

   /**
    * Propagate data dependencies globally through the control flow graph
    * until a fixed point is reached.
    *
    * Calculates the set of dependencies potentially pending at the beginning
    * of each block, and returns it as an array of scoreboard objects.
    */
   scoreboard *
   propagate_block_scoreboards(const fs_visitor *shader,
                               const ordered_address *jps,
                               equivalence_relation &eq)
   {
      const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
      scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
      scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

      for (bool progress = true; progress;) {
         progress = false;

         foreach_block(block, shader->cfg) {
            const scoreboard sb = shadow(in_sbs[block->num],
                                         delta_sbs[block->num]);

            if (sb != out_sbs[block->num]) {
               foreach_list_typed(bblock_link, child_link, link,
                                  &block->children) {
                  scoreboard &in_sb = in_sbs[child_link->block->num];
                  int delta[IDX(TGL_PIPE_ALL)];

                  for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                     delta[p] = jps[child_link->block->start_ip].jp[p]
                        - jps[block->end_ip].jp[p]
                        - ordered_unit(shader->devinfo,
                                       static_cast<const fs_inst *>(block->end()), p);

                  in_sb = merge(eq, in_sb, transport(sb, delta));
               }

               out_sbs[block->num] = sb;
               progress = true;
            }
         }
      }

      delete[] delta_sbs;
      delete[] out_sbs;

      return in_sbs;
   }

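   /* In dataflow terms this is a forward fixed-point iteration: the
    * per-block scoreboards computed by gather_block_scoreboards() act as the
    * transfer function (applied via shadow()), merge() is the join operator
    * at control-flow convergence points, and transport() re-expresses the
    * ordered counters in the successor block's address space.
    */
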
   /**
    * Return the list of potential dependencies of each instruction in the
    * shader based on the result of global dependency analysis.
    */
   dependency_list *
   gather_inst_dependencies(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      equivalence_relation eq(num_instructions(shader));
      scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
      const unsigned *ids = eq.flatten();
      dependency_list *deps = new dependency_list[num_instructions(shader)];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
         scoreboard &sb = sbs[block->num];

         for (unsigned i = 0; i < inst->sources; i++) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
         }

         if (inst->reads_accumulator_implicitly()) {
            /* Wa_22012725308:
             *
             * "When the accumulator registers are used as source and/or
             *  destination, hardware does not ensure prevention of write
             *  after read hazard across execution pipes."
             */
            const dependency dep = sb.get(brw_acc_reg(8));
            if (dep.ordered && !is_single_pipe(dep.jp, p))
               add_dependency(ids, deps[ip], dep);
         }

         if (is_send(inst) && inst->base_mrf != -1) {
            for (unsigned j = 0; j < inst->mlen; j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
         }

         if (is_unordered(inst))
            add_dependency(ids, deps[ip],
                           dependency(TGL_SBID_SET, ip, exec_all));

         if (!inst->no_dd_check) {
            if (inst->dst.file != BAD_FILE && !inst->dst.is_null() &&
                !inst->dst.is_accumulator()) {
               for (unsigned j = 0; j < regs_written(inst); j++) {
                  add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
                     sb.get(byte_offset(inst->dst, REG_SIZE * j))));
               }
            }

            if (inst->writes_accumulator_implicitly(devinfo) ||
                inst->dst.is_accumulator()) {
               /* Wa_22012725308:
                *
                * "When the accumulator registers are used as source and/or
                *  destination, hardware does not ensure prevention of write
                *  after read hazard across execution pipes."
                */
               const dependency dep = sb.get(brw_acc_reg(8));
               if (dep.ordered && !is_single_pipe(dep.jp, p))
                  add_dependency(ids, deps[ip], dep);
            }

            if (is_send(inst) && inst->base_mrf != -1) {
               for (unsigned j = 0; j < inst->implied_mrf_writes(); j++)
                  add_dependency(ids, deps[ip], dependency_for_write(devinfo, inst,
                     sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
            }
         }

         update_inst_scoreboard(shader, jps, inst, ip, sb);
         ip++;
      }

      delete[] sbs;
      delete[] ids;

      return deps;
   }

   /** @} */

   /**
    * Allocate SBID tokens to track the execution of every out-of-order
    * instruction of the shader.
    */
   dependency_list *
   allocate_inst_dependencies(const fs_visitor *shader,
                              const dependency_list *deps0)
   {
      /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally
       *       in shaders with a large number of SEND messages.
       */

      /* Allocate an unordered dependency ID to hardware SBID translation
       * table with as many entries as there are instructions in the shader,
       * which is the maximum number of unordered IDs we can find in the
       * program.
       */
      unsigned *ids = new unsigned[num_instructions(shader)];
      for (unsigned ip = 0; ip < num_instructions(shader); ip++)
         ids[ip] = ~0u;

      dependency_list *deps1 = new dependency_list[num_instructions(shader)];
      unsigned next_id = 0;

      for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
         for (unsigned i = 0; i < deps0[ip].size(); i++) {
            const dependency &dep = deps0[ip][i];

            if (dep.unordered && ids[dep.id] == ~0u)
               ids[dep.id] = (next_id++) & 0xf;

            add_dependency(ids, deps1[ip], dep);
         }
      }

      delete[] ids;

      return deps1;
   }

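   /* For illustration: SBID tokens are handed out in order of first
    * reference and wrap around modulo 16, matching the hardware's pool of
    * tokens, so the 17th distinct out-of-order instruction ends up sharing
    * token 0 with the first one.
    */
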
   /**
    * Emit dependency information provided by \p deps into the shader,
    * inserting additional SYNC instructions for dependencies that can't be
    * represented directly by annotating existing instructions.
    */
   void
   emit_inst_dependencies(fs_visitor *shader,
                          const ordered_address *jps,
                          const dependency_list *deps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      unsigned ip = 0;

      foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         const bool ordered_mode =
            baked_ordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         const tgl_sbid_mode unordered_mode =
            baked_unordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         tgl_swsb swsb = !ordered_mode ? tgl_swsb() :
            ordered_dependency_swsb(deps[ip], jps[ip], exec_all);

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.unordered) {
               if (unordered_mode == dep.unordered &&
                   exec_all >= dep.exec_all && !swsb.mode) {
                  /* Bake unordered dependency into the instruction's SWSB if
                   * possible, except in cases where the current instruction
                   * isn't marked NoMask but the dependency is, since that
                   * might lead to data coherency issues due to
                   * Wa_1407528679.
                   */
                  swsb.sbid = dep.id;
                  swsb.mode = dep.unordered;
               } else {
                  /* Emit dependency into the SWSB of an extra SYNC
                   * instruction.
                   */
                  const fs_builder ibld = fs_builder(shader, block, inst)
                                          .exec_all().group(1, 0);
                  fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                            brw_imm_ud(TGL_SYNC_NOP));
                  sync->sched.sbid = dep.id;
                  sync->sched.mode = dep.unordered;
                  assert(!(sync->sched.mode & TGL_SBID_SET));
               }
            }
         }

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.ordered &&
                find_ordered_dependency(deps[ip], jps[ip], true) &&
                (!ordered_mode || dep.exec_all > exec_all)) {
               /* If the current instruction is not marked NoMask but an
                * ordered dependency is, perform the synchronization as a
                * separate NoMask SYNC instruction in order to avoid data
                * coherency issues due to Wa_1407528679. The similar
                * scenario with unordered dependencies should have been
                * handled above.
                */
               const fs_builder ibld = fs_builder(shader, block, inst)
                                       .exec_all().group(1, 0);
               fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC, ibld.null_reg_ud(),
                                         brw_imm_ud(TGL_SYNC_NOP));
               sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
               break;
            }
         }

         /* Update the IR. */
         inst->sched = swsb;
         inst->no_dd_check = inst->no_dd_clear = false;
         ip++;
      }
   }
}

bool
fs_visitor::lower_scoreboard()
{
   if (devinfo->ver >= 12) {
      const ordered_address *jps = ordered_inst_addresses(this);
      const dependency_list *deps0 = gather_inst_dependencies(this, jps);
      const dependency_list *deps1 = allocate_inst_dependencies(this, deps0);
      emit_inst_dependencies(this, jps, deps1);
      delete[] deps1;
      delete[] deps0;
      delete[] jps;
   }

   return true;
}