GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/vc4/vc4_qpu_schedule.c
/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file vc4_qpu_schedule.c
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies, and make a list of the DAG heads. Heuristically
 * pick a DAG head, then put all the children that are now DAG heads into the
 * list of things to schedule.
 *
 * The goal of scheduling here is to pack pairs of operations together in a
 * single QPU instruction.
 */

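/* In outline, the flow implemented below is: wrap each queued instruction of
 * a block in a schedule_node, build the dependency DAG forward and backward,
 * compute per-node critical-path delays, then repeatedly pick a DAG head
 * (choose_instruction_to_schedule()), try to pair a second instruction with
 * it, and emit the merged QPU instruction (schedule_instructions()).
 */
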
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"
#include "util/dag.h"

static bool debug;

struct schedule_node_child;

struct schedule_node {
        struct dag_node dag;
        struct list_head link;
        struct queued_qpu_inst *inst;

        /* Longest cycles + instruction_latency() of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;

        /**
         * Which uniform from uniform_data[] this instruction read, or -1 if
         * not reading a uniform.
         */
        int uniform;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

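/* Dependency-tracking state used while building the DAG: the most recent
 * node to touch each register, accumulator, or shared hardware resource that
 * later instructions have to be ordered against.
 */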
struct schedule_state {
        struct dag *dag;
        struct schedule_node *last_r[6];
        struct schedule_node *last_ra[32];
        struct schedule_node *last_rb[32];
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        struct schedule_node *last_uniforms_reset;
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};

static void
add_dep(struct schedule_state *state,
        struct schedule_node *before,
        struct schedule_node *after,
        bool write)
{
        bool write_after_read = !write && state->dir == R;
        void *edge_data = (void *)(uintptr_t)write_after_read;

        if (!before || !after)
                return;

        assert(before != after);

        if (state->dir == F)
                dag_add_edge(&before->dag, &after->dag, edge_data);
        else
                dag_add_edge(&after->dag, &before->dag, edge_data);
}

static void
add_read_dep(struct schedule_state *state,
             struct schedule_node *before,
             struct schedule_node *after)
{
        add_dep(state, before, after, false);
}

static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}

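/* Returns whether the instruction's signal causes an implicit write of the
 * r4 accumulator (TMU result, color, or alpha-mask loads).
 */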
static bool
qpu_writes_r4(uint64_t inst)
{
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        switch (sig) {
        case QPU_SIG_COLOR_LOAD:
        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
        case QPU_SIG_ALPHA_MASK_LOAD:
                return true;
        default:
                return false;
        }
}

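/* Adds the dependencies implied by one raddr field: ordering against the
 * previous writer of that regfile address, or against the shared resource it
 * names (varying reads are ordered through r5, VPM reads against the
 * previous VPM read, uniform reads against the last uniforms-address reset).
 */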
static void
process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t raddr, bool is_a)
{
        switch (raddr) {
        case QPU_R_VARY:
                add_write_dep(state, &state->last_r[5], n);
                break;

        case QPU_R_VPM:
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case QPU_R_UNIF:
                add_read_dep(state, state->last_uniforms_reset, n);
                break;

        case QPU_R_NOP:
        case QPU_R_ELEM_QPU:
        case QPU_R_XY_PIXEL_COORD:
        case QPU_R_MS_REV_FLAGS:
                break;

        default:
                if (raddr < 32) {
                        if (is_a)
                                add_read_dep(state, state->last_ra[raddr], n);
                        else
                                add_read_dep(state, state->last_rb[raddr], n);
                } else {
                        fprintf(stderr, "unknown raddr %d\n", raddr);
                        abort();
                }
                break;
        }
}

static bool
is_tmu_write(uint32_t waddr)
{
        switch (waddr) {
        case QPU_W_TMU0_S:
        case QPU_W_TMU0_T:
        case QPU_W_TMU0_R:
        case QPU_W_TMU0_B:
        case QPU_W_TMU1_S:
        case QPU_W_TMU1_T:
        case QPU_W_TMU1_R:
        case QPU_W_TMU1_B:
                return true;
        default:
                return false;
        }
}

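/* Returns whether the instruction consumes a value from the uniform stream,
 * either through a UNIF register-file read or through a TMU write, which the
 * scheduler also assigns a uniform slot.
 */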
static bool
reads_uniform(uint64_t inst)
{
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
                return false;

        return (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_UNIF ||
                (QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF &&
                 QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM) ||
                is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_ADD)) ||
                is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
}

static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
                 uint32_t mux)
{
        if (mux != QPU_MUX_A && mux != QPU_MUX_B)
                add_read_dep(state, state->last_r[mux], n);
}

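/* Adds the write dependencies for one waddr field: the regfile address or
 * accumulator being written, or the shared resource (TMU FIFO, TLB, VPM,
 * SFU-written r4, uniforms address) that the write has to be ordered
 * against.
 */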
static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool is_add)
{
        uint64_t inst = n->inst->inst;
        bool is_a = is_add ^ ((inst & QPU_WS) != 0);

        if (waddr < 32) {
                if (is_a) {
                        add_write_dep(state, &state->last_ra[waddr], n);
                } else {
                        add_write_dep(state, &state->last_rb[waddr], n);
                }
        } else if (is_tmu_write(waddr)) {
                add_write_dep(state, &state->last_tmu_write, n);
                add_read_dep(state, state->last_uniforms_reset, n);
        } else if (qpu_waddr_is_tlb(waddr) ||
                   waddr == QPU_W_MS_FLAGS) {
                add_write_dep(state, &state->last_tlb, n);
        } else {
                switch (waddr) {
                case QPU_W_ACC0:
                case QPU_W_ACC1:
                case QPU_W_ACC2:
                case QPU_W_ACC3:
                case QPU_W_ACC5:
                        add_write_dep(state, &state->last_r[waddr - QPU_W_ACC0],
                                      n);
                        break;

                case QPU_W_VPM:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case QPU_W_VPMVCD_SETUP:
                        if (is_a)
                                add_write_dep(state, &state->last_vpm_read, n);
                        else
                                add_write_dep(state, &state->last_vpm, n);
                        break;

                case QPU_W_SFU_RECIP:
                case QPU_W_SFU_RECIPSQRT:
                case QPU_W_SFU_EXP:
                case QPU_W_SFU_LOG:
                        add_write_dep(state, &state->last_r[4], n);
                        break;

                case QPU_W_TLB_STENCIL_SETUP:
                        /* This isn't a TLB operation that does things like
                         * implicitly lock the scoreboard, but it does have to
                         * appear before TLB_Z, and each of the TLB_STENCILs
                         * has to schedule in the same order relative to each
                         * other.
                         */
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case QPU_W_MS_FLAGS:
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case QPU_W_UNIFORMS_ADDRESS:
                        add_write_dep(state, &state->last_uniforms_reset, n);
                        break;

                case QPU_W_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}

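/* Conditions other than ALWAYS/NEVER consume the flags, so order the
 * instruction after the most recent flags write.
 */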
static void
process_cond_deps(struct schedule_state *state, struct schedule_node *n,
                  uint32_t cond)
{
        switch (cond) {
        case QPU_COND_NEVER:
        case QPU_COND_ALWAYS:
                break;
        default:
                add_read_dep(state, state->last_sf, n);
                break;
        }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        uint64_t inst = n->inst->inst;
        uint32_t add_op = QPU_GET_FIELD(inst, QPU_OP_ADD);
        uint32_t mul_op = QPU_GET_FIELD(inst, QPU_OP_MUL);
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
        uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
        uint32_t mul_a = QPU_GET_FIELD(inst, QPU_MUL_A);
        uint32_t mul_b = QPU_GET_FIELD(inst, QPU_MUL_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        if (sig != QPU_SIG_LOAD_IMM) {
                process_raddr_deps(state, n, raddr_a, true);
                if (sig != QPU_SIG_SMALL_IMM &&
                    sig != QPU_SIG_BRANCH)
                        process_raddr_deps(state, n, raddr_b, false);
        }

        if (add_op != QPU_A_NOP) {
                process_mux_deps(state, n, add_a);
                process_mux_deps(state, n, add_b);
        }
        if (mul_op != QPU_M_NOP) {
                process_mux_deps(state, n, mul_a);
                process_mux_deps(state, n, mul_b);
        }

        process_waddr_deps(state, n, waddr_add, true);
        process_waddr_deps(state, n, waddr_mul, false);
        if (qpu_writes_r4(inst))
                add_write_dep(state, &state->last_r[4], n);

        switch (sig) {
        case QPU_SIG_SW_BREAKPOINT:
        case QPU_SIG_NONE:
        case QPU_SIG_SMALL_IMM:
        case QPU_SIG_LOAD_IMM:
                break;

        case QPU_SIG_THREAD_SWITCH:
        case QPU_SIG_LAST_THREAD_SWITCH:
                /* All accumulator contents and flags are undefined after the
                 * switch.
                 */
                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
                        add_write_dep(state, &state->last_r[i], n);
                add_write_dep(state, &state->last_sf, n);

                /* Scoreboard-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(state, &state->last_tlb, n);

                add_write_dep(state, &state->last_tmu_write, n);
                break;

        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_write, n);
                break;

        case QPU_SIG_COLOR_LOAD:
                add_read_dep(state, state->last_tlb, n);
                break;

        case QPU_SIG_BRANCH:
                add_read_dep(state, state->last_sf, n);
                break;

        case QPU_SIG_PROG_END:
        case QPU_SIG_WAIT_FOR_SCOREBOARD:
        case QPU_SIG_SCOREBOARD_UNLOCK:
        case QPU_SIG_COVERAGE_LOAD:
        case QPU_SIG_COLOR_LOAD_END:
        case QPU_SIG_ALPHA_MASK_LOAD:
                fprintf(stderr, "Unhandled signal bits %d\n", sig);
                abort();
        }

        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD));
        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL));
        if ((inst & QPU_SF) && sig != QPU_SIG_BRANCH)
                add_write_dep(state, &state->last_sf, n);
}

static void
calculate_forward_deps(struct vc4_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.dir = F;

        list_for_each_entry(struct schedule_node, node, schedule_list, link)
                calculate_deps(&state, node);
}

static void
calculate_reverse_deps(struct vc4_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.dir = R;

        list_for_each_entry_rev(struct schedule_node, node, schedule_list,
                                link) {
                calculate_deps(&state, (struct schedule_node *)node);
        }
}

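/* Per-tick emission state: which regfile addresses the previous instruction
 * wrote, when the SFU and the uniforms address were last written, and
 * whether a TLB (scoreboard-locking) instruction has been emitted yet.
 */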
struct choose_scoreboard {
        struct dag *dag;
        int tick;
        int last_sfu_write_tick;
        int last_uniforms_reset_tick;
        uint32_t last_waddr_a, last_waddr_b;
        bool tlb_locked;
};

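/* Returns whether the instruction would read a physical regfile address
 * written by the previous instruction, read r4 too soon after an SFU write,
 * or read a uniform too soon after a uniforms-address reset -- cases that
 * need filler instructions in between.
 */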
static bool
reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        /* Full immediate loads don't read any registers. */
        if (sig == QPU_SIG_LOAD_IMM)
                return false;

        uint32_t src_muxes[] = {
                QPU_GET_FIELD(inst, QPU_ADD_A),
                QPU_GET_FIELD(inst, QPU_ADD_B),
                QPU_GET_FIELD(inst, QPU_MUL_A),
                QPU_GET_FIELD(inst, QPU_MUL_B),
        };
        for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                if ((src_muxes[i] == QPU_MUX_A &&
                     raddr_a < 32 &&
                     scoreboard->last_waddr_a == raddr_a) ||
                    (src_muxes[i] == QPU_MUX_B &&
                     sig != QPU_SIG_SMALL_IMM &&
                     raddr_b < 32 &&
                     scoreboard->last_waddr_b == raddr_b)) {
                        return true;
                }

                if (src_muxes[i] == QPU_MUX_R4) {
                        if (scoreboard->tick -
                            scoreboard->last_sfu_write_tick <= 2) {
                                return true;
                        }
                }
        }

        if (sig == QPU_SIG_SMALL_IMM &&
            QPU_GET_FIELD(inst, QPU_SMALL_IMM) >= QPU_SMALL_IMM_MUL_ROT) {
                uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
                uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);

                if (scoreboard->last_waddr_a == mux_a + QPU_W_ACC0 ||
                    scoreboard->last_waddr_a == mux_b + QPU_W_ACC0 ||
                    scoreboard->last_waddr_b == mux_a + QPU_W_ACC0 ||
                    scoreboard->last_waddr_b == mux_b + QPU_W_ACC0) {
                        return true;
                }
        }

        if (reads_uniform(inst) &&
            scoreboard->tick - scoreboard->last_uniforms_reset_tick <= 2) {
                return true;
        }

        return false;
}

static bool
pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        return (scoreboard->tick < 2 && qpu_inst_is_tlb(inst));
}

static int
get_instruction_priority(uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
        uint32_t baseline_score;
        uint32_t next_score = 0;

        /* Schedule TLB operations as late as possible, to get more
         * parallelism between shaders.
         */
        if (qpu_inst_is_tlb(inst))
                return next_score;
        next_score++;

        /* Schedule texture read results collection late to hide latency. */
        if (sig == QPU_SIG_LOAD_TMU0 || sig == QPU_SIG_LOAD_TMU1)
                return next_score;
        next_score++;

        /* Default score for things that aren't otherwise special. */
        baseline_score = next_score;
        next_score++;

        /* Schedule texture read setup early to hide their latency better. */
        if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul))
                return next_score;
        next_score++;

        return baseline_score;
}

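/* Picks the best DAG head to schedule next (or to pair with prev_inst),
 * skipping anything that would violate the scheduling restrictions, and
 * preferring higher-priority instructions and then longer critical paths.
 */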
static struct schedule_node *
choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
                               struct list_head *schedule_list,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
         * will handle pairing it along with filling the delay slots.
         */
        if (prev_inst) {
                uint32_t prev_sig = QPU_GET_FIELD(prev_inst->inst->inst,
                                                  QPU_SIG);
                if (prev_sig == QPU_SIG_THREAD_SWITCH ||
                    prev_sig == QPU_SIG_LAST_THREAD_SWITCH) {
                        return NULL;
                }
        }

        list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                            dag.link) {
                uint64_t inst = n->inst->inst;
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                /* Don't choose the branch instruction until it's the last one
                 * left. XXX: We could potentially choose it before it's the
                 * last one, if the remaining instructions fit in the delay
                 * slots.
                 */
                if (sig == QPU_SIG_BRANCH &&
                    !list_is_singular(&scoreboard->dag->heads)) {
                        continue;
                }

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, inst))
                        continue;

                /* "A scoreboard wait must not occur in the first two
                 *  instructions of a fragment shader. This is either the
                 *  explicit Wait for Scoreboard signal or an implicit wait
                 *  with the first tile-buffer read or write instruction."
                 */
                if (pixel_scoreboard_too_soon(scoreboard, inst))
                        continue;

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        /* Don't pair up a thread switch signal -- we'll
                         * handle pairing it when we pick it on its own.
                         */
                        if (sig == QPU_SIG_THREAD_SWITCH ||
                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
                                continue;
                        }

                        if (prev_inst->uniform != -1 && n->uniform != -1)
                                continue;

                        /* Don't merge in something that will lock the TLB.
                         * Hopefully what we have in inst will release some
                         * other instructions, allowing us to delay the
                         * TLB-locking instruction until later.
                         */
                        if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
                                continue;

                        inst = qpu_merge_inst(prev_inst->inst->inst, inst);
                        if (!inst)
                                continue;
                }

                int prio = get_instruction_priority(inst);

                /* Found a valid instruction. If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        return chosen;
}

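/* Records the effects of the chosen instruction on the scoreboard: which
 * regfile addresses it wrote (accounting for the WS swap), whether it wrote
 * the SFU or the uniforms address this tick, and whether it locked the TLB.
 */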
static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                             uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

        if (!(inst & QPU_WS)) {
                scoreboard->last_waddr_a = waddr_add;
                scoreboard->last_waddr_b = waddr_mul;
        } else {
                scoreboard->last_waddr_b = waddr_add;
                scoreboard->last_waddr_a = waddr_mul;
        }

        if ((waddr_add >= QPU_W_SFU_RECIP && waddr_add <= QPU_W_SFU_LOG) ||
            (waddr_mul >= QPU_W_SFU_RECIP && waddr_mul <= QPU_W_SFU_LOG)) {
                scoreboard->last_sfu_write_tick = scoreboard->tick;
        }

        if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
            waddr_mul == QPU_W_UNIFORMS_ADDRESS) {
                scoreboard->last_uniforms_reset_tick = scoreboard->tick;
        }

        if (qpu_inst_is_tlb(inst))
                scoreboard->tlb_locked = true;
}

static void
dump_state(struct dag *dag)
{
        list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
                fprintf(stderr, " t=%4d: ", n->unblocked_time);
                vc4_qpu_disasm(&n->inst->inst, 1);
                fprintf(stderr, "\n");

                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                        struct schedule_node *child =
                                (struct schedule_node *)edge->child;
                        if (!child)
                                continue;

                        fprintf(stderr, " - ");
                        vc4_qpu_disasm(&child->inst->inst, 1);
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->dag.parent_count,
                                edge->data ? 'w' : 'r');
                }
        }
}

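/* Estimated cycles between writing waddr and the dependent instruction
 * "after" being able to use the result.
 */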
static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
{
        if (waddr < 32)
                return 2;

        /* Apply some huge latency between texture fetch requests and getting
         * their results back.
         *
         * FIXME: This is actually pretty bogus. If we do:
         *
         * mov tmu0_s, a
         * <a bit of math>
         * mov tmu0_s, b
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * we count that as worse than
         *
         * mov tmu0_s, a
         * mov tmu0_s, b
         * <lots of math>
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * because we associate the first load_tmu0 with the *second* tmu0_s.
         */
        if (waddr == QPU_W_TMU0_S) {
                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0)
                        return 100;
        }
        if (waddr == QPU_W_TMU1_S) {
                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1)
                        return 100;
        }

        switch (waddr) {
        case QPU_W_SFU_RECIP:
        case QPU_W_SFU_RECIPSQRT:
        case QPU_W_SFU_EXP:
        case QPU_W_SFU_LOG:
                return 3;
        default:
                return 1;
        }
}

static uint32_t
instruction_latency(struct schedule_node *before, struct schedule_node *after)
{
        uint64_t before_inst = before->inst->inst;
        uint64_t after_inst = after->inst->inst;

        return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD),
                                  after_inst),
                    waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL),
                                  after_inst));
}

/** Recursive computation of the delay member of a node. */
static void
compute_delay(struct dag_node *node, void *state)
{
        struct schedule_node *n = (struct schedule_node *)node;

        n->delay = 1;

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;
                n->delay = MAX2(n->delay, (child->delay +
                                           instruction_latency(n, child)));
        }
}

/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head()
 * should be called on it later to finish pruning the other edges).
 */
static void
pre_remove_head(struct dag *dag, struct schedule_node *n)
{
        list_delinit(&n->dag.link);

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                if (edge->data)
                        dag_remove_edge(dag, edge);
        }
}

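/* Accounts for a node having been scheduled at "time": raises each child's
 * unblocked_time to at least time plus the edge latency, then prunes the
 * node from the DAG so that newly unblocked children become heads.
 */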
static void
mark_instruction_scheduled(struct dag *dag,
                           uint32_t time,
                           struct schedule_node *node)
{
        if (!node)
                return;

        util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;

                if (!child)
                        continue;

                uint32_t latency = instruction_latency(node, child);

                child->unblocked_time = MAX2(child->unblocked_time,
                                             time + latency);
        }
        dag_prune_head(dag, &node->dag);
}

/**
 * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair
 * with another instruction.
 */
static void
emit_thrsw(struct vc4_compile *c,
           struct choose_scoreboard *scoreboard,
           uint64_t inst)
{
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        /* There should be nothing in a thrsw inst being scheduled other than
         * the signal bits.
         */
        assert(QPU_GET_FIELD(inst, QPU_OP_ADD) == QPU_A_NOP);
        assert(QPU_GET_FIELD(inst, QPU_OP_MUL) == QPU_M_NOP);

        /* Try to find an earlier scheduled instruction that we can merge the
         * thrsw into.
         */
        int thrsw_ip = c->qpu_inst_count;
        for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) {
                uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i];
                uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG);

                if (prev_sig == QPU_SIG_NONE)
                        thrsw_ip = c->qpu_inst_count - i;
        }

        if (thrsw_ip != c->qpu_inst_count) {
                /* Merge the thrsw into the existing instruction. */
                c->qpu_insts[thrsw_ip] =
                        QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG);
        } else {
                qpu_serialize_one_inst(c, inst);
                update_scoreboard_for_chosen(scoreboard, inst);
        }

        /* Fill the delay slots. */
        while (c->qpu_inst_count < thrsw_ip + 3) {
                update_scoreboard_for_chosen(scoreboard, qpu_NOP());
                qpu_serialize_one_inst(c, qpu_NOP());
        }
}

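/* Main scheduling loop for one block: repeatedly picks a ready DAG head (or
 * a NOP when nothing is ready), tries to pair a second instruction with it,
 * re-emits any uniform reads in their new order, and fills thread-switch and
 * branch delay slots.
 */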
static uint32_t
schedule_instructions(struct vc4_compile *c,
                      struct choose_scoreboard *scoreboard,
                      struct qblock *block,
                      struct list_head *schedule_list,
                      enum quniform_contents *orig_uniform_contents,
                      uint32_t *orig_uniform_data,
                      uint32_t *next_uniform)
{
        uint32_t time = 0;

        while (!list_is_empty(&scoreboard->dag->heads)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(scoreboard,
                                                       schedule_list,
                                                       NULL);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                uint64_t inst = chosen ? chosen->inst->inst : qpu_NOP();

                if (debug) {
                        fprintf(stderr, "t=%4d: current list:\n",
                                time);
                        dump_state(scoreboard->dag);
                        fprintf(stderr, "t=%4d: chose: ", time);
                        vc4_qpu_disasm(&inst, 1);
                        fprintf(stderr, "\n");
                }

                /* Schedule this instruction onto the QPU list. Also try to
                 * find an instruction to pair with it.
                 */
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
                        pre_remove_head(scoreboard->dag, chosen);
                        if (chosen->uniform != -1) {
                                c->uniform_data[*next_uniform] =
                                        orig_uniform_data[chosen->uniform];
                                c->uniform_contents[*next_uniform] =
                                        orig_uniform_contents[chosen->uniform];
                                (*next_uniform)++;
                        }

                        merge = choose_instruction_to_schedule(scoreboard,
                                                               schedule_list,
                                                               chosen);
                        if (merge) {
                                time = MAX2(merge->unblocked_time, time);
                                inst = qpu_merge_inst(inst, merge->inst->inst);
                                assert(inst != 0);
                                if (merge->uniform != -1) {
                                        c->uniform_data[*next_uniform] =
                                                orig_uniform_data[merge->uniform];
                                        c->uniform_contents[*next_uniform] =
                                                orig_uniform_contents[merge->uniform];
                                        (*next_uniform)++;
                                }

                                if (debug) {
                                        fprintf(stderr, "t=%4d: merging: ",
                                                time);
                                        vc4_qpu_disasm(&merge->inst->inst, 1);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, " resulting in: ");
                                        vc4_qpu_disasm(&inst, 1);
                                        fprintf(stderr, "\n");
                                }
                        }
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready to
                 * be scheduled. Update the children's unblocked time for this
                 * DAG edge as we do so.
                 */
                mark_instruction_scheduled(scoreboard->dag, time, chosen);
                mark_instruction_scheduled(scoreboard->dag, time, merge);

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH ||
                    QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) {
                        emit_thrsw(c, scoreboard, inst);
                } else {
                        qpu_serialize_one_inst(c, inst);
                        update_scoreboard_for_chosen(scoreboard, inst);
                }

                scoreboard->tick++;
                time++;

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) {
                        block->branch_qpu_ip = c->qpu_inst_count - 1;
                        /* Fill the delay slots.
                         *
                         * We should fill these with actual instructions,
                         * instead, but that will probably need to be done
                         * after this, once we know what the leading
                         * instructions of the successors are (so we can
                         * handle A/B register file write latency)
                         */
                        inst = qpu_NOP();
                        update_scoreboard_for_chosen(scoreboard, inst);
                        qpu_serialize_one_inst(c, inst);
                        qpu_serialize_one_inst(c, inst);
                        qpu_serialize_one_inst(c, inst);
                }
        }

        return time;
}

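/* Schedules one block: wraps its queued instructions in schedule_nodes,
 * builds the dependency DAG in both directions, computes each node's
 * critical-path delay, and runs the list scheduler over the result.
 */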
static uint32_t
qpu_schedule_instructions_block(struct vc4_compile *c,
                                struct choose_scoreboard *scoreboard,
                                struct qblock *block,
                                enum quniform_contents *orig_uniform_contents,
                                uint32_t *orig_uniform_data,
                                uint32_t *next_uniform)
{
        scoreboard->dag = dag_create(NULL);
        struct list_head setup_list;

        list_inithead(&setup_list);

        /* Wrap each instruction in a scheduler structure. */
        uint32_t next_sched_uniform = *next_uniform;
        while (!list_is_empty(&block->qpu_inst_list)) {
                struct queued_qpu_inst *inst =
                        (struct queued_qpu_inst *)block->qpu_inst_list.next;
                struct schedule_node *n = rzalloc(scoreboard->dag,
                                                  struct schedule_node);

                dag_init_node(scoreboard->dag, &n->dag);
                n->inst = inst;

                if (reads_uniform(inst->inst)) {
                        n->uniform = next_sched_uniform++;
                } else {
                        n->uniform = -1;
                }
                list_del(&inst->link);
                list_addtail(&n->link, &setup_list);
        }

        calculate_forward_deps(c, scoreboard->dag, &setup_list);
        calculate_reverse_deps(c, scoreboard->dag, &setup_list);

        dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL);

        uint32_t cycles = schedule_instructions(c, scoreboard, block,
                                                &setup_list,
                                                orig_uniform_contents,
                                                orig_uniform_data,
                                                next_uniform);

        ralloc_free(scoreboard->dag);
        scoreboard->dag = NULL;

        return cycles;
}

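/* Patches the relative branch target into each block's branch instruction
 * now that every block's start/end QPU IP is known, and sanity-checks that
 * fall-through successors start right after the branch's delay slots.
 */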
static void
qpu_set_branch_targets(struct vc4_compile *c)
{
        qir_for_each_block(block, c) {
                /* The end block of the program has no branch. */
                if (!block->successors[0])
                        continue;

                /* If there was no branch instruction, then the successor
                 * block must follow immediately after this one.
                 */
                if (block->branch_qpu_ip == ~0) {
                        assert(block->end_qpu_ip + 1 ==
                               block->successors[0]->start_qpu_ip);
                        continue;
                }

                /* Set the branch target for the block that doesn't follow
                 * immediately after ours.
                 */
                uint64_t *branch_inst = &c->qpu_insts[block->branch_qpu_ip];
                assert(QPU_GET_FIELD(*branch_inst, QPU_SIG) == QPU_SIG_BRANCH);
                assert(QPU_GET_FIELD(*branch_inst, QPU_BRANCH_TARGET) == 0);

                uint32_t branch_target =
                        (block->successors[0]->start_qpu_ip -
                         (block->branch_qpu_ip + 4)) * sizeof(uint64_t);
                *branch_inst = (*branch_inst |
                                QPU_SET_FIELD(branch_target, QPU_BRANCH_TARGET));

                /* Make sure that the if-we-don't-jump successor was scheduled
                 * just after the delay slots.
                 */
                if (block->successors[1]) {
                        assert(block->successors[1]->start_qpu_ip ==
                               block->branch_qpu_ip + 4);
                }
        }
}

uint32_t
qpu_schedule_instructions(struct vc4_compile *c)
{
        /* We reorder the uniforms as we schedule instructions, so save the
         * old data off and replace it.
         */
        uint32_t *uniform_data = c->uniform_data;
        enum quniform_contents *uniform_contents = c->uniform_contents;
        c->uniform_contents = ralloc_array(c, enum quniform_contents,
                                           c->num_uniforms);
        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
        c->uniform_array_size = c->num_uniforms;
        uint32_t next_uniform = 0;

        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
        scoreboard.last_waddr_a = ~0;
        scoreboard.last_waddr_b = ~0;
        scoreboard.last_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                qir_for_each_block(block, c) {
                        fprintf(stderr, "BLOCK %d\n", block->index);
                        list_for_each_entry(struct queued_qpu_inst, q,
                                            &block->qpu_inst_list, link) {
                                vc4_qpu_disasm(&q->inst, 1);
                                fprintf(stderr, "\n");
                        }
                }
                fprintf(stderr, "\n");
        }

        uint32_t cycles = 0;
        qir_for_each_block(block, c) {
                block->start_qpu_ip = c->qpu_inst_count;
                block->branch_qpu_ip = ~0;

                cycles += qpu_schedule_instructions_block(c,
                                                          &scoreboard,
                                                          block,
                                                          uniform_contents,
                                                          uniform_data,
                                                          &next_uniform);

                block->end_qpu_ip = c->qpu_inst_count - 1;
        }

        qpu_set_branch_targets(c);

        assert(next_uniform == c->num_uniforms);

        if (debug) {
                fprintf(stderr, "Post-schedule instructions\n");
                vc4_qpu_disasm(c->qpu_insts, c->qpu_inst_count);
                fprintf(stderr, "\n");
        }

        return cycles;
}