Path: blob/21.2-virgl/src/gallium/drivers/vc4/vc4_qir_schedule.c

/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014-2015 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file vc4_qir_schedule.c
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies from the bottom up, and make a list of the DAG
 * heads. Heuristically pick a DAG head and schedule (remove) it, then put
 * all the parents that are now DAG heads into the list of things to
 * schedule.
 *
 * The goal of scheduling here, before register allocation and conversion to
 * QPU instructions, is to reduce register pressure by reordering instructions
 * to consume values when possible.
 */

#include "vc4_qir.h"
#include "util/dag.h"

static bool debug;

struct schedule_node {
        struct dag_node dag;
        struct list_head link;
        struct qinst *inst;

        /* Length of the longest (latency) chain from a DAG head to this
         * instruction.
         */
        uint32_t delay;

        /* Longest time + latency_between(parent, this) of any parent of this
         * node.
         */
        uint32_t unblocked_time;
};

struct schedule_state {
        struct dag *dag;

        uint32_t time;

        uint32_t *temp_writes;

        BITSET_WORD *temp_live;
};
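
/* Explanatory note on the bookkeeping above: temp_writes[i] counts how many
 * still-unscheduled instructions write temp i, and temp_live marks temps
 * that some already-scheduled instruction reads. Because scheduling runs
 * bottom-up, a temp becomes "live" when one of its readers is scheduled and
 * dies when its last remaining write is scheduled, which is exactly the
 * information get_register_pressure_cost() below needs.
 */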

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

/**
 * Marks a dependency between two instructions, that \p after must appear
 * after \p before.
 *
 * Our dependencies are tracked as a DAG. Since we're scheduling bottom-up,
 * the latest instructions with nothing left to schedule are the DAG heads,
 * and their inputs are their children.
 */
static void
add_dep(enum direction dir,
        struct schedule_node *before,
        struct schedule_node *after)
{
        if (!before || !after)
                return;

        assert(before != after);

        if (dir == R) {
                struct schedule_node *t = before;
                before = after;
                after = t;
        }

        dag_add_edge(&after->dag, &before->dag, NULL);
}

static void
add_write_dep(enum direction dir,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(dir, *before, after);
        *before = after;
}
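
/* Usage sketch (illustrative): every "last_*" tracker below is driven
 * through add_write_dep(), e.g.
 *
 *     add_write_dep(dir, &state->last_vpm_write, n);
 *
 * Each call adds an edge from the previously tracked node to n and then
 * makes n the new tracked node, so repeated calls against the same tracker
 * serialize all of those instructions into program order.
 */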

struct schedule_setup_state {
        struct schedule_node **last_temp_write;
        struct schedule_node *last_sf;
        struct schedule_node *last_vary_read;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_vpm_write;
        struct schedule_node *last_tex_coord;
        struct schedule_node *last_tex_result;
        struct schedule_node *last_tlb;
        struct schedule_node *last_uniforms_reset;
        enum direction dir;

        /**
         * Texture FIFO tracking. This is done top-to-bottom, and is used to
         * track the QOP_TEX_RESULTs and add dependencies on previous ones
         * when trying to submit texture coords with TFREQ full or new texture
         * fetches with TFRCV full.
         */
        struct {
                struct schedule_node *node;
                int coords;
        } tex_fifo[8];
        int tfreq_count; /**< Number of texture coords outstanding. */
        int tfrcv_count; /**< Number of texture results outstanding. */
        int tex_fifo_pos;
};

static void
block_until_tex_result(struct schedule_setup_state *state, struct schedule_node *n)
{
        add_dep(state->dir, state->tex_fifo[0].node, n);

        state->tfreq_count -= state->tex_fifo[0].coords;
        state->tfrcv_count--;

        memmove(&state->tex_fifo[0],
                &state->tex_fifo[1],
                state->tex_fifo_pos * sizeof(state->tex_fifo[0]));
        state->tex_fifo_pos--;
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all VPM reads have to happen in order."
 */
static void
calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
{
        struct qinst *inst = n->inst;
        enum direction dir = state->dir;

        /* Add deps for temp registers and varyings accesses. Note that we
         * ignore uniforms accesses, because qir_reorder_uniforms() happens
         * after this.
         */
        for (int i = 0; i < qir_get_nsrc(inst); i++) {
                switch (inst->src[i].file) {
                case QFILE_TEMP:
                        add_dep(dir,
                                state->last_temp_write[inst->src[i].index], n);
                        break;

                case QFILE_VARY:
                        add_write_dep(dir, &state->last_vary_read, n);
                        break;

                case QFILE_VPM:
                        add_write_dep(dir, &state->last_vpm_read, n);
                        break;

                default:
                        break;
                }
        }

        switch (inst->op) {
        case QOP_VARY_ADD_C:
                add_dep(dir, state->last_vary_read, n);
                break;

        case QOP_TEX_RESULT:
                /* Results have to be fetched in order. */
                add_write_dep(dir, &state->last_tex_result, n);
                break;

        case QOP_THRSW:
                /* After a new THRSW, one must collect all texture samples
                 * queued since the previous THRSW/program start. For now, we
                 * have one THRSW in between each texture setup and its
                 * results collection as our input, and we just make sure that
                 * that ordering is maintained.
                 */
                add_write_dep(dir, &state->last_tex_coord, n);
                add_write_dep(dir, &state->last_tex_result, n);

                /* Accumulators and flags are lost across thread switches. */
                add_write_dep(dir, &state->last_sf, n);

                /* Setup, like the varyings, will need to be drained before we
                 * thread switch.
                 */
                add_write_dep(dir, &state->last_vary_read, n);

                /* The TLB-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(dir, &state->last_tlb, n);
                break;

        case QOP_TLB_COLOR_READ:
        case QOP_MS_MASK:
                add_write_dep(dir, &state->last_tlb, n);
                break;

        default:
                break;
        }

        switch (inst->dst.file) {
        case QFILE_VPM:
                add_write_dep(dir, &state->last_vpm_write, n);
                break;

        case QFILE_TEMP:
                add_write_dep(dir, &state->last_temp_write[inst->dst.index], n);
                break;

        case QFILE_TLB_COLOR_WRITE:
        case QFILE_TLB_COLOR_WRITE_MS:
        case QFILE_TLB_Z_WRITE:
        case QFILE_TLB_STENCIL_SETUP:
                add_write_dep(dir, &state->last_tlb, n);
                break;

        case QFILE_TEX_S_DIRECT:
        case QFILE_TEX_S:
        case QFILE_TEX_T:
        case QFILE_TEX_R:
        case QFILE_TEX_B:
                /* Texturing setup gets scheduled in order, because
                 * the uniforms referenced by them have to land in a
                 * specific order.
                 */
                add_write_dep(dir, &state->last_tex_coord, n);
                break;

        default:
                break;
        }

        if (qir_depends_on_flags(inst))
                add_dep(dir, state->last_sf, n);

        if (inst->sf)
                add_write_dep(dir, &state->last_sf, n);
}
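
/* Worked example of the FIFO accounting (explanatory): a plain 2D fetch
 * writes TEX_T then TEX_S, so its tex_fifo entry ends with coords == 2 and
 * tfreq_count grows by 2. With the single-threaded limit of 8 coordinate
 * slots, that allows 4 outstanding (s, t) requests, matching the spec text
 * quoted in calculate_forward_deps() below. When a FIFO fills up,
 * block_until_tex_result() retires the oldest request by making the new
 * instruction depend on that request's QOP_TEX_RESULT and shifting the
 * queue down.
 */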

static void
calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
                       struct list_head *schedule_list)
{
        struct schedule_setup_state state;

        memset(&state, 0, sizeof(state));
        state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
                                              c->num_temps);
        state.dir = F;

        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                struct qinst *inst = n->inst;

                calculate_deps(&state, n);

                for (int i = 0; i < qir_get_nsrc(inst); i++) {
                        switch (inst->src[i].file) {
                        case QFILE_UNIF:
                                add_dep(state.dir, state.last_uniforms_reset, n);
                                break;
                        default:
                                break;
                        }
                }

                switch (inst->dst.file) {
                case QFILE_TEX_S_DIRECT:
                case QFILE_TEX_S:
                case QFILE_TEX_T:
                case QFILE_TEX_R:
                case QFILE_TEX_B:
                        /* From the VC4 spec:
                         *
                         *     "The TFREQ input FIFO holds two full lots of s,
                         *      t, r, b data, plus associated setup data, per
                         *      QPU, that is, there are eight data slots. For
                         *      each texture request, slots are only consumed
                         *      for the components of s, t, r, and b actually
                         *      written. Thus the FIFO can hold four requests
                         *      of just (s, t) data, or eight requests of just
                         *      s data (for direct addressed data lookups).
                         *
                         *      Note that there is one FIFO per QPU, and the
                         *      FIFO has no concept of threads - that is,
                         *      multi-threaded shaders must be careful to use
                         *      only 1/2 the FIFO depth before reading
                         *      back. Multi-threaded programs must also
                         *      therefore always thread switch on texture
                         *      fetch as the other thread may have data
                         *      waiting in the FIFO."
                         *
                         * If the texture coordinate fifo is full, block this
                         * on the last QOP_TEX_RESULT.
                         */
                        if (state.tfreq_count == (c->fs_threaded ? 4 : 8)) {
                                block_until_tex_result(&state, n);
                        }

                        /* From the VC4 spec:
                         *
                         *     "Since the maximum number of texture requests
                         *      in the input (TFREQ) FIFO is four lots of (s,
                         *      t) data, the output (TFRCV) FIFO is sized to
                         *      hold four lots of max-size color data per
                         *      QPU. For non-float color, reads are packed
                         *      RGBA8888 data (one read per pixel). For 16-bit
                         *      float color, two reads are necessary per
                         *      pixel, with reads packed as RG1616 then
                         *      BA1616. So per QPU there are eight color slots
                         *      in the TFRCV FIFO."
                         *
                         * If the texture result fifo is full, block adding
                         * any more to it until the last QOP_TEX_RESULT.
                         */
                        if (inst->dst.file == QFILE_TEX_S ||
                            inst->dst.file == QFILE_TEX_S_DIRECT) {
                                if (state.tfrcv_count ==
                                    (c->fs_threaded ? 2 : 4))
                                        block_until_tex_result(&state, n);
                                state.tfrcv_count++;
                        }

                        state.tex_fifo[state.tex_fifo_pos].coords++;
                        state.tfreq_count++;
                        break;

                default:
                        break;
                }

                switch (inst->op) {
                case QOP_TEX_RESULT:
                        /* Results have to be fetched after the
                         * coordinate setup. Note that we're assuming
                         * here that our input shader has the texture
                         * coord setup and result fetch in order,
                         * which is true initially but not of our
                         * instruction stream after this pass.
                         */
                        add_dep(state.dir, state.last_tex_coord, n);

                        state.tex_fifo[state.tex_fifo_pos].node = n;

                        state.tex_fifo_pos++;
                        memset(&state.tex_fifo[state.tex_fifo_pos], 0,
                               sizeof(state.tex_fifo[0]));
                        break;

                case QOP_UNIFORMS_RESET:
                        add_write_dep(state.dir, &state.last_uniforms_reset, n);
                        break;

                default:
                        break;
                }
        }
}

static void
calculate_reverse_deps(struct vc4_compile *c, void *mem_ctx,
                       struct list_head *schedule_list)
{
        struct schedule_setup_state state;

        memset(&state, 0, sizeof(state));
        state.dir = R;
        state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
                                              c->num_temps);

        list_for_each_entry_rev(struct schedule_node, n, schedule_list, link) {
                calculate_deps(&state, n);
        }
}
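
/* Why two passes (explanatory note): calculate_deps() only records "n must
 * come after the tracked node" edges. Running it forward gives, e.g.,
 * read-after-write and write-after-write ordering; running it again in
 * reverse with dir == R, which swaps before/after inside add_dep(), gives
 * the mirror-image write-after-read edges without duplicating any of the
 * tracking logic.
 */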

static int
get_register_pressure_cost(struct schedule_state *state, struct qinst *inst)
{
        int cost = 0;

        if (inst->dst.file == QFILE_TEMP &&
            state->temp_writes[inst->dst.index] == 1)
                cost--;

        for (int i = 0; i < qir_get_nsrc(inst); i++) {
                if (inst->src[i].file != QFILE_TEMP ||
                    BITSET_TEST(state->temp_live, inst->src[i].index)) {
                        continue;
                }

                bool already_counted = false;
                for (int j = 0; j < i; j++) {
                        if (inst->src[i].file == inst->src[j].file &&
                            inst->src[i].index == inst->src[j].index) {
                                already_counted = true;
                        }
                }
                if (!already_counted)
                        cost++;
        }

        return cost;
}

static bool
locks_scoreboard(struct qinst *inst)
{
        if (inst->op == QOP_TLB_COLOR_READ)
                return true;

        switch (inst->dst.file) {
        case QFILE_TLB_Z_WRITE:
        case QFILE_TLB_COLOR_WRITE:
        case QFILE_TLB_COLOR_WRITE_MS:
                return true;
        default:
                return false;
        }
}

static struct schedule_node *
choose_instruction(struct schedule_state *state)
{
        struct schedule_node *chosen = NULL;

        list_for_each_entry(struct schedule_node, n, &state->dag->heads,
                            dag.link) {
                /* The branches aren't being tracked as dependencies. Make
                 * sure that they stay scheduled as the last instruction of
                 * the block, which is to say the first one we choose to
                 * schedule.
                 */
                if (n->inst->op == QOP_BRANCH)
                        return n;

                if (!chosen) {
                        chosen = n;
                        continue;
                }

                /* Prefer scheduling things that lock the scoreboard, so that
                 * they appear late in the program and we get more parallelism
                 * between shaders on multiple QPUs hitting the same fragment.
                 */
                if (locks_scoreboard(n->inst) &&
                    !locks_scoreboard(chosen->inst)) {
                        chosen = n;
                        continue;
                } else if (!locks_scoreboard(n->inst) &&
                           locks_scoreboard(chosen->inst)) {
                        continue;
                }

                /* If we would block on the previously chosen node, but would
                 * block less on this one, then prefer it.
                 */
                if (chosen->unblocked_time > state->time &&
                    n->unblocked_time < chosen->unblocked_time) {
                        chosen = n;
                        continue;
                } else if (n->unblocked_time > state->time &&
                           n->unblocked_time > chosen->unblocked_time) {
                        continue;
                }

                /* If we can definitely reduce register pressure, do so
                 * immediately.
                 */
                int register_pressure_cost =
                        get_register_pressure_cost(state, n->inst);
                int chosen_register_pressure_cost =
                        get_register_pressure_cost(state, chosen->inst);

                if (register_pressure_cost < chosen_register_pressure_cost) {
                        chosen = n;
                        continue;
                } else if (register_pressure_cost >
                           chosen_register_pressure_cost) {
                        continue;
                }

                /* Otherwise, prefer instructions with the deepest chain to
                 * the end of the program. This avoids the problem of
                 * "everything generates a temp, nothing finishes freeing one,
                 * guess I'll just keep emitting varying mul/adds".
                 */
                if (n->delay > chosen->delay) {
                        chosen = n;
                        continue;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        return chosen;
}

static void
dump_state(struct vc4_compile *c, struct schedule_state *state)
{
        uint32_t i = 0;
        list_for_each_entry(struct schedule_node, n, &state->dag->heads,
                            dag.link) {
                fprintf(stderr, "%3d: ", i++);
                qir_dump_inst(c, n->inst);
                fprintf(stderr, " (%d cost)\n",
                        get_register_pressure_cost(state, n->inst));

                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                        struct schedule_node *child =
                                (struct schedule_node *)edge->child;
                        fprintf(stderr, " - ");
                        qir_dump_inst(c, child->inst);
                        fprintf(stderr, " (%d parents)\n",
                                child->dag.parent_count);
                }
        }
}

/* Estimate of how many instructions we should schedule between operations.
 *
 * These aren't in real cycle counts, because we're just estimating cycle
 * times anyway. QIR instructions will get paired up when turned into QPU
 * instructions, or extra NOP delays will have to be added due to register
 * allocation choices.
 */
static uint32_t
latency_between(struct schedule_node *before, struct schedule_node *after)
{
        if ((before->inst->dst.file == QFILE_TEX_S ||
             before->inst->dst.file == QFILE_TEX_S_DIRECT) &&
            after->inst->op == QOP_TEX_RESULT)
                return 100;

        switch (before->inst->op) {
        case QOP_RCP:
        case QOP_RSQ:
        case QOP_EXP2:
        case QOP_LOG2:
                for (int i = 0; i < qir_get_nsrc(after->inst); i++) {
                        if (after->inst->src[i].file ==
                            before->inst->dst.file &&
                            after->inst->src[i].index ==
                            before->inst->dst.index) {
                                /* There are two QPU delay slots before we can
                                 * read a math result, which could be up to 4
                                 * QIR instructions if they packed well.
                                 */
                                return 4;
                        }
                }
                break;
        default:
                break;
        }

        return 1;
}
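
/* Worked example (explanatory): when a QOP_TEX_RESULT is scheduled at time
 * t, the coordinate write feeding it gets unblocked_time of at least
 * t + 100 (see schedule_instructions() below). Since we pick instructions
 * bottom-up, choose_instruction() then prefers other ready work over the
 * still-blocked coordinate write, so in the final program roughly 100
 * scheduling units of independent instructions land between coordinate
 * setup and result fetch, hiding the texture latency.
 */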

/** Recursive computation of the delay member of a node. */
static void
compute_delay(struct dag_node *node, void *state)
{
        struct schedule_node *n = (struct schedule_node *)node;

        /* The color read needs to be scheduled late, to avoid locking the
         * scoreboard early. This is our best tool for encouraging that. The
         * other scoreboard locking ops will have this happen by default,
         * since they are generally the DAG heads or close to them.
         */
        if (n->inst->op == QOP_TLB_COLOR_READ)
                n->delay = 1000;
        else
                n->delay = 1;

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;

                n->delay = MAX2(n->delay, (child->delay +
                                           latency_between(child, n)));
        }
}

static void
schedule_instructions(struct vc4_compile *c,
                      struct qblock *block, struct schedule_state *state)
{
        if (debug) {
                fprintf(stderr, "initial deps:\n");
                dump_state(c, state);
        }

        state->time = 0;
        while (!list_is_empty(&state->dag->heads)) {
                struct schedule_node *chosen = choose_instruction(state);
                struct qinst *inst = chosen->inst;

                if (debug) {
                        fprintf(stderr, "current list:\n");
                        dump_state(c, state);
                        fprintf(stderr, "chose: ");
                        qir_dump_inst(c, inst);
                        fprintf(stderr, " (%d cost)\n",
                                get_register_pressure_cost(state, inst));
                }

                state->time = MAX2(state->time, chosen->unblocked_time);

                /* Schedule this instruction back onto the QIR list. */
                list_add(&inst->link, &block->instructions);

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready
                 * to be scheduled. Update the children's unblocked time for
                 * this DAG edge as we do so.
                 */
                util_dynarray_foreach(&chosen->dag.edges,
                                      struct dag_edge, edge) {
                        struct schedule_node *child =
                                (struct schedule_node *)edge->child;

                        child->unblocked_time = MAX2(child->unblocked_time,
                                                     state->time +
                                                     latency_between(child,
                                                                     chosen));
                }
                dag_prune_head(state->dag, &chosen->dag);

                /* Update our tracking of register pressure. */
                for (int i = 0; i < qir_get_nsrc(inst); i++) {
                        if (inst->src[i].file == QFILE_TEMP)
                                BITSET_SET(state->temp_live, inst->src[i].index);
                }
                if (inst->dst.file == QFILE_TEMP) {
                        state->temp_writes[inst->dst.index]--;
                        if (state->temp_writes[inst->dst.index] == 0)
                                BITSET_CLEAR(state->temp_live, inst->dst.index);
                }

                state->time++;
        }
}
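
/* Ordering note (explanatory): list_add() prepends to block->instructions,
 * and choose_instruction() hands back the bottom of the program first, so
 * by the time the loop above drains the DAG the block's instruction list
 * has been rebuilt in normal top-to-bottom order.
 */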

static void
qir_schedule_instructions_block(struct vc4_compile *c,
                                struct qblock *block)
{
        struct schedule_state *state = rzalloc(NULL, struct schedule_state);

        state->temp_writes = rzalloc_array(state, uint32_t, c->num_temps);
        state->temp_live = rzalloc_array(state, BITSET_WORD,
                                         BITSET_WORDS(c->num_temps));
        state->dag = dag_create(state);

        struct list_head setup_list;
        list_inithead(&setup_list);

        /* Wrap each instruction in a scheduler structure. */
        qir_for_each_inst_safe(inst, block) {
                struct schedule_node *n = rzalloc(state, struct schedule_node);

                n->inst = inst;
                list_del(&inst->link);
                list_addtail(&n->link, &setup_list);
                dag_init_node(state->dag, &n->dag);

                if (inst->dst.file == QFILE_TEMP)
                        state->temp_writes[inst->dst.index]++;
        }

        /* Dependencies tracked top-to-bottom. */
        calculate_forward_deps(c, state, &setup_list);
        /* Dependencies tracked bottom-to-top. */
        calculate_reverse_deps(c, state, &setup_list);

        dag_traverse_bottom_up(state->dag, compute_delay, NULL);

        schedule_instructions(c, block, state);

        ralloc_free(state);
}

void
qir_schedule_instructions(struct vc4_compile *c)
{
        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                qir_dump(c);
        }

        qir_for_each_block(block, c)
                qir_schedule_instructions_block(c, block);

        if (debug) {
                fprintf(stderr, "Post-schedule instructions\n");
                qir_dump(c);
        }
}
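
/* Scope note (explanatory): the pass is purely per-basic-block. Each qblock
 * gets a fresh schedule_state and DAG, so no reordering ever crosses a
 * block boundary.
 */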