GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/vc4/vc4_qpu_schedule.c
/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file vc4_qpu_schedule.c
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies, and make a list of the DAG heads. Heuristically
 * pick a DAG head, then put all the children that are now DAG heads into the
 * list of things to schedule.
 *
 * The goal of scheduling here is to pack pairs of operations together in a
 * single QPU instruction.
 */

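/* In outline, the flow implemented below is: wrap each queued instruction of
 * a block in a schedule_node, build the dependency DAG forward and backward,
 * compute per-node critical-path delays, then repeatedly pick a DAG head
 * (choose_instruction_to_schedule()), try to pair a second instruction with
 * it, and emit the merged QPU instruction (schedule_instructions()).
 */
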
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"
#include "util/dag.h"

static bool debug;

struct schedule_node_child;

struct schedule_node {
        struct dag_node dag;
        struct list_head link;
        struct queued_qpu_inst *inst;

        /* Longest cycles + instruction_latency() of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;

        /**
         * Which uniform from uniform_data[] this instruction read, or -1 if
         * not reading a uniform.
         */
        int uniform;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

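/* Dependency-tracking state used while building the DAG: the most recent
 * node to touch each register, accumulator, or shared hardware resource that
 * later instructions have to be ordered against.
 */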
struct schedule_state {
        struct dag *dag;
        struct schedule_node *last_r[6];
        struct schedule_node *last_ra[32];
        struct schedule_node *last_rb[32];
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        struct schedule_node *last_uniforms_reset;
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};

static void
add_dep(struct schedule_state *state,
        struct schedule_node *before,
        struct schedule_node *after,
        bool write)
{
        bool write_after_read = !write && state->dir == R;
        void *edge_data = (void *)(uintptr_t)write_after_read;

        if (!before || !after)
                return;

        assert(before != after);

        if (state->dir == F)
                dag_add_edge(&before->dag, &after->dag, edge_data);
        else
                dag_add_edge(&after->dag, &before->dag, edge_data);
}

static void
add_read_dep(struct schedule_state *state,
             struct schedule_node *before,
             struct schedule_node *after)
{
        add_dep(state, before, after, false);
}

static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}

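/* Returns whether the instruction's signal causes an implicit write of the
 * r4 accumulator (TMU result, color, or alpha-mask loads).
 */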
static bool
qpu_writes_r4(uint64_t inst)
{
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        switch (sig) {
        case QPU_SIG_COLOR_LOAD:
        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
        case QPU_SIG_ALPHA_MASK_LOAD:
                return true;
        default:
                return false;
        }
}

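/* Adds the dependencies implied by one raddr field: ordering against the
 * previous writer of that regfile address, or against the shared resource it
 * names (varying reads are ordered through r5, VPM reads against the
 * previous VPM read, uniform reads against the last uniforms-address reset).
 */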
static void
process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t raddr, bool is_a)
{
        switch (raddr) {
        case QPU_R_VARY:
                add_write_dep(state, &state->last_r[5], n);
                break;

        case QPU_R_VPM:
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case QPU_R_UNIF:
                add_read_dep(state, state->last_uniforms_reset, n);
                break;

        case QPU_R_NOP:
        case QPU_R_ELEM_QPU:
        case QPU_R_XY_PIXEL_COORD:
        case QPU_R_MS_REV_FLAGS:
                break;

        default:
                if (raddr < 32) {
                        if (is_a)
                                add_read_dep(state, state->last_ra[raddr], n);
                        else
                                add_read_dep(state, state->last_rb[raddr], n);
                } else {
                        fprintf(stderr, "unknown raddr %d\n", raddr);
                        abort();
                }
                break;
        }
}

static bool
is_tmu_write(uint32_t waddr)
{
        switch (waddr) {
        case QPU_W_TMU0_S:
        case QPU_W_TMU0_T:
        case QPU_W_TMU0_R:
        case QPU_W_TMU0_B:
        case QPU_W_TMU1_S:
        case QPU_W_TMU1_T:
        case QPU_W_TMU1_R:
        case QPU_W_TMU1_B:
                return true;
        default:
                return false;
        }
}

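/* Returns whether the instruction consumes a value from the uniform stream,
 * either through a UNIF register-file read or through a TMU write, which the
 * scheduler also assigns a uniform slot.
 */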
static bool
reads_uniform(uint64_t inst)
{
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
                return false;

        return (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_UNIF ||
                (QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF &&
                 QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM) ||
                is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_ADD)) ||
                is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
}

static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
                 uint32_t mux)
{
        if (mux != QPU_MUX_A && mux != QPU_MUX_B)
                add_read_dep(state, state->last_r[mux], n);
}

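/* Adds the write dependencies for one waddr field: the regfile address or
 * accumulator being written, or the shared resource (TMU FIFO, TLB, VPM,
 * SFU-written r4, uniforms address) that the write has to be ordered
 * against.
 */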
static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool is_add)
{
        uint64_t inst = n->inst->inst;
        bool is_a = is_add ^ ((inst & QPU_WS) != 0);

        if (waddr < 32) {
                if (is_a) {
                        add_write_dep(state, &state->last_ra[waddr], n);
                } else {
                        add_write_dep(state, &state->last_rb[waddr], n);
                }
        } else if (is_tmu_write(waddr)) {
                add_write_dep(state, &state->last_tmu_write, n);
                add_read_dep(state, state->last_uniforms_reset, n);
        } else if (qpu_waddr_is_tlb(waddr) ||
                   waddr == QPU_W_MS_FLAGS) {
                add_write_dep(state, &state->last_tlb, n);
        } else {
                switch (waddr) {
                case QPU_W_ACC0:
                case QPU_W_ACC1:
                case QPU_W_ACC2:
                case QPU_W_ACC3:
                case QPU_W_ACC5:
                        add_write_dep(state, &state->last_r[waddr - QPU_W_ACC0],
                                      n);
                        break;

                case QPU_W_VPM:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case QPU_W_VPMVCD_SETUP:
                        if (is_a)
                                add_write_dep(state, &state->last_vpm_read, n);
                        else
                                add_write_dep(state, &state->last_vpm, n);
                        break;

                case QPU_W_SFU_RECIP:
                case QPU_W_SFU_RECIPSQRT:
                case QPU_W_SFU_EXP:
                case QPU_W_SFU_LOG:
                        add_write_dep(state, &state->last_r[4], n);
                        break;

                case QPU_W_TLB_STENCIL_SETUP:
                        /* This isn't a TLB operation that does things like
                         * implicitly lock the scoreboard, but it does have to
                         * appear before TLB_Z, and each of the TLB_STENCILs
                         * has to schedule in the same order relative to each
                         * other.
                         */
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case QPU_W_MS_FLAGS:
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case QPU_W_UNIFORMS_ADDRESS:
                        add_write_dep(state, &state->last_uniforms_reset, n);
                        break;

                case QPU_W_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}

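/* Conditions other than ALWAYS/NEVER consume the flags, so order the
 * instruction after the most recent flags write.
 */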
static void
process_cond_deps(struct schedule_state *state, struct schedule_node *n,
                  uint32_t cond)
{
        switch (cond) {
        case QPU_COND_NEVER:
        case QPU_COND_ALWAYS:
                break;
        default:
                add_read_dep(state, state->last_sf, n);
                break;
        }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        uint64_t inst = n->inst->inst;
        uint32_t add_op = QPU_GET_FIELD(inst, QPU_OP_ADD);
        uint32_t mul_op = QPU_GET_FIELD(inst, QPU_OP_MUL);
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
        uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
        uint32_t mul_a = QPU_GET_FIELD(inst, QPU_MUL_A);
        uint32_t mul_b = QPU_GET_FIELD(inst, QPU_MUL_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        if (sig != QPU_SIG_LOAD_IMM) {
                process_raddr_deps(state, n, raddr_a, true);
                if (sig != QPU_SIG_SMALL_IMM &&
                    sig != QPU_SIG_BRANCH)
                        process_raddr_deps(state, n, raddr_b, false);
        }

        if (add_op != QPU_A_NOP) {
                process_mux_deps(state, n, add_a);
                process_mux_deps(state, n, add_b);
        }
        if (mul_op != QPU_M_NOP) {
                process_mux_deps(state, n, mul_a);
                process_mux_deps(state, n, mul_b);
        }

        process_waddr_deps(state, n, waddr_add, true);
        process_waddr_deps(state, n, waddr_mul, false);
        if (qpu_writes_r4(inst))
                add_write_dep(state, &state->last_r[4], n);

        switch (sig) {
        case QPU_SIG_SW_BREAKPOINT:
        case QPU_SIG_NONE:
        case QPU_SIG_SMALL_IMM:
        case QPU_SIG_LOAD_IMM:
                break;

        case QPU_SIG_THREAD_SWITCH:
        case QPU_SIG_LAST_THREAD_SWITCH:
                /* All accumulator contents and flags are undefined after the
                 * switch.
                 */
                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
                        add_write_dep(state, &state->last_r[i], n);
                add_write_dep(state, &state->last_sf, n);

                /* Scoreboard-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(state, &state->last_tlb, n);

                add_write_dep(state, &state->last_tmu_write, n);
                break;

        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_write, n);
                break;

        case QPU_SIG_COLOR_LOAD:
                add_read_dep(state, state->last_tlb, n);
                break;

        case QPU_SIG_BRANCH:
                add_read_dep(state, state->last_sf, n);
                break;

        case QPU_SIG_PROG_END:
        case QPU_SIG_WAIT_FOR_SCOREBOARD:
        case QPU_SIG_SCOREBOARD_UNLOCK:
        case QPU_SIG_COVERAGE_LOAD:
        case QPU_SIG_COLOR_LOAD_END:
        case QPU_SIG_ALPHA_MASK_LOAD:
                fprintf(stderr, "Unhandled signal bits %d\n", sig);
                abort();
        }

        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD));
        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL));
        if ((inst & QPU_SF) && sig != QPU_SIG_BRANCH)
                add_write_dep(state, &state->last_sf, n);
}

static void
calculate_forward_deps(struct vc4_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.dir = F;

        list_for_each_entry(struct schedule_node, node, schedule_list, link)
                calculate_deps(&state, node);
}

static void
calculate_reverse_deps(struct vc4_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.dir = R;

        list_for_each_entry_rev(struct schedule_node, node, schedule_list,
                                link) {
                calculate_deps(&state, (struct schedule_node *)node);
        }
}

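/* Per-tick emission state: which regfile addresses the previous instruction
 * wrote, when the SFU and the uniforms address were last written, and
 * whether a TLB (scoreboard-locking) instruction has been emitted yet.
 */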
struct choose_scoreboard {
        struct dag *dag;
        int tick;
        int last_sfu_write_tick;
        int last_uniforms_reset_tick;
        uint32_t last_waddr_a, last_waddr_b;
        bool tlb_locked;
};

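/* Returns whether the instruction would read a physical regfile address
 * written by the previous instruction, read r4 too soon after an SFU write,
 * or read a uniform too soon after a uniforms-address reset -- cases that
 * need filler instructions in between.
 */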
static bool
reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        /* Full immediate loads don't read any registers. */
        if (sig == QPU_SIG_LOAD_IMM)
                return false;

        uint32_t src_muxes[] = {
                QPU_GET_FIELD(inst, QPU_ADD_A),
                QPU_GET_FIELD(inst, QPU_ADD_B),
                QPU_GET_FIELD(inst, QPU_MUL_A),
                QPU_GET_FIELD(inst, QPU_MUL_B),
        };
        for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                if ((src_muxes[i] == QPU_MUX_A &&
                     raddr_a < 32 &&
                     scoreboard->last_waddr_a == raddr_a) ||
                    (src_muxes[i] == QPU_MUX_B &&
                     sig != QPU_SIG_SMALL_IMM &&
                     raddr_b < 32 &&
                     scoreboard->last_waddr_b == raddr_b)) {
                        return true;
                }

                if (src_muxes[i] == QPU_MUX_R4) {
                        if (scoreboard->tick -
                            scoreboard->last_sfu_write_tick <= 2) {
                                return true;
                        }
                }
        }

        if (sig == QPU_SIG_SMALL_IMM &&
            QPU_GET_FIELD(inst, QPU_SMALL_IMM) >= QPU_SMALL_IMM_MUL_ROT) {
                uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
                uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);

                if (scoreboard->last_waddr_a == mux_a + QPU_W_ACC0 ||
                    scoreboard->last_waddr_a == mux_b + QPU_W_ACC0 ||
                    scoreboard->last_waddr_b == mux_a + QPU_W_ACC0 ||
                    scoreboard->last_waddr_b == mux_b + QPU_W_ACC0) {
                        return true;
                }
        }

        if (reads_uniform(inst) &&
            scoreboard->tick - scoreboard->last_uniforms_reset_tick <= 2) {
                return true;
        }

        return false;
}

static bool
pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        return (scoreboard->tick < 2 && qpu_inst_is_tlb(inst));
}

static int
get_instruction_priority(uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
        uint32_t baseline_score;
        uint32_t next_score = 0;

        /* Schedule TLB operations as late as possible, to get more
         * parallelism between shaders.
         */
        if (qpu_inst_is_tlb(inst))
                return next_score;
        next_score++;

        /* Schedule texture read results collection late to hide latency. */
        if (sig == QPU_SIG_LOAD_TMU0 || sig == QPU_SIG_LOAD_TMU1)
                return next_score;
        next_score++;

        /* Default score for things that aren't otherwise special. */
        baseline_score = next_score;
        next_score++;

        /* Schedule texture read setup early to hide their latency better. */
        if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul))
                return next_score;
        next_score++;

        return baseline_score;
}

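/* Picks the best DAG head to schedule next (or to pair with prev_inst),
 * skipping anything that would violate the scheduling restrictions, and
 * preferring higher-priority instructions and then longer critical paths.
 */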
static struct schedule_node *
choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
                               struct list_head *schedule_list,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
         * will handle pairing it along with filling the delay slots.
         */
        if (prev_inst) {
                uint32_t prev_sig = QPU_GET_FIELD(prev_inst->inst->inst,
                                                  QPU_SIG);
                if (prev_sig == QPU_SIG_THREAD_SWITCH ||
                    prev_sig == QPU_SIG_LAST_THREAD_SWITCH) {
                        return NULL;
                }
        }

        list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                            dag.link) {
                uint64_t inst = n->inst->inst;
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                /* Don't choose the branch instruction until it's the last one
                 * left. XXX: We could potentially choose it before it's the
                 * last one, if the remaining instructions fit in the delay
                 * slots.
                 */
                if (sig == QPU_SIG_BRANCH &&
                    !list_is_singular(&scoreboard->dag->heads)) {
                        continue;
                }

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, inst))
                        continue;

                /* "A scoreboard wait must not occur in the first two
                 *  instructions of a fragment shader. This is either the
                 *  explicit Wait for Scoreboard signal or an implicit wait
                 *  with the first tile-buffer read or write instruction."
                 */
                if (pixel_scoreboard_too_soon(scoreboard, inst))
                        continue;

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        /* Don't pair up a thread switch signal -- we'll
                         * handle pairing it when we pick it on its own.
                         */
                        if (sig == QPU_SIG_THREAD_SWITCH ||
                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
                                continue;
                        }

                        if (prev_inst->uniform != -1 && n->uniform != -1)
                                continue;

                        /* Don't merge in something that will lock the TLB.
                         * Hopefully what we have in inst will release some
                         * other instructions, allowing us to delay the
                         * TLB-locking instruction until later.
                         */
                        if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
                                continue;

                        inst = qpu_merge_inst(prev_inst->inst->inst, inst);
                        if (!inst)
                                continue;
                }

                int prio = get_instruction_priority(inst);

                /* Found a valid instruction. If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        return chosen;
}

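/* Records the effects of the chosen instruction on the scoreboard: which
 * regfile addresses it wrote (accounting for the WS swap), whether it wrote
 * the SFU or the uniforms address this tick, and whether it locked the TLB.
 */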
static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                             uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

        if (!(inst & QPU_WS)) {
                scoreboard->last_waddr_a = waddr_add;
                scoreboard->last_waddr_b = waddr_mul;
        } else {
                scoreboard->last_waddr_b = waddr_add;
                scoreboard->last_waddr_a = waddr_mul;
        }

        if ((waddr_add >= QPU_W_SFU_RECIP && waddr_add <= QPU_W_SFU_LOG) ||
            (waddr_mul >= QPU_W_SFU_RECIP && waddr_mul <= QPU_W_SFU_LOG)) {
                scoreboard->last_sfu_write_tick = scoreboard->tick;
        }

        if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
            waddr_mul == QPU_W_UNIFORMS_ADDRESS) {
                scoreboard->last_uniforms_reset_tick = scoreboard->tick;
        }

        if (qpu_inst_is_tlb(inst))
                scoreboard->tlb_locked = true;
}

static void
dump_state(struct dag *dag)
{
        list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
                fprintf(stderr, " t=%4d: ", n->unblocked_time);
                vc4_qpu_disasm(&n->inst->inst, 1);
                fprintf(stderr, "\n");

                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                        struct schedule_node *child =
                                (struct schedule_node *)edge->child;
                        if (!child)
                                continue;

                        fprintf(stderr, " - ");
                        vc4_qpu_disasm(&child->inst->inst, 1);
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->dag.parent_count,
                                edge->data ? 'w' : 'r');
                }
        }
}

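/* Estimated cycles between writing waddr and the dependent instruction
 * "after" being able to use the result.
 */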
static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
{
        if (waddr < 32)
                return 2;

        /* Apply some huge latency between texture fetch requests and getting
         * their results back.
         *
         * FIXME: This is actually pretty bogus. If we do:
         *
         * mov tmu0_s, a
         * <a bit of math>
         * mov tmu0_s, b
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * we count that as worse than
         *
         * mov tmu0_s, a
         * mov tmu0_s, b
         * <lots of math>
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * because we associate the first load_tmu0 with the *second* tmu0_s.
         */
        if (waddr == QPU_W_TMU0_S) {
                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0)
                        return 100;
        }
        if (waddr == QPU_W_TMU1_S) {
                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1)
                        return 100;
        }

        switch (waddr) {
        case QPU_W_SFU_RECIP:
        case QPU_W_SFU_RECIPSQRT:
        case QPU_W_SFU_EXP:
        case QPU_W_SFU_LOG:
                return 3;
        default:
                return 1;
        }
}

static uint32_t
instruction_latency(struct schedule_node *before, struct schedule_node *after)
{
        uint64_t before_inst = before->inst->inst;
        uint64_t after_inst = after->inst->inst;

        return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD),
                                  after_inst),
                    waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL),
                                  after_inst));
}

/** Recursive computation of the delay member of a node. */
static void
compute_delay(struct dag_node *node, void *state)
{
        struct schedule_node *n = (struct schedule_node *)node;

        n->delay = 1;

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;
                n->delay = MAX2(n->delay, (child->delay +
                                           instruction_latency(n, child)));
        }
}

/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head()
 * should be called on it later to finish pruning the other edges).
 */
static void
pre_remove_head(struct dag *dag, struct schedule_node *n)
{
        list_delinit(&n->dag.link);

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                if (edge->data)
                        dag_remove_edge(dag, edge);
        }
}

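/* Accounts for a node having been scheduled at "time": raises each child's
 * unblocked_time to at least time plus the edge latency, then prunes the
 * node from the DAG so that newly unblocked children become heads.
 */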
static void
mark_instruction_scheduled(struct dag *dag,
                           uint32_t time,
                           struct schedule_node *node)
{
        if (!node)
                return;

        util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;

                if (!child)
                        continue;

                uint32_t latency = instruction_latency(node, child);

                child->unblocked_time = MAX2(child->unblocked_time,
                                             time + latency);
        }
        dag_prune_head(dag, &node->dag);
}

/**
 * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair
 * with another instruction.
 */
static void
emit_thrsw(struct vc4_compile *c,
           struct choose_scoreboard *scoreboard,
           uint64_t inst)
{
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        /* There should be nothing in a thrsw inst being scheduled other than
         * the signal bits.
         */
        assert(QPU_GET_FIELD(inst, QPU_OP_ADD) == QPU_A_NOP);
        assert(QPU_GET_FIELD(inst, QPU_OP_MUL) == QPU_M_NOP);

        /* Try to find an earlier scheduled instruction that we can merge the
         * thrsw into.
         */
        int thrsw_ip = c->qpu_inst_count;
        for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) {
                uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i];
                uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG);

                if (prev_sig == QPU_SIG_NONE)
                        thrsw_ip = c->qpu_inst_count - i;
        }

        if (thrsw_ip != c->qpu_inst_count) {
                /* Merge the thrsw into the existing instruction. */
                c->qpu_insts[thrsw_ip] =
                        QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG);
        } else {
                qpu_serialize_one_inst(c, inst);
                update_scoreboard_for_chosen(scoreboard, inst);
        }

        /* Fill the delay slots. */
        while (c->qpu_inst_count < thrsw_ip + 3) {
                update_scoreboard_for_chosen(scoreboard, qpu_NOP());
                qpu_serialize_one_inst(c, qpu_NOP());
        }
}

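/* Main scheduling loop for one block: repeatedly picks a ready DAG head (or
 * a NOP when nothing is ready), tries to pair a second instruction with it,
 * re-emits any uniform reads in their new order, and fills thread-switch and
 * branch delay slots.
 */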
static uint32_t
schedule_instructions(struct vc4_compile *c,
                      struct choose_scoreboard *scoreboard,
                      struct qblock *block,
                      struct list_head *schedule_list,
                      enum quniform_contents *orig_uniform_contents,
                      uint32_t *orig_uniform_data,
                      uint32_t *next_uniform)
{
        uint32_t time = 0;

        while (!list_is_empty(&scoreboard->dag->heads)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(scoreboard,
                                                       schedule_list,
                                                       NULL);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                uint64_t inst = chosen ? chosen->inst->inst : qpu_NOP();

                if (debug) {
                        fprintf(stderr, "t=%4d: current list:\n",
                                time);
                        dump_state(scoreboard->dag);
                        fprintf(stderr, "t=%4d: chose: ", time);
                        vc4_qpu_disasm(&inst, 1);
                        fprintf(stderr, "\n");
                }

                /* Schedule this instruction onto the QPU list. Also try to
                 * find an instruction to pair with it.
                 */
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
                        pre_remove_head(scoreboard->dag, chosen);
                        if (chosen->uniform != -1) {
                                c->uniform_data[*next_uniform] =
                                        orig_uniform_data[chosen->uniform];
                                c->uniform_contents[*next_uniform] =
                                        orig_uniform_contents[chosen->uniform];
                                (*next_uniform)++;
                        }

                        merge = choose_instruction_to_schedule(scoreboard,
                                                               schedule_list,
                                                               chosen);
                        if (merge) {
                                time = MAX2(merge->unblocked_time, time);
                                inst = qpu_merge_inst(inst, merge->inst->inst);
                                assert(inst != 0);
                                if (merge->uniform != -1) {
                                        c->uniform_data[*next_uniform] =
                                                orig_uniform_data[merge->uniform];
                                        c->uniform_contents[*next_uniform] =
                                                orig_uniform_contents[merge->uniform];
                                        (*next_uniform)++;
                                }

                                if (debug) {
                                        fprintf(stderr, "t=%4d: merging: ",
                                                time);
                                        vc4_qpu_disasm(&merge->inst->inst, 1);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, " resulting in: ");
                                        vc4_qpu_disasm(&inst, 1);
                                        fprintf(stderr, "\n");
                                }
                        }
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready to
                 * be scheduled. Update the children's unblocked time for this
                 * DAG edge as we do so.
                 */
                mark_instruction_scheduled(scoreboard->dag, time, chosen);
                mark_instruction_scheduled(scoreboard->dag, time, merge);

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH ||
                    QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) {
                        emit_thrsw(c, scoreboard, inst);
                } else {
                        qpu_serialize_one_inst(c, inst);
                        update_scoreboard_for_chosen(scoreboard, inst);
                }

                scoreboard->tick++;
                time++;

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) {
                        block->branch_qpu_ip = c->qpu_inst_count - 1;
                        /* Fill the delay slots.
                         *
                         * We should fill these with actual instructions,
                         * instead, but that will probably need to be done
                         * after this, once we know what the leading
                         * instructions of the successors are (so we can
                         * handle A/B register file write latency)
                         */
                        inst = qpu_NOP();
                        update_scoreboard_for_chosen(scoreboard, inst);
                        qpu_serialize_one_inst(c, inst);
                        qpu_serialize_one_inst(c, inst);
                        qpu_serialize_one_inst(c, inst);
                }
        }

        return time;
}

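/* Schedules one block: wraps its queued instructions in schedule_nodes,
 * builds the dependency DAG in both directions, computes each node's
 * critical-path delay, and runs the list scheduler over the result.
 */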
static uint32_t
qpu_schedule_instructions_block(struct vc4_compile *c,
                                struct choose_scoreboard *scoreboard,
                                struct qblock *block,
                                enum quniform_contents *orig_uniform_contents,
                                uint32_t *orig_uniform_data,
                                uint32_t *next_uniform)
{
        scoreboard->dag = dag_create(NULL);
        struct list_head setup_list;

        list_inithead(&setup_list);

        /* Wrap each instruction in a scheduler structure. */
        uint32_t next_sched_uniform = *next_uniform;
        while (!list_is_empty(&block->qpu_inst_list)) {
                struct queued_qpu_inst *inst =
                        (struct queued_qpu_inst *)block->qpu_inst_list.next;
                struct schedule_node *n = rzalloc(scoreboard->dag,
                                                  struct schedule_node);

                dag_init_node(scoreboard->dag, &n->dag);
                n->inst = inst;

                if (reads_uniform(inst->inst)) {
                        n->uniform = next_sched_uniform++;
                } else {
                        n->uniform = -1;
                }
                list_del(&inst->link);
                list_addtail(&n->link, &setup_list);
        }

        calculate_forward_deps(c, scoreboard->dag, &setup_list);
        calculate_reverse_deps(c, scoreboard->dag, &setup_list);

        dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL);

        uint32_t cycles = schedule_instructions(c, scoreboard, block,
                                                &setup_list,
                                                orig_uniform_contents,
                                                orig_uniform_data,
                                                next_uniform);

        ralloc_free(scoreboard->dag);
        scoreboard->dag = NULL;

        return cycles;
}

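/* Patches the relative branch target into each block's branch instruction
 * now that every block's start/end QPU IP is known, and sanity-checks that
 * fall-through successors start right after the branch's delay slots.
 */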
static void
qpu_set_branch_targets(struct vc4_compile *c)
{
        qir_for_each_block(block, c) {
                /* The end block of the program has no branch. */
                if (!block->successors[0])
                        continue;

                /* If there was no branch instruction, then the successor
                 * block must follow immediately after this one.
                 */
                if (block->branch_qpu_ip == ~0) {
                        assert(block->end_qpu_ip + 1 ==
                               block->successors[0]->start_qpu_ip);
                        continue;
                }

                /* Set the branch target for the block that doesn't follow
                 * immediately after ours.
                 */
                uint64_t *branch_inst = &c->qpu_insts[block->branch_qpu_ip];
                assert(QPU_GET_FIELD(*branch_inst, QPU_SIG) == QPU_SIG_BRANCH);
                assert(QPU_GET_FIELD(*branch_inst, QPU_BRANCH_TARGET) == 0);

                uint32_t branch_target =
                        (block->successors[0]->start_qpu_ip -
                         (block->branch_qpu_ip + 4)) * sizeof(uint64_t);
                *branch_inst = (*branch_inst |
                                QPU_SET_FIELD(branch_target, QPU_BRANCH_TARGET));

                /* Make sure that the if-we-don't-jump successor was scheduled
                 * just after the delay slots.
                 */
                if (block->successors[1]) {
                        assert(block->successors[1]->start_qpu_ip ==
                               block->branch_qpu_ip + 4);
                }
        }
}

uint32_t
qpu_schedule_instructions(struct vc4_compile *c)
{
        /* We reorder the uniforms as we schedule instructions, so save the
         * old data off and replace it.
         */
        uint32_t *uniform_data = c->uniform_data;
        enum quniform_contents *uniform_contents = c->uniform_contents;
        c->uniform_contents = ralloc_array(c, enum quniform_contents,
                                           c->num_uniforms);
        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
        c->uniform_array_size = c->num_uniforms;
        uint32_t next_uniform = 0;

        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
        scoreboard.last_waddr_a = ~0;
        scoreboard.last_waddr_b = ~0;
        scoreboard.last_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                qir_for_each_block(block, c) {
                        fprintf(stderr, "BLOCK %d\n", block->index);
                        list_for_each_entry(struct queued_qpu_inst, q,
                                            &block->qpu_inst_list, link) {
                                vc4_qpu_disasm(&q->inst, 1);
                                fprintf(stderr, "\n");
                        }
                }
                fprintf(stderr, "\n");
        }

        uint32_t cycles = 0;
        qir_for_each_block(block, c) {
                block->start_qpu_ip = c->qpu_inst_count;
                block->branch_qpu_ip = ~0;

                cycles += qpu_schedule_instructions_block(c,
                                                          &scoreboard,
                                                          block,
                                                          uniform_contents,
                                                          uniform_data,
                                                          &next_uniform);

                block->end_qpu_ip = c->qpu_inst_count - 1;
        }

        qpu_set_branch_targets(c);

        assert(next_uniform == c->num_uniforms);

        if (debug) {
                fprintf(stderr, "Post-schedule instructions\n");
                vc4_qpu_disasm(c->qpu_insts, c->qpu_inst_count);
                fprintf(stderr, "\n");
        }

        return cycles;
}