GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/broadcom/compiler/qpu_schedule.c
1
/*
2
* Copyright © 2010 Intel Corporation
3
* Copyright © 2014-2017 Broadcom
4
*
5
* Permission is hereby granted, free of charge, to any person obtaining a
6
* copy of this software and associated documentation files (the "Software"),
7
* to deal in the Software without restriction, including without limitation
8
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
* and/or sell copies of the Software, and to permit persons to whom the
10
* Software is furnished to do so, subject to the following conditions:
11
*
12
* The above copyright notice and this permission notice (including the next
13
* paragraph) shall be included in all copies or substantial portions of the
14
* Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22
* IN THE SOFTWARE.
23
*/
24
25
/**
26
* @file
27
*
28
* The basic model of the list scheduler is to take a basic block, compute a
29
* DAG of the dependencies, and make a list of the DAG heads. Heuristically
30
* pick a DAG head, then put all the children that are now DAG heads into the
31
* list of things to schedule.
32
*
33
* The goal of scheduling here is to pack pairs of operations together in a
34
* single QPU instruction.
35
*/
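/* Roughly, the flow implemented by the helpers below is (a simplified sketch,
 * not literal code):
 *
 *      calculate_forward_deps() / calculate_reverse_deps()  -> build the DAG
 *      while DAG heads remain:
 *              n = choose_instruction_to_schedule(...)
 *              try to pair a second head with n via qpu_merge_inst()
 *              mark_instruction_scheduled(...)  -> unblocks its children
 */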
36
37
#include "qpu/qpu_disasm.h"
38
#include "v3d_compiler.h"
39
#include "util/ralloc.h"
40
#include "util/dag.h"
41
42
static bool debug;
43
44
struct schedule_node_child;
45
46
struct schedule_node {
47
struct dag_node dag;
48
struct list_head link;
49
struct qinst *inst;
50
51
/* Longest cycles + instruction_latency() of any parent of this node. */
52
uint32_t unblocked_time;
53
54
/**
55
* Minimum number of cycles from scheduling this instruction until the
56
* end of the program, based on the slowest dependency chain through
57
* the children.
58
*/
59
uint32_t delay;
60
61
/**
62
* Cycles between this instruction being scheduled and when its result
63
* can be consumed.
64
*/
65
uint32_t latency;
66
};
67
68
/* When walking the instructions in reverse, we need to swap before/after in
69
* add_dep().
70
*/
71
enum direction { F, R };
72
73
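/* Dependency-tracking state for one pass over a block: each last_* field
 * remembers the most recent instruction to touch that register or peripheral,
 * so calculate_deps() can add the corresponding DAG edges.
 */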
struct schedule_state {
74
const struct v3d_device_info *devinfo;
75
struct dag *dag;
76
struct schedule_node *last_r[6];
77
struct schedule_node *last_rf[64];
78
struct schedule_node *last_sf;
79
struct schedule_node *last_vpm_read;
80
struct schedule_node *last_tmu_write;
81
struct schedule_node *last_tmu_config;
82
struct schedule_node *last_tmu_read;
83
struct schedule_node *last_tlb;
84
struct schedule_node *last_vpm;
85
struct schedule_node *last_unif;
86
struct schedule_node *last_rtop;
87
struct schedule_node *last_unifa;
88
enum direction dir;
89
/* Estimated cycle when the current instruction would start. */
90
uint32_t time;
91
};
92
93
static void
94
add_dep(struct schedule_state *state,
95
struct schedule_node *before,
96
struct schedule_node *after,
97
bool write)
98
{
99
bool write_after_read = !write && state->dir == R;
100
void *edge_data = (void *)(uintptr_t)write_after_read;
101
102
if (!before || !after)
103
return;
104
105
assert(before != after);
106
107
if (state->dir == F)
108
dag_add_edge(&before->dag, &after->dag, edge_data);
109
else
110
dag_add_edge(&after->dag, &before->dag, edge_data);
111
}
112
113
static void
114
add_read_dep(struct schedule_state *state,
115
struct schedule_node *before,
116
struct schedule_node *after)
117
{
118
add_dep(state, before, after, false);
119
}
120
121
static void
122
add_write_dep(struct schedule_state *state,
123
struct schedule_node **before,
124
struct schedule_node *after)
125
{
126
add_dep(state, *before, after, true);
127
*before = after;
128
}
129
130
static bool
131
qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
132
{
133
if (inst->sig.ldtlb || inst->sig.ldtlbu)
134
return true;
135
136
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
137
return false;
138
139
if (inst->alu.add.magic_write &&
140
(inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
141
inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
142
return true;
143
144
if (inst->alu.mul.magic_write &&
145
(inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
146
inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
147
return true;
148
149
return false;
150
}
151
152
static void
153
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
154
enum v3d_qpu_mux mux)
155
{
156
switch (mux) {
157
case V3D_QPU_MUX_A:
158
add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
159
break;
160
case V3D_QPU_MUX_B:
161
if (!n->inst->qpu.sig.small_imm) {
162
add_read_dep(state,
163
state->last_rf[n->inst->qpu.raddr_b], n);
164
}
165
break;
166
default:
167
add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
168
break;
169
}
170
}
171
172
static bool
173
tmu_write_is_sequence_terminator(uint32_t waddr)
174
{
175
switch (waddr) {
176
case V3D_QPU_WADDR_TMUS:
177
case V3D_QPU_WADDR_TMUSCM:
178
case V3D_QPU_WADDR_TMUSF:
179
case V3D_QPU_WADDR_TMUSLOD:
180
case V3D_QPU_WADDR_TMUA:
181
case V3D_QPU_WADDR_TMUAU:
182
return true;
183
default:
184
return false;
185
}
186
}
187
188
static bool
189
can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
190
{
191
if (devinfo->ver < 40)
192
return false;
193
194
if (tmu_write_is_sequence_terminator(waddr))
195
return false;
196
197
if (waddr == V3D_QPU_WADDR_TMUD)
198
return false;
199
200
return true;
201
}
202
203
static void
204
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
205
uint32_t waddr, bool magic)
206
{
207
if (!magic) {
208
add_write_dep(state, &state->last_rf[waddr], n);
209
} else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) {
210
if (can_reorder_tmu_write(state->devinfo, waddr))
211
add_read_dep(state, state->last_tmu_write, n);
212
else
213
add_write_dep(state, &state->last_tmu_write, n);
214
215
if (tmu_write_is_sequence_terminator(waddr))
216
add_write_dep(state, &state->last_tmu_config, n);
217
} else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
218
/* Handled by v3d_qpu_writes_r4() check. */
219
} else {
220
switch (waddr) {
221
case V3D_QPU_WADDR_R0:
222
case V3D_QPU_WADDR_R1:
223
case V3D_QPU_WADDR_R2:
224
add_write_dep(state,
225
&state->last_r[waddr - V3D_QPU_WADDR_R0],
226
n);
227
break;
228
case V3D_QPU_WADDR_R3:
229
case V3D_QPU_WADDR_R4:
230
case V3D_QPU_WADDR_R5:
231
/* Handled by v3d_qpu_writes_r*() checks below. */
232
break;
233
234
case V3D_QPU_WADDR_VPM:
235
case V3D_QPU_WADDR_VPMU:
236
add_write_dep(state, &state->last_vpm, n);
237
break;
238
239
case V3D_QPU_WADDR_TLB:
240
case V3D_QPU_WADDR_TLBU:
241
add_write_dep(state, &state->last_tlb, n);
242
break;
243
244
case V3D_QPU_WADDR_SYNC:
245
case V3D_QPU_WADDR_SYNCB:
246
case V3D_QPU_WADDR_SYNCU:
247
/* For CS barrier(): Sync against any other memory
248
* accesses. There doesn't appear to be any need for
249
* barriers to affect ALU operations.
250
*/
251
add_write_dep(state, &state->last_tmu_write, n);
252
add_write_dep(state, &state->last_tmu_read, n);
253
break;
254
255
case V3D_QPU_WADDR_UNIFA:
256
if (state->devinfo->ver >= 40)
257
add_write_dep(state, &state->last_unifa, n);
258
break;
259
260
case V3D_QPU_WADDR_NOP:
261
break;
262
263
default:
264
fprintf(stderr, "Unknown waddr %d\n", waddr);
265
abort();
266
}
267
}
268
}
269
270
/**
271
* Common code for dependencies that need to be tracked both forward and
272
* backward.
273
*
274
* This is for things like "all reads of r4 have to happen between the r4
275
* writes that surround them".
276
*/
277
static void
278
calculate_deps(struct schedule_state *state, struct schedule_node *n)
279
{
280
const struct v3d_device_info *devinfo = state->devinfo;
281
struct qinst *qinst = n->inst;
282
struct v3d_qpu_instr *inst = &qinst->qpu;
283
/* If the input and output segments are shared, then all VPM reads to
284
* a location need to happen before all writes. We handle this by
285
* serializing all VPM operations for now.
286
*/
287
bool separate_vpm_segment = false;
288
289
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
290
if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
291
add_read_dep(state, state->last_sf, n);
292
293
/* XXX: BDI */
294
/* XXX: BDU */
295
/* XXX: ub */
296
/* XXX: raddr_a */
297
298
add_write_dep(state, &state->last_unif, n);
299
return;
300
}
301
302
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
303
304
/* XXX: LOAD_IMM */
305
306
if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
307
process_mux_deps(state, n, inst->alu.add.a);
308
if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
309
process_mux_deps(state, n, inst->alu.add.b);
310
311
if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
312
process_mux_deps(state, n, inst->alu.mul.a);
313
if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
314
process_mux_deps(state, n, inst->alu.mul.b);
315
316
switch (inst->alu.add.op) {
317
case V3D_QPU_A_VPMSETUP:
318
/* Could distinguish read/write by unpacking the uniform. */
319
add_write_dep(state, &state->last_vpm, n);
320
add_write_dep(state, &state->last_vpm_read, n);
321
break;
322
323
case V3D_QPU_A_STVPMV:
324
case V3D_QPU_A_STVPMD:
325
case V3D_QPU_A_STVPMP:
326
add_write_dep(state, &state->last_vpm, n);
327
break;
328
329
case V3D_QPU_A_LDVPMV_IN:
330
case V3D_QPU_A_LDVPMD_IN:
331
case V3D_QPU_A_LDVPMG_IN:
332
case V3D_QPU_A_LDVPMP:
333
if (!separate_vpm_segment)
334
add_write_dep(state, &state->last_vpm, n);
335
break;
336
337
case V3D_QPU_A_VPMWT:
338
add_read_dep(state, state->last_vpm, n);
339
break;
340
341
case V3D_QPU_A_MSF:
342
add_read_dep(state, state->last_tlb, n);
343
break;
344
345
case V3D_QPU_A_SETMSF:
346
case V3D_QPU_A_SETREVF:
347
add_write_dep(state, &state->last_tlb, n);
348
break;
349
350
default:
351
break;
352
}
353
354
switch (inst->alu.mul.op) {
355
case V3D_QPU_M_MULTOP:
356
case V3D_QPU_M_UMUL24:
357
/* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
358
* resets it to 0. We could possibly reorder umul24s relative
359
* to each other, but for now just keep all the MUL parts in
360
* order.
361
*/
362
add_write_dep(state, &state->last_rtop, n);
363
break;
364
default:
365
break;
366
}
367
368
if (inst->alu.add.op != V3D_QPU_A_NOP) {
369
process_waddr_deps(state, n, inst->alu.add.waddr,
370
inst->alu.add.magic_write);
371
}
372
if (inst->alu.mul.op != V3D_QPU_M_NOP) {
373
process_waddr_deps(state, n, inst->alu.mul.waddr,
374
inst->alu.mul.magic_write);
375
}
376
if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
377
process_waddr_deps(state, n, inst->sig_addr,
378
inst->sig_magic);
379
}
380
381
if (v3d_qpu_writes_r3(devinfo, inst))
382
add_write_dep(state, &state->last_r[3], n);
383
if (v3d_qpu_writes_r4(devinfo, inst))
384
add_write_dep(state, &state->last_r[4], n);
385
if (v3d_qpu_writes_r5(devinfo, inst))
386
add_write_dep(state, &state->last_r[5], n);
387
388
/* If we add any more dependencies here we should consider whether we
389
* also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
390
*/
391
if (inst->sig.thrsw) {
392
/* All accumulator contents and flags are undefined after the
393
* switch.
394
*/
395
for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
396
add_write_dep(state, &state->last_r[i], n);
397
add_write_dep(state, &state->last_sf, n);
398
add_write_dep(state, &state->last_rtop, n);
399
400
/* Scoreboard-locking operations have to stay after the last
401
* thread switch.
402
*/
403
add_write_dep(state, &state->last_tlb, n);
404
405
add_write_dep(state, &state->last_tmu_write, n);
406
add_write_dep(state, &state->last_tmu_config, n);
407
}
408
409
if (v3d_qpu_waits_on_tmu(inst)) {
410
/* TMU loads are coming from a FIFO, so ordering is important.
411
*/
412
add_write_dep(state, &state->last_tmu_read, n);
413
/* Keep TMU loads after their TMU lookup terminator */
414
add_read_dep(state, state->last_tmu_config, n);
415
}
416
417
/* Allow wrtmuc to be reordered with other instructions in the
418
* same TMU sequence by using a read dependency on the last TMU
419
* sequence terminator.
420
*/
421
if (inst->sig.wrtmuc)
422
add_read_dep(state, state->last_tmu_config, n);
423
424
if (inst->sig.ldtlb | inst->sig.ldtlbu)
425
add_write_dep(state, &state->last_tlb, n);
426
427
if (inst->sig.ldvpm) {
428
add_write_dep(state, &state->last_vpm_read, n);
429
430
/* At least for now, we're doing shared I/O segments, so queue
431
* all writes after all reads.
432
*/
433
if (!separate_vpm_segment)
434
add_write_dep(state, &state->last_vpm, n);
435
}
436
437
/* inst->sig.ldunif or sideband uniform read */
438
if (vir_has_uniform(qinst))
439
add_write_dep(state, &state->last_unif, n);
440
441
/* Both unifa and ldunifa must preserve ordering */
442
if (inst->sig.ldunifa || inst->sig.ldunifarf)
443
add_write_dep(state, &state->last_unifa, n);
444
445
if (v3d_qpu_reads_flags(inst))
446
add_read_dep(state, state->last_sf, n);
447
if (v3d_qpu_writes_flags(inst))
448
add_write_dep(state, &state->last_sf, n);
449
}
450
451
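/* The DAG is built with two passes over the block: a forward walk (dir == F)
 * and a reverse walk (dir == R), so that write-after-read ordering is
 * captured as well as read-after-write and write-after-write.
 */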
static void
452
calculate_forward_deps(struct v3d_compile *c, struct dag *dag,
453
struct list_head *schedule_list)
454
{
455
struct schedule_state state;
456
457
memset(&state, 0, sizeof(state));
458
state.dag = dag;
459
state.devinfo = c->devinfo;
460
state.dir = F;
461
462
list_for_each_entry(struct schedule_node, node, schedule_list, link)
463
calculate_deps(&state, node);
464
}
465
466
static void
467
calculate_reverse_deps(struct v3d_compile *c, struct dag *dag,
468
struct list_head *schedule_list)
469
{
470
struct schedule_state state;
471
472
memset(&state, 0, sizeof(state));
473
state.dag = dag;
474
state.devinfo = c->devinfo;
475
state.dir = R;
476
477
list_for_each_entry_rev(struct schedule_node, node, schedule_list,
478
link) {
479
calculate_deps(&state, (struct schedule_node *)node);
480
}
481
}
482
483
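/* Running state for instruction selection: "tick" counts emitted instruction
 * slots, and the last_*_tick fields record when hazardous events happened so
 * the chooser can keep the required minimum distances from them.
 */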
struct choose_scoreboard {
484
struct dag *dag;
485
int tick;
486
int last_magic_sfu_write_tick;
487
int last_stallable_sfu_reg;
488
int last_stallable_sfu_tick;
489
int last_ldvary_tick;
490
int last_unifa_write_tick;
491
int last_uniforms_reset_tick;
492
int last_thrsw_tick;
493
int last_branch_tick;
494
int last_setmsf_tick;
495
bool tlb_locked;
496
bool fixup_ldvary;
497
int ldvary_count;
498
};
499
500
static bool
501
mux_reads_too_soon(struct choose_scoreboard *scoreboard,
502
const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
503
{
504
switch (mux) {
505
case V3D_QPU_MUX_R4:
506
if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2)
507
return true;
508
break;
509
510
case V3D_QPU_MUX_R5:
511
if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
512
return true;
513
break;
514
default:
515
break;
516
}
517
518
return false;
519
}
520
521
static bool
522
reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
523
struct qinst *qinst)
524
{
525
const struct v3d_qpu_instr *inst = &qinst->qpu;
526
527
/* XXX: Branching off of raddr. */
528
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
529
return false;
530
531
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
532
533
if (inst->alu.add.op != V3D_QPU_A_NOP) {
534
if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
535
mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
536
return true;
537
}
538
if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
539
mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
540
return true;
541
}
542
}
543
544
if (inst->alu.mul.op != V3D_QPU_M_NOP) {
545
if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
546
mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
547
return true;
548
}
549
if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
550
mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
551
return true;
552
}
553
}
554
555
/* XXX: imm */
556
557
return false;
558
}
559
560
static bool
561
writes_too_soon_after_write(const struct v3d_device_info *devinfo,
562
struct choose_scoreboard *scoreboard,
563
struct qinst *qinst)
564
{
565
const struct v3d_qpu_instr *inst = &qinst->qpu;
566
567
/* Don't schedule any other r4 write too soon after an SFU write.
568
* This would normally be prevented by dependency tracking, but might
569
* occur if a dead SFU computation makes it to scheduling.
570
*/
571
if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
572
v3d_qpu_writes_r4(devinfo, inst))
573
return true;
574
575
return false;
576
}
577
578
static bool
579
pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
580
const struct v3d_qpu_instr *inst)
581
{
582
return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst));
583
}
584
585
static bool
586
qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
587
uint32_t waddr) {
588
589
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
590
return false;
591
592
if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
593
inst->raddr_a == waddr)
594
return true;
595
596
if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
597
!inst->sig.small_imm && (inst->raddr_b == waddr))
598
return true;
599
600
return false;
601
}
602
603
static bool
604
mux_read_stalls(struct choose_scoreboard *scoreboard,
605
const struct v3d_qpu_instr *inst)
606
{
607
return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
608
qpu_instruction_uses_rf(inst,
609
scoreboard->last_stallable_sfu_reg);
610
}
611
612
/* We define a max schedule priority to allow negative priorities as a result of
613
* subtracting this max when an instruction stalls. So instructions that
614
* stall have lower priority than regular instructions. */
615
#define MAX_SCHEDULE_PRIORITY 16
616
617
static int
618
get_instruction_priority(const struct v3d_device_info *devinfo,
619
const struct v3d_qpu_instr *inst)
620
{
621
uint32_t baseline_score;
622
uint32_t next_score = 0;
623
624
/* Schedule TLB operations as late as possible, to get more
625
* parallelism between shaders.
626
*/
627
if (qpu_inst_is_tlb(inst))
628
return next_score;
629
next_score++;
630
631
/* Schedule texture read results collection late to hide latency. */
632
if (v3d_qpu_waits_on_tmu(inst))
633
return next_score;
634
next_score++;
635
636
/* Default score for things that aren't otherwise special. */
637
baseline_score = next_score;
638
next_score++;
639
640
/* Schedule texture read setup early to hide their latency better. */
641
if (v3d_qpu_writes_tmu(devinfo, inst))
642
return next_score;
643
next_score++;
644
645
/* If this assertion fires, the maximum should be increased. */
646
assert(next_score < MAX_SCHEDULE_PRIORITY);
647
648
return baseline_score;
649
}
650
651
static bool
652
qpu_magic_waddr_is_periph(const struct v3d_device_info *devinfo,
653
enum v3d_qpu_waddr waddr)
654
{
655
return (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) ||
656
v3d_qpu_magic_waddr_is_sfu(waddr) ||
657
v3d_qpu_magic_waddr_is_tlb(waddr) ||
658
v3d_qpu_magic_waddr_is_vpm(waddr) ||
659
v3d_qpu_magic_waddr_is_tsy(waddr));
660
}
661
662
static bool
663
qpu_accesses_peripheral(const struct v3d_device_info *devinfo,
664
const struct v3d_qpu_instr *inst)
665
{
666
if (v3d_qpu_uses_vpm(inst))
667
return true;
668
if (v3d_qpu_uses_sfu(inst))
669
return true;
670
671
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
672
if (inst->alu.add.op != V3D_QPU_A_NOP &&
673
inst->alu.add.magic_write &&
674
qpu_magic_waddr_is_periph(devinfo, inst->alu.add.waddr)) {
675
return true;
676
}
677
678
if (inst->alu.add.op == V3D_QPU_A_TMUWT)
679
return true;
680
681
if (inst->alu.mul.op != V3D_QPU_M_NOP &&
682
inst->alu.mul.magic_write &&
683
qpu_magic_waddr_is_periph(devinfo, inst->alu.mul.waddr)) {
684
return true;
685
}
686
}
687
688
return (inst->sig.ldvpm ||
689
inst->sig.ldtmu ||
690
inst->sig.ldtlb ||
691
inst->sig.ldtlbu ||
692
inst->sig.wrtmuc);
693
}
694
695
static bool
696
qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
697
const struct v3d_qpu_instr *a,
698
const struct v3d_qpu_instr *b)
699
{
700
const bool a_uses_peripheral = qpu_accesses_peripheral(devinfo, a);
701
const bool b_uses_peripheral = qpu_accesses_peripheral(devinfo, b);
702
703
/* We can always do one peripheral access per instruction. */
704
if (!a_uses_peripheral || !b_uses_peripheral)
705
return true;
706
707
if (devinfo->ver < 41)
708
return false;
709
710
/* V3D 4.1 and later allow TMU read along with a VPM read or write, and
711
* WRTMUC with a TMU magic register write (other than tmuc).
712
*/
713
if ((a->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(b)) ||
714
(b->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(a))) {
715
return true;
716
}
717
718
if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
719
(b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, a))) {
720
return true;
721
}
722
723
return false;
724
}
725
726
/* Compute a bitmask of which rf registers are used between
727
* the two instructions.
728
*/
729
static uint64_t
730
qpu_raddrs_used(const struct v3d_qpu_instr *a,
731
const struct v3d_qpu_instr *b)
732
{
733
assert(a->type == V3D_QPU_INSTR_TYPE_ALU);
734
assert(b->type == V3D_QPU_INSTR_TYPE_ALU);
735
736
uint64_t raddrs_used = 0;
737
if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
738
raddrs_used |= (1ll << a->raddr_a);
739
if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
740
raddrs_used |= (1ll << a->raddr_b);
741
if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
742
raddrs_used |= (1ll << b->raddr_a);
743
if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
744
raddrs_used |= (1ll << b->raddr_b);
745
746
return raddrs_used;
747
}
748
749
/* Take two instructions and attempt to merge their raddr fields
750
* into one merged instruction. Returns false if the two instructions
751
* access more than two different rf registers between them, or more
752
* than one rf register and one small immediate.
753
*/
754
static bool
755
qpu_merge_raddrs(struct v3d_qpu_instr *result,
756
const struct v3d_qpu_instr *add_instr,
757
const struct v3d_qpu_instr *mul_instr)
758
{
759
uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
760
int naddrs = util_bitcount64(raddrs_used);
761
762
if (naddrs > 2)
763
return false;
764
765
if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
766
if (naddrs > 1)
767
return false;
768
769
if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
770
if (add_instr->raddr_b != mul_instr->raddr_b)
771
return false;
772
773
result->sig.small_imm = true;
774
result->raddr_b = add_instr->sig.small_imm ?
775
add_instr->raddr_b : mul_instr->raddr_b;
776
}
777
778
if (naddrs == 0)
779
return true;
780
781
int raddr_a = ffsll(raddrs_used) - 1;
782
raddrs_used &= ~(1ll << raddr_a);
783
result->raddr_a = raddr_a;
784
785
if (!result->sig.small_imm) {
786
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
787
raddr_a == add_instr->raddr_b) {
788
if (add_instr->alu.add.a == V3D_QPU_MUX_B)
789
result->alu.add.a = V3D_QPU_MUX_A;
790
if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
791
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
792
result->alu.add.b = V3D_QPU_MUX_A;
793
}
794
}
795
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
796
raddr_a == mul_instr->raddr_b) {
797
if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
798
result->alu.mul.a = V3D_QPU_MUX_A;
799
if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
800
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
801
result->alu.mul.b = V3D_QPU_MUX_A;
802
}
803
}
804
}
805
if (!raddrs_used)
806
return true;
807
808
int raddr_b = ffsll(raddrs_used) - 1;
809
result->raddr_b = raddr_b;
810
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
811
raddr_b == add_instr->raddr_a) {
812
if (add_instr->alu.add.a == V3D_QPU_MUX_A)
813
result->alu.add.a = V3D_QPU_MUX_B;
814
if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
815
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
816
result->alu.add.b = V3D_QPU_MUX_B;
817
}
818
}
819
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
820
raddr_b == mul_instr->raddr_a) {
821
if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
822
result->alu.mul.a = V3D_QPU_MUX_B;
823
if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
824
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
825
result->alu.mul.b = V3D_QPU_MUX_B;
826
}
827
}
828
829
return true;
830
}
831
832
static bool
833
can_do_add_as_mul(enum v3d_qpu_add_op op)
834
{
835
switch (op) {
836
case V3D_QPU_A_ADD:
837
case V3D_QPU_A_SUB:
838
return true;
839
default:
840
return false;
841
}
842
}
843
844
static enum v3d_qpu_mul_op
845
add_op_as_mul_op(enum v3d_qpu_add_op op)
846
{
847
switch (op) {
848
case V3D_QPU_A_ADD:
849
return V3D_QPU_M_ADD;
850
case V3D_QPU_A_SUB:
851
return V3D_QPU_M_SUB;
852
default:
853
unreachable("unexpected add opcode");
854
}
855
}
856
857
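/* Moves an add-ALU operation (and its condition/flag fields) into the mul
 * slot, for opcodes that have a mul-ALU equivalent (see can_do_add_as_mul()),
 * freeing the add slot for merging.
 */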
static void
858
qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
859
{
860
STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
861
assert(inst->alu.add.op != V3D_QPU_A_NOP);
862
assert(inst->alu.mul.op == V3D_QPU_M_NOP);
863
864
memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul));
865
inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op);
866
inst->alu.add.op = V3D_QPU_A_NOP;
867
868
inst->flags.mc = inst->flags.ac;
869
inst->flags.mpf = inst->flags.apf;
870
inst->flags.muf = inst->flags.auf;
871
inst->flags.ac = V3D_QPU_COND_NONE;
872
inst->flags.apf = V3D_QPU_PF_NONE;
873
inst->flags.auf = V3D_QPU_UF_NONE;
874
}
875
876
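/* Tries to combine two ALU instructions into a single dual-issue instruction,
 * converting an add op into its mul-ALU form when that is what makes the pair
 * fit. Returns false if the result cannot be encoded as one instruction.
 */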
static bool
877
qpu_merge_inst(const struct v3d_device_info *devinfo,
878
struct v3d_qpu_instr *result,
879
const struct v3d_qpu_instr *a,
880
const struct v3d_qpu_instr *b)
881
{
882
if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
883
b->type != V3D_QPU_INSTR_TYPE_ALU) {
884
return false;
885
}
886
887
if (!qpu_compatible_peripheral_access(devinfo, a, b))
888
return false;
889
890
struct v3d_qpu_instr merge = *a;
891
const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL;
892
893
struct v3d_qpu_instr mul_inst;
894
if (b->alu.add.op != V3D_QPU_A_NOP) {
895
if (a->alu.add.op == V3D_QPU_A_NOP) {
896
merge.alu.add = b->alu.add;
897
898
merge.flags.ac = b->flags.ac;
899
merge.flags.apf = b->flags.apf;
900
merge.flags.auf = b->flags.auf;
901
902
add_instr = b;
903
mul_instr = a;
904
}
905
/* If a's add op is used but its mul op is not, then see if we
906
* can convert either a's add op or b's add op to a mul op
907
* so we can merge.
908
*/
909
else if (a->alu.mul.op == V3D_QPU_M_NOP &&
910
can_do_add_as_mul(b->alu.add.op)) {
911
mul_inst = *b;
912
qpu_convert_add_to_mul(&mul_inst);
913
914
merge.alu.mul = mul_inst.alu.mul;
915
916
merge.flags.mc = b->flags.ac;
917
merge.flags.mpf = b->flags.apf;
918
merge.flags.muf = b->flags.auf;
919
920
add_instr = a;
921
mul_instr = &mul_inst;
922
} else if (a->alu.mul.op == V3D_QPU_M_NOP &&
923
can_do_add_as_mul(a->alu.add.op)) {
924
mul_inst = *a;
925
qpu_convert_add_to_mul(&mul_inst);
926
927
merge = mul_inst;
928
merge.alu.add = b->alu.add;
929
930
merge.flags.ac = b->flags.ac;
931
merge.flags.apf = b->flags.apf;
932
merge.flags.auf = b->flags.auf;
933
934
add_instr = b;
935
mul_instr = &mul_inst;
936
} else {
937
return false;
938
}
939
}
940
941
if (b->alu.mul.op != V3D_QPU_M_NOP) {
942
if (a->alu.mul.op != V3D_QPU_M_NOP)
943
return false;
944
merge.alu.mul = b->alu.mul;
945
946
merge.flags.mc = b->flags.mc;
947
merge.flags.mpf = b->flags.mpf;
948
merge.flags.muf = b->flags.muf;
949
950
mul_instr = b;
951
add_instr = a;
952
}
953
954
if (add_instr && mul_instr &&
955
!qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
956
return false;
957
}
958
959
merge.sig.thrsw |= b->sig.thrsw;
960
merge.sig.ldunif |= b->sig.ldunif;
961
merge.sig.ldunifrf |= b->sig.ldunifrf;
962
merge.sig.ldunifa |= b->sig.ldunifa;
963
merge.sig.ldunifarf |= b->sig.ldunifarf;
964
merge.sig.ldtmu |= b->sig.ldtmu;
965
merge.sig.ldvary |= b->sig.ldvary;
966
merge.sig.ldvpm |= b->sig.ldvpm;
967
merge.sig.small_imm |= b->sig.small_imm;
968
merge.sig.ldtlb |= b->sig.ldtlb;
969
merge.sig.ldtlbu |= b->sig.ldtlbu;
970
merge.sig.ucb |= b->sig.ucb;
971
merge.sig.rotate |= b->sig.rotate;
972
merge.sig.wrtmuc |= b->sig.wrtmuc;
973
974
if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
975
v3d_qpu_sig_writes_address(devinfo, &b->sig))
976
return false;
977
merge.sig_addr |= b->sig_addr;
978
merge.sig_magic |= b->sig_magic;
979
980
uint64_t packed;
981
bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);
982
983
*result = merge;
984
/* No modifying the real instructions on failure. */
985
assert(ok || (a != result && b != result));
986
987
return ok;
988
}
989
990
static inline bool
991
try_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst)
992
{
993
return inst->sig.ldunif || inst->sig.ldunifrf;
994
}
995
996
static bool
997
qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
998
struct choose_scoreboard *scoreboard,
999
const struct qinst *qinst);
1000
1001
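/* Picks the next DAG head to schedule, or a head to pair with prev_inst when
 * that is non-NULL, skipping candidates that would violate the hazard rules
 * below and preferring higher-priority, longer-critical-path instructions.
 */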
static struct schedule_node *
1002
choose_instruction_to_schedule(struct v3d_compile *c,
1003
struct choose_scoreboard *scoreboard,
1004
struct schedule_node *prev_inst)
1005
{
1006
struct schedule_node *chosen = NULL;
1007
int chosen_prio = 0;
1008
1009
/* Don't pair up anything with a thread switch signal -- emit_thrsw()
1010
* will handle pairing it along with filling the delay slots.
1011
*/
1012
if (prev_inst) {
1013
if (prev_inst->inst->qpu.sig.thrsw)
1014
return NULL;
1015
}
1016
1017
bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT &&
1018
scoreboard->ldvary_count < c->num_inputs;
1019
bool skipped_insts_for_ldvary_pipelining = false;
1020
retry:
1021
list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
1022
dag.link) {
1023
const struct v3d_qpu_instr *inst = &n->inst->qpu;
1024
1025
if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) {
1026
skipped_insts_for_ldvary_pipelining = true;
1027
continue;
1028
}
1029
1030
/* Don't choose the branch instruction until it's the last one
1031
* left. We'll move it up to fit its delay slots after we
1032
* choose it.
1033
*/
1034
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
1035
!list_is_singular(&scoreboard->dag->heads)) {
1036
continue;
1037
}
1038
1039
/* We need to have 3 delay slots between a write to unifa and
1040
* a follow-up ldunifa.
1041
*/
1042
if ((inst->sig.ldunifa || inst->sig.ldunifarf) &&
1043
scoreboard->tick - scoreboard->last_unifa_write_tick <= 3)
1044
continue;
1045
1046
/* "An instruction must not read from a location in physical
1047
* regfile A or B that was written to by the previous
1048
* instruction."
1049
*/
1050
if (reads_too_soon_after_write(scoreboard, n->inst))
1051
continue;
1052
1053
if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
1054
continue;
1055
1056
/* "A scoreboard wait must not occur in the first two
1057
* instructions of a fragment shader. This is either the
1058
* explicit Wait for Scoreboard signal or an implicit wait
1059
* with the first tile-buffer read or write instruction."
1060
*/
1061
if (pixel_scoreboard_too_soon(scoreboard, inst))
1062
continue;
1063
1064
/* ldunif and ldvary both write r5, but ldunif does so a tick
1065
* sooner. If the ldvary's r5 wasn't used, then ldunif might
1066
* otherwise get scheduled so ldunif and ldvary try to update
1067
* r5 in the same tick.
1068
*/
1069
if ((inst->sig.ldunif || inst->sig.ldunifa) &&
1070
scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
1071
continue;
1072
}
1073
1074
/* If we are in a thrsw delay slot check that this instruction
1075
* is valid for that.
1076
*/
1077
if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick &&
1078
!qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard,
1079
n->inst)) {
1080
continue;
1081
}
1082
1083
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
1084
/* Don't try to put a branch in the delay slots of another
1085
* branch or a unifa write.
1086
*/
1087
if (scoreboard->last_branch_tick + 3 >= scoreboard->tick)
1088
continue;
1089
if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick)
1090
continue;
1091
1092
/* No branch with cond != 0,2,3 and msfign != 0 after
1093
* setmsf.
1094
*/
1095
if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 &&
1096
inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
1097
inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
1098
inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
1099
inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
1100
continue;
1101
}
1102
}
1103
1104
/* If we're trying to pair with another instruction, check
1105
* that they're compatible.
1106
*/
1107
if (prev_inst) {
1108
/* Don't pair up a thread switch signal -- we'll
1109
* handle pairing it when we pick it on its own.
1110
*/
1111
if (inst->sig.thrsw)
1112
continue;
1113
1114
if (prev_inst->inst->uniform != -1 &&
1115
n->inst->uniform != -1)
1116
continue;
1117
1118
/* Simulator complains if we have two uniforms loaded in
1119
* the same instruction, which could happen if we
1120
* have a ldunif or sideband uniform and we pair that
1121
* with ldunifa.
1122
*/
1123
if (vir_has_uniform(prev_inst->inst) &&
1124
(inst->sig.ldunifa || inst->sig.ldunifarf)) {
1125
continue;
1126
}
1127
1128
if ((prev_inst->inst->qpu.sig.ldunifa ||
1129
prev_inst->inst->qpu.sig.ldunifarf) &&
1130
vir_has_uniform(n->inst)) {
1131
continue;
1132
}
1133
1134
/* Don't merge in something that will lock the TLB.
1135
* Hopefully what we have in inst will release some
1136
* other instructions, allowing us to delay the
1137
* TLB-locking instruction until later.
1138
*/
1139
if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
1140
continue;
1141
1142
/* When we successfully pair up an ldvary we then try
1143
* to merge it into the previous instruction if
1144
* possible to improve pipelining. Don't pick up the
1145
* ldvary now if the follow-up fixup would place
1146
* it in the delay slots of a thrsw, which is not
1147
* allowed and would prevent the fixup from being
1148
* successful.
1149
*/
1150
if (inst->sig.ldvary &&
1151
scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
1152
continue;
1153
}
1154
1155
struct v3d_qpu_instr merged_inst;
1156
if (!qpu_merge_inst(c->devinfo, &merged_inst,
1157
&prev_inst->inst->qpu, inst)) {
1158
continue;
1159
}
1160
}
1161
1162
int prio = get_instruction_priority(c->devinfo, inst);
1163
1164
if (mux_read_stalls(scoreboard, inst)) {
1165
/* Don't merge an instruction that stalls */
1166
if (prev_inst)
1167
continue;
1168
else {
1169
/* Any instruction that doesn't stall will have
1170
* higher scheduling priority */
1171
prio -= MAX_SCHEDULE_PRIORITY;
1172
assert(prio < 0);
1173
}
1174
}
1175
1176
/* Found a valid instruction. If nothing better comes along,
1177
* this one works.
1178
*/
1179
if (!chosen) {
1180
chosen = n;
1181
chosen_prio = prio;
1182
continue;
1183
}
1184
1185
if (prio > chosen_prio) {
1186
chosen = n;
1187
chosen_prio = prio;
1188
} else if (prio < chosen_prio) {
1189
continue;
1190
}
1191
1192
if (n->delay > chosen->delay) {
1193
chosen = n;
1194
chosen_prio = prio;
1195
} else if (n->delay < chosen->delay) {
1196
continue;
1197
}
1198
}
1199
1200
/* If we did not find any instruction to schedule but we discarded
1201
* some of them to prioritize ldvary pipelining, try again.
1202
*/
1203
if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) {
1204
skipped_insts_for_ldvary_pipelining = false;
1205
ldvary_pipelining = false;
1206
goto retry;
1207
}
1208
1209
if (chosen && chosen->inst->qpu.sig.ldvary) {
1210
scoreboard->ldvary_count++;
1211
/* If we are pairing an ldvary, flag it so we can fix it up for
1212
* optimal pipelining of ldvary sequences.
1213
*/
1214
if (prev_inst)
1215
scoreboard->fixup_ldvary = true;
1216
}
1217
1218
return chosen;
1219
}
1220
1221
static void
1222
update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
1223
enum v3d_qpu_waddr waddr,
1224
const struct v3d_device_info *devinfo)
1225
{
1226
if (v3d_qpu_magic_waddr_is_sfu(waddr))
1227
scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
1228
else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA)
1229
scoreboard->last_unifa_write_tick = scoreboard->tick;
1230
}
1231
1232
static void
1233
update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
1234
const struct v3d_qpu_instr *inst)
1235
{
1236
if (v3d_qpu_instr_is_sfu(inst)) {
1237
scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
1238
scoreboard->last_stallable_sfu_tick = scoreboard->tick;
1239
}
1240
}
1241
1242
static void
1243
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
1244
const struct v3d_qpu_instr *inst,
1245
const struct v3d_device_info *devinfo)
1246
{
1247
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
1248
return;
1249
1250
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
1251
1252
if (inst->alu.add.op != V3D_QPU_A_NOP) {
1253
if (inst->alu.add.magic_write) {
1254
update_scoreboard_for_magic_waddr(scoreboard,
1255
inst->alu.add.waddr,
1256
devinfo);
1257
} else {
1258
update_scoreboard_for_sfu_stall_waddr(scoreboard,
1259
inst);
1260
}
1261
1262
if (inst->alu.add.op == V3D_QPU_A_SETMSF)
1263
scoreboard->last_setmsf_tick = scoreboard->tick;
1264
}
1265
1266
if (inst->alu.mul.op != V3D_QPU_M_NOP) {
1267
if (inst->alu.mul.magic_write) {
1268
update_scoreboard_for_magic_waddr(scoreboard,
1269
inst->alu.mul.waddr,
1270
devinfo);
1271
}
1272
}
1273
1274
if (inst->sig.ldvary)
1275
scoreboard->last_ldvary_tick = scoreboard->tick;
1276
1277
if (qpu_inst_is_tlb(inst))
1278
scoreboard->tlb_locked = true;
1279
}
1280
1281
static void
1282
dump_state(const struct v3d_device_info *devinfo, struct dag *dag)
1283
{
1284
list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
1285
fprintf(stderr, " t=%4d: ", n->unblocked_time);
1286
v3d_qpu_dump(devinfo, &n->inst->qpu);
1287
fprintf(stderr, "\n");
1288
1289
util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1290
struct schedule_node *child =
1291
(struct schedule_node *)edge->child;
1292
if (!child)
1293
continue;
1294
1295
fprintf(stderr, " - ");
1296
v3d_qpu_dump(devinfo, &child->inst->qpu);
1297
fprintf(stderr, " (%d parents, %c)\n",
1298
child->dag.parent_count,
1299
edge->data ? 'w' : 'r');
1300
}
1301
}
1302
}
1303
1304
static uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo,
1305
enum v3d_qpu_waddr waddr,
1306
const struct v3d_qpu_instr *after)
1307
{
1308
/* Apply some huge latency between texture fetch requests and getting
1309
* their results back.
1310
*
1311
* FIXME: This is actually pretty bogus. If we do:
1312
*
1313
* mov tmu0_s, a
1314
* <a bit of math>
1315
* mov tmu0_s, b
1316
* load_tmu0
1317
* <more math>
1318
* load_tmu0
1319
*
1320
* we count that as worse than
1321
*
1322
* mov tmu0_s, a
1323
* mov tmu0_s, b
1324
* <lots of math>
1325
* load_tmu0
1326
* <more math>
1327
* load_tmu0
1328
*
1329
* because we associate the first load_tmu0 with the *second* tmu0_s.
1330
*/
1331
if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) &&
1332
v3d_qpu_waits_on_tmu(after)) {
1333
return 100;
1334
}
1335
1336
/* Assume that anything depending on us is consuming the SFU result. */
1337
if (v3d_qpu_magic_waddr_is_sfu(waddr))
1338
return 3;
1339
1340
return 1;
1341
}
1342
1343
static uint32_t
1344
instruction_latency(const struct v3d_device_info *devinfo,
1345
struct schedule_node *before, struct schedule_node *after)
1346
{
1347
const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
1348
const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
1349
uint32_t latency = 1;
1350
1351
if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
1352
after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
1353
return latency;
1354
1355
if (before_inst->alu.add.magic_write) {
1356
latency = MAX2(latency,
1357
magic_waddr_latency(devinfo,
1358
before_inst->alu.add.waddr,
1359
after_inst));
1360
}
1361
1362
if (before_inst->alu.mul.magic_write) {
1363
latency = MAX2(latency,
1364
magic_waddr_latency(devinfo,
1365
before_inst->alu.mul.waddr,
1366
after_inst));
1367
}
1368
1369
if (v3d_qpu_instr_is_sfu(before_inst))
1370
return 2;
1371
1372
return latency;
1373
}
1374
1375
/** Recursive computation of the delay member of a node. */
1376
static void
1377
compute_delay(struct dag_node *node, void *state)
1378
{
1379
struct schedule_node *n = (struct schedule_node *)node;
1380
struct v3d_compile *c = (struct v3d_compile *) state;
1381
1382
n->delay = 1;
1383
1384
util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1385
struct schedule_node *child =
1386
(struct schedule_node *)edge->child;
1387
1388
n->delay = MAX2(n->delay, (child->delay +
1389
instruction_latency(c->devinfo, n,
1390
child)));
1391
}
1392
}
1393
1394
/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head()
1395
* should be called on it later to finish pruning the other edges).
1396
*/
1397
static void
1398
pre_remove_head(struct dag *dag, struct schedule_node *n)
1399
{
1400
list_delinit(&n->dag.link);
1401
1402
util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1403
if (edge->data)
1404
dag_remove_edge(dag, edge);
1405
}
1406
}
1407
1408
static void
1409
mark_instruction_scheduled(const struct v3d_device_info *devinfo,
1410
struct dag *dag,
1411
uint32_t time,
1412
struct schedule_node *node)
1413
{
1414
if (!node)
1415
return;
1416
1417
util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
1418
struct schedule_node *child =
1419
(struct schedule_node *)edge->child;
1420
1421
if (!child)
1422
continue;
1423
1424
uint32_t latency = instruction_latency(devinfo, node, child);
1425
1426
child->unblocked_time = MAX2(child->unblocked_time,
1427
time + latency);
1428
}
1429
dag_prune_head(dag, &node->dag);
1430
}
1431
1432
static void
1433
insert_scheduled_instruction(struct v3d_compile *c,
1434
struct qblock *block,
1435
struct choose_scoreboard *scoreboard,
1436
struct qinst *inst)
1437
{
1438
list_addtail(&inst->link, &block->instructions);
1439
1440
update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo);
1441
c->qpu_inst_count++;
1442
scoreboard->tick++;
1443
}
1444
1445
static struct qinst *
1446
vir_nop()
1447
{
1448
struct qreg undef = vir_nop_reg();
1449
struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
1450
1451
return qinst;
1452
}
1453
1454
static void
1455
emit_nop(struct v3d_compile *c, struct qblock *block,
1456
struct choose_scoreboard *scoreboard)
1457
{
1458
insert_scheduled_instruction(c, block, scoreboard, vir_nop());
1459
}
1460
1461
static bool
1462
qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
1463
const struct qinst *qinst, int slot)
1464
{
1465
const struct v3d_qpu_instr *inst = &qinst->qpu;
1466
1467
/* Only TLB Z writes are prohibited in the last slot, but we don't
1468
* have those flagged so prohibit all TLB ops for now.
1469
*/
1470
if (slot == 2 && qpu_inst_is_tlb(inst))
1471
return false;
1472
1473
if (slot > 0 && qinst->uniform != ~0)
1474
return false;
1475
1476
if (v3d_qpu_uses_vpm(inst))
1477
return false;
1478
1479
if (inst->sig.ldvary)
1480
return false;
1481
1482
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
1483
/* GFXH-1625: TMUWT not allowed in the final instruction. */
1484
if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
1485
return false;
1486
1487
/* No writing physical registers at the end. */
1488
if (!inst->alu.add.magic_write ||
1489
!inst->alu.mul.magic_write) {
1490
return false;
1491
}
1492
1493
if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
1494
return false;
1495
1496
/* RF0-2 might be overwritten during the delay slots by
1497
* fragment shader setup.
1498
*/
1499
if (inst->raddr_a < 3 &&
1500
(inst->alu.add.a == V3D_QPU_MUX_A ||
1501
inst->alu.add.b == V3D_QPU_MUX_A ||
1502
inst->alu.mul.a == V3D_QPU_MUX_A ||
1503
inst->alu.mul.b == V3D_QPU_MUX_A)) {
1504
return false;
1505
}
1506
1507
if (inst->raddr_b < 3 &&
1508
!inst->sig.small_imm &&
1509
(inst->alu.add.a == V3D_QPU_MUX_B ||
1510
inst->alu.add.b == V3D_QPU_MUX_B ||
1511
inst->alu.mul.a == V3D_QPU_MUX_B ||
1512
inst->alu.mul.b == V3D_QPU_MUX_B)) {
1513
return false;
1514
}
1515
}
1516
1517
return true;
1518
}
1519
1520
/**
1521
* This is called when trying to merge a thrsw back into the instruction stream
1522
* of instructions that were scheduled *before* the thrsw signal to fill its
1523
* delay slots. Because the actual execution of the thrsw happens after the
1524
* delay slots, it is usually safe to do this, but there are some cases that
1525
* need special care.
1526
*/
1527
static bool
1528
qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
1529
const struct qinst *qinst,
1530
uint32_t slot)
1531
{
1532
/* No scheduling SFU when the result would land in the other
1533
* thread. The simulator complains for safety, though it
1534
* would only occur for dead code in our case.
1535
*/
1536
if (slot > 0 &&
1537
qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
1538
(v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
1539
v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
1540
return false;
1541
}
1542
1543
if (slot > 0 && qinst->qpu.sig.ldvary)
1544
return false;
1545
1546
/* unifa and the following 3 instructions can't overlap a
1547
* thread switch/end. The docs further clarify that this means
1548
* the cycle at which the actual thread switch/end happens
1549
* and not when the thrsw instruction is processed, which would
1550
* be after the 2 delay slots following the thrsw instruction.
1551
* This means that we can move a thrsw up to the instruction
1552
* right after unifa:
1553
*
1554
* unifa, r5
1555
* thrsw
1556
* delay slot 1
1557
* delay slot 2
1558
* Thread switch happens here, 4 instructions away from unifa
1559
*/
1560
if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
1561
return false;
1562
1563
return true;
1564
}
1565
1566
/**
1567
* This is called for instructions scheduled *after* a thrsw signal that may
1568
* land in the delay slots of the thrsw. Because these instructions were
1569
* scheduled after the thrsw, we need to be careful when placing them into
1570
* the delay slots, since that means that we are moving them ahead of the
1571
* thread switch and we need to ensure that is not a problem.
1572
*/
1573
static bool
1574
qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
1575
struct choose_scoreboard *scoreboard,
1576
const struct qinst *qinst)
1577
{
1578
const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick;
1579
assert(slot <= 2);
1580
1581
/* We merge thrsw instructions back into the instruction stream
1582
* manually, so any instructions scheduled after a thrsw should be
1583
* in the actual delay slots and not in the same slot as the thrsw.
1584
*/
1585
assert(slot >= 1);
1586
1587
/* No emitting a thrsw while the previous thrsw hasn't happened yet. */
1588
if (qinst->qpu.sig.thrsw)
1589
return false;
1590
1591
/* The restrictions for instructions scheduled before the thrsw
1592
* also apply to instructions scheduled after the thrsw that we want
1593
* to place in its delay slots.
1594
*/
1595
if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
1596
return false;
1597
1598
/* TLB access is disallowed until scoreboard wait is executed, which
1599
* we do on the last thread switch.
1600
*/
1601
if (qpu_inst_is_tlb(&qinst->qpu))
1602
return false;
1603
1604
/* Instruction sequence restrictions: Branch is not allowed in delay
1605
* slots of a thrsw.
1606
*/
1607
if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
1608
return false;
1609
1610
/* Miscellaneous restrictions: At the point of a thrsw we need to have
1611
* at least one outstanding lookup or TSY wait.
1612
*
1613
* So avoid placing TMU instructions scheduled after the thrsw into
1614
* its delay slots or we may be compromising the integrity of our TMU
1615
* sequences. Also, notice that if we moved these instructions into
1616
* the delay slots of a previous thrsw we could overflow our TMU output
1617
* fifo, since we could be effectively pipelining a lookup scheduled
1618
* after the thrsw into the sequence before the thrsw.
1619
*/
1620
if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) ||
1621
qinst->qpu.sig.wrtmuc) {
1622
return false;
1623
}
1624
1625
/* Don't move instructions that wait on the TMU before the thread switch
1626
* happens since that would make the current thread stall before the
1627
* switch, which is exactly what we want to avoid with the thrsw
1628
* instruction.
1629
*/
1630
if (v3d_qpu_waits_on_tmu(&qinst->qpu))
1631
return false;
1632
1633
/* A thread switch invalidates all accumulators, so don't place any
1634
* instructions that write accumulators into the delay slots.
1635
*/
1636
if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu))
1637
return false;
1638
1639
/* Multop has an implicit write to the rtop register which is an
1640
* specialized accumulator that is only used with this instruction.
1641
*/
1642
if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP)
1643
return false;
1644
1645
/* Flags are invalidated across a thread switch, so don't place
1646
* instructions that write flags into delay slots.
1647
*/
1648
if (v3d_qpu_writes_flags(&qinst->qpu))
1649
return false;
1650
1651
return true;
1652
}
1653
1654
static bool
1655
valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
1656
struct qinst *qinst, int instructions_in_sequence,
1657
bool is_thrend)
1658
{
1659
/* No emitting our thrsw while the previous thrsw hasn't happened yet. */
1660
if (scoreboard->last_thrsw_tick + 3 >
1661
scoreboard->tick - instructions_in_sequence) {
1662
return false;
1663
}
1664
1665
for (int slot = 0; slot < instructions_in_sequence; slot++) {
1666
if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
1667
return false;
1668
1669
if (is_thrend &&
1670
!qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
1671
return false;
1672
}
1673
1674
/* Note that the list is circular, so we can only do this up
1675
* to instructions_in_sequence.
1676
*/
1677
qinst = (struct qinst *)qinst->link.next;
1678
}
1679
1680
return true;
1681
}
1682
1683
/**
1684
* Emits a THRSW signal in the stream, trying to move it up to pair with
1685
* another instruction.
1686
*/
1687
static int
1688
emit_thrsw(struct v3d_compile *c,
1689
struct qblock *block,
1690
struct choose_scoreboard *scoreboard,
1691
struct qinst *inst,
1692
bool is_thrend)
1693
{
1694
int time = 0;
1695
1696
/* There should be nothing in a thrsw inst being scheduled other than
1697
* the signal bits.
1698
*/
1699
assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
1700
assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
1701
assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
1702
1703
/* Don't try to emit a thrsw in the delay slots of a previous thrsw
1704
* or branch.
1705
*/
1706
while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
1707
emit_nop(c, block, scoreboard);
1708
time++;
1709
}
1710
while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
1711
emit_nop(c, block, scoreboard);
1712
time++;
1713
}
1714
1715
/* Find how far back into previous instructions we can put the THRSW. */
1716
int slots_filled = 0;
1717
struct qinst *merge_inst = NULL;
1718
vir_for_each_inst_rev(prev_inst, block) {
1719
struct v3d_qpu_sig sig = prev_inst->qpu.sig;
1720
sig.thrsw = true;
1721
uint32_t packed_sig;
1722
1723
if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
1724
break;
1725
1726
if (!valid_thrsw_sequence(c, scoreboard,
1727
prev_inst, slots_filled + 1,
1728
is_thrend)) {
1729
break;
1730
}
1731
1732
merge_inst = prev_inst;
1733
if (++slots_filled == 3)
1734
break;
1735
}
1736
1737
bool needs_free = false;
1738
if (merge_inst) {
1739
merge_inst->qpu.sig.thrsw = true;
1740
needs_free = true;
1741
scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
1742
} else {
1743
scoreboard->last_thrsw_tick = scoreboard->tick;
1744
insert_scheduled_instruction(c, block, scoreboard, inst);
1745
time++;
1746
slots_filled++;
1747
merge_inst = inst;
1748
}
1749
1750
/* If we're emitting the last THRSW (other than program end), then
1751
* signal that to the HW by emitting two THRSWs in a row.
1752
*/
1753
if (inst->is_last_thrsw) {
1754
if (slots_filled <= 1) {
1755
emit_nop(c, block, scoreboard);
1756
time++;
1757
}
1758
struct qinst *second_inst =
1759
(struct qinst *)merge_inst->link.next;
1760
second_inst->qpu.sig.thrsw = true;
1761
}
1762
1763
/* Make sure the thread end executes within the program lifespan */
1764
if (is_thrend) {
1765
for (int i = 0; i < 3 - slots_filled; i++) {
1766
emit_nop(c, block, scoreboard);
1767
time++;
1768
}
1769
}
1770
1771
/* If we put our THRSW into another instruction, free up the
1772
* instruction that didn't end up scheduled into the list.
1773
*/
1774
if (needs_free)
1775
free(inst);
1776
1777
return time;
1778
}
1779
1780
static bool
1781
qpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst)
1782
{
1783
if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
1784
return false;
1785
1786
if (inst->qpu.sig.thrsw)
1787
return false;
1788
1789
if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu))
1790
return false;
1791
1792
if (vir_has_uniform(inst))
1793
return false;
1794
1795
return true;
1796
}
1797
1798
static void
1799
emit_branch(struct v3d_compile *c,
1800
struct qblock *block,
1801
struct choose_scoreboard *scoreboard,
1802
struct qinst *inst)
1803
{
1804
assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
1805
1806
/* We shouldn't have picked up a branch for the delay slots of a previous
1807
* thrsw, branch or unifa write instruction.
1808
*/
1809
int branch_tick = scoreboard->tick;
1810
assert(scoreboard->last_thrsw_tick + 2 < branch_tick);
1811
assert(scoreboard->last_branch_tick + 3 < branch_tick);
1812
assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
1813
1814
/* Can't place a branch with msfign != 0 and cond != 0,2,3 after
1815
* setmsf.
1816
*/
1817
bool is_safe_msf_branch =
1818
inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
1819
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
1820
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
1821
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0;
1822
assert(scoreboard->last_setmsf_tick != branch_tick - 1 ||
1823
is_safe_msf_branch);
1824
1825
/* Insert the branch instruction */
1826
insert_scheduled_instruction(c, block, scoreboard, inst);
1827
1828
/* Now see if we can move the branch instruction back into the
1829
* instruction stream to fill its delay slots
1830
*/
1831
int slots_filled = 0;
1832
while (slots_filled < 3 && block->instructions.next != &inst->link) {
1833
struct qinst *prev_inst = (struct qinst *) inst->link.prev;
1834
assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH);
1835
1836
/* Can't move the branch instruction if that would place it
1837
* in the delay slots of other instructions.
1838
*/
1839
if (scoreboard->last_branch_tick + 3 >=
1840
branch_tick - slots_filled - 1) {
1841
break;
1842
}
1843
1844
if (scoreboard->last_thrsw_tick + 2 >=
1845
branch_tick - slots_filled - 1) {
1846
break;
1847
}
1848
1849
if (scoreboard->last_unifa_write_tick + 3 >=
1850
branch_tick - slots_filled - 1) {
1851
break;
1852
}
1853
1854
/* Can't move a conditional branch before the instruction
1855
* that writes the flags for its condition.
1856
*/
1857
if (v3d_qpu_writes_flags(&prev_inst->qpu) &&
1858
inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) {
1859
break;
1860
}
1861
1862
if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst))
1863
break;
1864
1865
if (!is_safe_msf_branch) {
1866
struct qinst *prev_prev_inst =
1867
(struct qinst *) prev_inst->link.prev;
1868
if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
1869
prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) {
1870
break;
1871
}
1872
}
1873
1874
list_del(&prev_inst->link);
1875
list_add(&prev_inst->link, &inst->link);
1876
slots_filled++;
1877
}
1878
1879
block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled;
1880
scoreboard->last_branch_tick = branch_tick - slots_filled;
1881
1882
/* Fill any remaining delay slots.
1883
*
1884
* For unconditional branches we'll try to fill these with the
1885
* first instructions in the successor block after scheduling
1886
* all blocks when setting up branch targets.
1887
*/
1888
for (int i = 0; i < 3 - slots_filled; i++)
1889
emit_nop(c, block, scoreboard);
1890
}
1891
1892
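/* Returns true if the given ALU op of the instruction (add or mul) reads the
 * given source: a mux/accumulator selection when 'magic' is set, or a
 * register-file address otherwise.
 */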
static bool
alu_reads_register(struct v3d_qpu_instr *inst,
                   bool add, bool magic, uint32_t index)
{
        uint32_t num_src;
        enum v3d_qpu_mux mux_a, mux_b;

        if (add) {
                num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
                mux_a = inst->alu.add.a;
                mux_b = inst->alu.add.b;
        } else {
                num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
                mux_a = inst->alu.mul.a;
                mux_b = inst->alu.mul.b;
        }

        for (int i = 0; i < num_src; i++) {
                if (magic) {
                        if (i == 0 && mux_a == index)
                                return true;
                        if (i == 1 && mux_b == index)
                                return true;
                } else {
                        if (i == 0 && mux_a == V3D_QPU_MUX_A &&
                            inst->raddr_a == index) {
                                return true;
                        }
                        if (i == 0 && mux_a == V3D_QPU_MUX_B &&
                            inst->raddr_b == index) {
                                return true;
                        }
                        if (i == 1 && mux_b == V3D_QPU_MUX_A &&
                            inst->raddr_a == index) {
                                return true;
                        }
                        if (i == 1 && mux_b == V3D_QPU_MUX_B &&
                            inst->raddr_b == index) {
                                return true;
                        }
                }
        }

        return false;
}

/**
 * This takes an ldvary signal merged into 'inst' and tries to move it up to
 * the previous instruction to get good pipelining of ldvary sequences,
 * transforming this:
 *
 * nop                  ; nop               ; ldvary.r4
 * nop                  ; fmul  r0, r4, rf0 ;
 * fadd  rf13, r0, r5   ; nop;              ; ldvary.r1  <-- inst
 *
 * into:
 *
 * nop                  ; nop               ; ldvary.r4
 * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
 * fadd  rf13, r0, r5   ; nop;              ;            <-- inst
 *
 * If we manage to do this successfully (we return true here), then flagging
 * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
 * we will be able to pick up to merge into 'inst', leading to code like this:
 *
 * nop                  ; nop               ; ldvary.r4
 * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
 * fadd  rf13, r0, r5   ; fmul  r2, r1, rf0 ;            <-- inst
 */
static bool
fixup_pipelined_ldvary(struct v3d_compile *c,
                       struct choose_scoreboard *scoreboard,
                       struct qblock *block,
                       struct v3d_qpu_instr *inst)
{
        /* We only call this if we have successfully merged an ldvary into a
         * previous instruction.
         */
        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
        assert(inst->sig.ldvary);
        uint32_t ldvary_magic = inst->sig_magic;
        uint32_t ldvary_index = inst->sig_addr;

        /* The instruction in which we merged the ldvary cannot read
         * the ldvary destination: if it did, moving the ldvary before
         * it would overwrite the value it reads.
         */
        if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
                return false;
        if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
                return false;

        /* The previous instruction can't write to the same destination as the
         * ldvary.
         */
        struct qinst *prev = (struct qinst *) block->instructions.prev;
        if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
                if (prev->qpu.alu.add.magic_write == ldvary_magic &&
                    prev->qpu.alu.add.waddr == ldvary_index) {
                        return false;
                }
        }

        if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
                if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
                    prev->qpu.alu.mul.waddr == ldvary_index) {
                        return false;
                }
        }

        /* The previous instruction cannot have a conflicting signal. */
        if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
                return false;

        /* The previous instruction cannot use flags since ldvary uses the
         * 'cond' instruction field to store the destination.
         */
        if (v3d_qpu_writes_flags(&prev->qpu))
                return false;
        if (v3d_qpu_reads_flags(&prev->qpu))
                return false;

        /* We can't put an ldvary in the delay slots of a thrsw. We should've
         * prevented this when pairing up the ldvary with another instruction
         * and flagging it for a fixup.
         */
        assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);

        /* Move the ldvary to the previous instruction and remove it from the
         * current one.
         */
        prev->qpu.sig.ldvary = true;
        prev->qpu.sig_magic = ldvary_magic;
        prev->qpu.sig_addr = ldvary_index;
        scoreboard->last_ldvary_tick = scoreboard->tick - 1;

        inst->sig.ldvary = false;
        inst->sig_magic = false;
        inst->sig_addr = 0;

        /* By moving ldvary to the previous instruction we make it update
         * r5 in the current one, so nothing else in it should write r5.
         * This should've been prevented by our dependency tracking, which
         * would not allow ldvary to be paired up with an instruction that
         * writes r5 (since our dependency tracking doesn't know that the
         * ldvary write to r5 happens in the next instruction).
         */
        assert(!v3d_qpu_writes_r5(c->devinfo, inst));

        return true;
}

static uint32_t
schedule_instructions(struct v3d_compile *c,
                      struct choose_scoreboard *scoreboard,
                      struct qblock *block,
                      enum quniform_contents *orig_uniform_contents,
                      uint32_t *orig_uniform_data,
                      uint32_t *next_uniform)
{
        const struct v3d_device_info *devinfo = c->devinfo;
        uint32_t time = 0;

        while (!list_is_empty(&scoreboard->dag->heads)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(c, scoreboard, NULL);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                struct qinst *qinst = chosen ? chosen->inst : vir_nop();
                struct v3d_qpu_instr *inst = &qinst->qpu;

                if (debug) {
                        fprintf(stderr, "t=%4d: current list:\n",
                                time);
                        dump_state(devinfo, scoreboard->dag);
                        fprintf(stderr, "t=%4d: chose: ", time);
                        v3d_qpu_dump(devinfo, inst);
                        fprintf(stderr, "\n");
                }

                /* We can't mark_instruction_scheduled() the chosen inst until
                 * we're done identifying instructions to merge, so put the
                 * merged instructions on a list for a moment.
                 */
                struct list_head merged_list;
                list_inithead(&merged_list);

                /* Schedule this instruction onto the QPU list. Also try to
                 * find an instruction to pair with it.
                 */
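                /* "Pairing" here means packing a second operation (typically
                 * one add-ALU and one mul-ALU op with compatible signals and
                 * register accesses) into the same 64-bit QPU instruction via
                 * qpu_merge_inst().
                 */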
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
                        pre_remove_head(scoreboard->dag, chosen);

                        while ((merge =
                                choose_instruction_to_schedule(c, scoreboard,
                                                               chosen))) {
                                time = MAX2(merge->unblocked_time, time);
                                pre_remove_head(scoreboard->dag, merge);
                                list_addtail(&merge->link, &merged_list);
                                (void)qpu_merge_inst(devinfo, inst,
                                                     inst, &merge->inst->qpu);
                                if (merge->inst->uniform != -1) {
                                        chosen->inst->uniform =
                                                merge->inst->uniform;
                                }

                                if (debug) {
                                        fprintf(stderr, "t=%4d: merging: ",
                                                time);
                                        v3d_qpu_dump(devinfo, &merge->inst->qpu);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, " result: ");
                                        v3d_qpu_dump(devinfo, inst);
                                        fprintf(stderr, "\n");
                                }

                                if (scoreboard->fixup_ldvary) {
                                        scoreboard->fixup_ldvary = false;
                                        if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
                                                /* Flag the ldvary as scheduled
                                                 * now so we can try to merge
                                                 * the follow-up instruction in
                                                 * the ldvary sequence into the
                                                 * current instruction.
                                                 */
                                                mark_instruction_scheduled(
                                                        devinfo, scoreboard->dag,
                                                        time, merge);
                                        }
                                }
                        }
                        if (mux_read_stalls(scoreboard, inst))
                                c->qpu_inst_stalled_count++;
                }

                /* Update the uniform index for the rewritten location --
                 * branch target updating will still need to change
                 * c->uniform_data[] using this index.
                 */
                if (qinst->uniform != -1) {
                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                                block->branch_uniform = *next_uniform;

                        c->uniform_data[*next_uniform] =
                                orig_uniform_data[qinst->uniform];
                        c->uniform_contents[*next_uniform] =
                                orig_uniform_contents[qinst->uniform];
                        qinst->uniform = *next_uniform;
                        (*next_uniform)++;
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready
                 * to be scheduled. Update the children's unblocked time for
                 * this DAG edge as we do so.
                 */
                mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen);
                list_for_each_entry(struct schedule_node, merge, &merged_list,
                                    link) {
                        mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge);

                        /* The merged VIR instruction doesn't get re-added to
                         * the block, so free it now.
                         */
                        free(merge->inst);
                }

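                /* THRSW and branch instructions take care of their own delay
                 * slots (emit_thrsw()/emit_branch()); everything else is
                 * simply appended to the block.
                 */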
                if (inst->sig.thrsw) {
                        time += emit_thrsw(c, block, scoreboard, qinst, false);
                } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                        emit_branch(c, block, scoreboard, qinst);
                } else {
                        insert_scheduled_instruction(c, block,
                                                     scoreboard, qinst);
                }
        }

        return time;
}

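/**
 * Schedules a single block: wraps each VIR instruction in a schedule_node,
 * builds the dependency DAG (forward and reverse deps), computes per-node
 * delays bottom-up, and then list-schedules the block, returning its
 * estimated cycle count.
 */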
static uint32_t
qpu_schedule_instructions_block(struct v3d_compile *c,
                                struct choose_scoreboard *scoreboard,
                                struct qblock *block,
                                enum quniform_contents *orig_uniform_contents,
                                uint32_t *orig_uniform_data,
                                uint32_t *next_uniform)
{
        void *mem_ctx = ralloc_context(NULL);
        scoreboard->dag = dag_create(mem_ctx);
        struct list_head setup_list;

        list_inithead(&setup_list);

        /* Wrap each instruction in a scheduler structure. */
        while (!list_is_empty(&block->instructions)) {
                struct qinst *qinst = (struct qinst *)block->instructions.next;
                struct schedule_node *n =
                        rzalloc(mem_ctx, struct schedule_node);

                dag_init_node(scoreboard->dag, &n->dag);
                n->inst = qinst;

                list_del(&qinst->link);
                list_addtail(&n->link, &setup_list);
        }

        calculate_forward_deps(c, scoreboard->dag, &setup_list);
        calculate_reverse_deps(c, scoreboard->dag, &setup_list);

        dag_traverse_bottom_up(scoreboard->dag, compute_delay, c);

        uint32_t cycles = schedule_instructions(c, scoreboard, block,
                                                orig_uniform_contents,
                                                orig_uniform_data,
                                                next_uniform);

        ralloc_free(mem_ctx);
        scoreboard->dag = NULL;

        return cycles;
}

static void
qpu_set_branch_targets(struct v3d_compile *c)
{
        vir_for_each_block(block, c) {
                /* The end block of the program has no branch. */
                if (!block->successors[0])
                        continue;

                /* If there was no branch instruction, then the successor
                 * block must follow immediately after this one.
                 */
                if (block->branch_qpu_ip == ~0) {
                        assert(block->end_qpu_ip + 1 ==
                               block->successors[0]->start_qpu_ip);
                        continue;
                }

                /* Walk back through the delay slots to find the branch
                 * instr.
                 */
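                /* delay_slot_count ends up as the number of trailing NOPs
                 * after the branch (0 to 3) and delay_slots_start points at
                 * the first of them; unconditional branches may overwrite
                 * those NOPs with the successor block's leading instructions
                 * below.
                 */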
                struct qinst *branch = NULL;
                struct list_head *entry = block->instructions.prev;
                int32_t delay_slot_count = -1;
                struct qinst *delay_slots_start = NULL;
                for (int i = 0; i < 3; i++) {
                        entry = entry->prev;
                        struct qinst *inst =
                                container_of(entry, struct qinst, link);

                        if (delay_slot_count == -1) {
                                if (!v3d_qpu_is_nop(&inst->qpu))
                                        delay_slot_count = i;
                                else
                                        delay_slots_start = inst;
                        }

                        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
                                branch = inst;
                                break;
                        }
                }
                assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
                assert(delay_slot_count >= 0 && delay_slot_count <= 3);
                assert(delay_slot_count == 0 || delay_slots_start != NULL);

                /* Make sure that the if-we-don't-jump successor was
                 * scheduled just after the delay slots.
                 */
                assert(!block->successors[1] ||
                       block->successors[1]->start_qpu_ip ==
                       block->branch_qpu_ip + 4);

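                /* The branch offset is relative to the instruction that
                 * follows the branch's three delay slots (branch_qpu_ip + 4),
                 * measured in bytes (each QPU instruction is 64 bits wide).
                 */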
                branch->qpu.branch.offset =
                        ((block->successors[0]->start_qpu_ip -
                          (block->branch_qpu_ip + 4)) *
                         sizeof(uint64_t));

                /* Set up the relative offset to jump in the
                 * uniform stream.
                 *
                 * Use a temporary here, because
                 * uniform_data[inst->uniform] may be shared
                 * between multiple instructions.
                 */
                assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
                c->uniform_data[branch->uniform] =
                        (block->successors[0]->start_uniform -
                         (block->branch_uniform + 1)) * 4;

                /* If this is an unconditional branch, try to fill any
                 * remaining delay slots with the initial instructions of the
                 * successor block.
                 *
                 * FIXME: we can do the same for conditional branches if we
                 * predicate the instructions to match the branch condition.
                 */
                if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) {
                        struct list_head *successor_insts =
                                &block->successors[0]->instructions;
                        delay_slot_count = MIN2(delay_slot_count,
                                                list_length(successor_insts));
                        struct qinst *s_inst =
                                (struct qinst *) successor_insts->next;
                        struct qinst *slot = delay_slots_start;
                        int slots_filled = 0;
                        while (slots_filled < delay_slot_count &&
                               qpu_inst_valid_in_branch_delay_slot(c, s_inst)) {
                                memcpy(&slot->qpu, &s_inst->qpu,
                                       sizeof(slot->qpu));
                                s_inst = (struct qinst *) s_inst->link.next;
                                slot = (struct qinst *) slot->link.next;
                                slots_filled++;
                        }
                        branch->qpu.branch.offset +=
                                slots_filled * sizeof(uint64_t);
                }
        }
}

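/**
 * Entry point for QPU instruction scheduling: schedules every block in the
 * program, emits the program-end THRSW, resolves branch targets, and returns
 * the estimated cycle count of the scheduled program.
 */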
uint32_t
v3d_qpu_schedule_instructions(struct v3d_compile *c)
{
        const struct v3d_device_info *devinfo = c->devinfo;
        struct qblock *end_block = list_last_entry(&c->blocks,
                                                   struct qblock, link);

        /* We reorder the uniforms as we schedule instructions, so save the
         * old data off and replace it.
         */
        uint32_t *uniform_data = c->uniform_data;
        enum quniform_contents *uniform_contents = c->uniform_contents;
        c->uniform_contents = ralloc_array(c, enum quniform_contents,
                                           c->num_uniforms);
        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
        c->uniform_array_size = c->num_uniforms;
        uint32_t next_uniform = 0;

        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
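        /* Start the "last X tick" trackers far enough in the past that the
         * hazard-distance checks trivially pass at the start of the program.
         */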
        scoreboard.last_ldvary_tick = -10;
        scoreboard.last_unifa_write_tick = -10;
        scoreboard.last_magic_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;
        scoreboard.last_thrsw_tick = -10;
        scoreboard.last_branch_tick = -10;
        scoreboard.last_setmsf_tick = -10;
        scoreboard.last_stallable_sfu_tick = -10;

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                vir_for_each_block(block, c) {
                        fprintf(stderr, "BLOCK %d\n", block->index);
                        list_for_each_entry(struct qinst, qinst,
                                            &block->instructions, link) {
                                v3d_qpu_dump(devinfo, &qinst->qpu);
                                fprintf(stderr, "\n");
                        }
                }
                fprintf(stderr, "\n");
        }

        uint32_t cycles = 0;
        vir_for_each_block(block, c) {
                block->start_qpu_ip = c->qpu_inst_count;
                block->branch_qpu_ip = ~0;
                block->start_uniform = next_uniform;

                cycles += qpu_schedule_instructions_block(c,
                                                          &scoreboard,
                                                          block,
                                                          uniform_contents,
                                                          uniform_data,
                                                          &next_uniform);

                block->end_qpu_ip = c->qpu_inst_count - 1;
        }

        /* Emit the program-end THRSW instruction. */
        struct qinst *thrsw = vir_nop();
        thrsw->qpu.sig.thrsw = true;
        emit_thrsw(c, end_block, &scoreboard, thrsw, true);

        qpu_set_branch_targets(c);

        assert(next_uniform == c->num_uniforms);

        return cycles;
}