CoCalc -- vc4_qpu

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/vc4/vc4_qpu_emit.c
⁴⁵⁷⁰ views
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
23

24
#include <inttypes.h>
25

26
#include "vc4_context.h"
27
#include "vc4_qir.h"
28
#include "vc4_qpu.h"
29
#include "util/ralloc.h"
30

31
static void
32
vc4_dump_program(struct vc4_compile *c)
33
{
34
        fprintf(stderr, "%s prog %d/%d QPU:\n",
35
                qir_get_stage_name(c->stage),
36
                c->program_id, c->variant_id);
37

38
        for (int i = 0; i < c->qpu_inst_count; i++) {
39
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
40
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
41
                fprintf(stderr, "\n");
42
        }
43
        fprintf(stderr, "\n");
44
}
45

46
static void
47
queue(struct qblock *block, uint64_t inst)
48
{
49
        struct queued_qpu_inst *q = rzalloc(block, struct queued_qpu_inst);
50
        q->inst = inst;
51
        list_addtail(&q->link, &block->qpu_inst_list);
52
}
53

54
static uint64_t *
55
last_inst(struct qblock *block)
56
{
57
        struct queued_qpu_inst *q =
58
                (struct queued_qpu_inst *)block->qpu_inst_list.prev;
59
        return &q->inst;
60
}
61

62
/**
 * Rewrites the block's last queued instruction with the result of
 * qpu_set_cond_add() for the given condition code.
 */
static void
set_last_cond_add(struct qblock *block, uint32_t cond)
{
        uint64_t *inst = last_inst(block);

        *inst = qpu_set_cond_add(*inst, cond);
}
67

68
/**
 * Rewrites the block's last queued instruction with the result of
 * qpu_set_cond_mul() for the given condition code.
 */
static void
set_last_cond_mul(struct qblock *block, uint32_t cond)
{
        uint64_t *inst = last_inst(block);

        *inst = qpu_set_cond_mul(*inst, cond);
}
73

74
/**
75
 * Some special registers can be read from either file, which lets us resolve
76
 * raddr conflicts without extra MOVs.
77
 */
78
static bool
79
swap_file(struct qpu_reg *src)
80
{
81
        switch (src->addr) {
82
        case QPU_R_UNIF:
83
        case QPU_R_VARY:
84
                if (src->mux == QPU_MUX_SMALL_IMM) {
85
                        return false;
86
                } else {
87
                        if (src->mux == QPU_MUX_A)
88
                                src->mux = QPU_MUX_B;
89
                        else
90
                                src->mux = QPU_MUX_A;
91
                        return true;
92
                }
93

94
        default:
95
                return false;
96
        }
97
}
98

99
/**
100
 * Sets up the VPM read FIFO before we do any VPM read.
101
 *
102
 * VPM reads (vertex attribute input) and VPM writes (varyings output) from
103
 * the QPU reuse the VRI (varying interpolation) block's FIFOs to talk to the
104
 * VPM block.  In the VS/CS (unlike in the FS), the block starts out
105
 * uninitialized, and you need to emit setup to the block before any VPM
106
 * reads/writes.
107
 *
108
 * VRI has a FIFO in each direction, with each FIFO able to hold four
109
 * 32-bit-per-vertex values.  VPM reads come through the read FIFO and VPM
110
 * writes go through the write FIFO.  The read/write setup values from QPU go
111
 * through the write FIFO as well, with a sideband signal indicating that
112
 * they're setup values.  Once a read setup reaches the other side of the
113
 * FIFO, the VPM block will start asynchronously reading vertex attributes and
114
 * filling the read FIFO -- that way hopefully the QPU doesn't have to block
115
 * on reads later.
116
 *
117
 * VPM read setup can configure 16 32-bit-per-vertex values to be read at a
118
 * time, which is 4 vec4s.  If more than that is being read (since we support
119
 * 8 vec4 vertex attributes), then multiple read setup writes need to be done.
120
 *
121
 * The existence of the FIFO makes it seem like you should be able to emit
122
 * both setups for the 5-8 attribute cases and then do all the attribute
123
 * reads.  However, once the setup value makes it to the other end of the
124
 * write FIFO, it will immediately update the VPM block's setup register.
125
 * That updated setup register would be used for read FIFO fills from then on,
126
 * breaking whatever remaining VPM values were supposed to be read into the
127
 * read FIFO from the previous attribute set.
128
 *
129
 * As a result, we need to emit the read setup, pull every VPM read value from
130
 * that setup, and only then emit the second setup if applicable.
131
 */
132
static void
133
setup_for_vpm_read(struct vc4_compile *c, struct qblock *block)
134
{
135
        if (c->num_inputs_in_fifo) {
136
                c->num_inputs_in_fifo--;
137
                return;
138
        }
139

140
        c->num_inputs_in_fifo = MIN2(c->num_inputs_remaining, 16);
141

142
        queue(block,
143
              qpu_load_imm_ui(qpu_vrsetup(),
144
                              c->vpm_read_offset |
145
                              0x00001a00 |
146
                              ((c->num_inputs_in_fifo & 0xf) << 20)));
147
        c->num_inputs_remaining -= c->num_inputs_in_fifo;
148
        c->vpm_read_offset += c->num_inputs_in_fifo;
149

150
        c->num_inputs_in_fifo--;
151
}
152

153
/**
154
 * This is used to resolve the fact that we might register-allocate two
155
 * different operands of an instruction to the same physical register file
156
 * even though instructions have only one field for the register file source
157
 * address.
158
 *
159
 * In that case, we need to move one to a temporary that can be used in the
160
 * instruction, instead.  We reserve ra14/rb14 for this purpose.
161
 */
162
static void
163
fixup_raddr_conflict(struct qblock *block,
164
                     struct qpu_reg dst,
165
                     struct qpu_reg *src0, struct qpu_reg *src1,
166
                     struct qinst *inst, uint64_t *unpack)
167
{
168
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
169
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;
170

171
        if (mux0 <= QPU_MUX_R5 ||
172
            mux0 != mux1 ||
173
            (src0->addr == src1->addr &&
174
             src0->mux == src1->mux)) {
175
                return;
176
        }
177

178
        if (swap_file(src0) || swap_file(src1))
179
                return;
180

181
        if (mux0 == QPU_MUX_A) {
182
                /* Make sure we use the same type of MOV as the instruction,
183
                 * in case of unpacks.
184
                 */
185
                if (qir_is_float_input(inst))
186
                        queue(block, qpu_a_FMAX(qpu_rb(14), *src0, *src0));
187
                else
188
                        queue(block, qpu_a_MOV(qpu_rb(14), *src0));
189

190
                /* If we had an unpack on this A-file source, we need to put
191
                 * it into this MOV, not into the later move from regfile B.
192
                 */
193
                if (inst->src[0].pack) {
194
                        *last_inst(block) |= *unpack;
195
                        *unpack = 0;
196
                }
197
                *src0 = qpu_rb(14);
198
        } else {
199
                queue(block, qpu_a_MOV(qpu_ra(14), *src0));
200
                *src0 = qpu_ra(14);
201
        }
202
}
203

204
static void
205
set_last_dst_pack(struct qblock *block, struct qinst *inst)
206
{
207
        ASSERTED bool had_pm = *last_inst(block) & QPU_PM;
208
        ASSERTED bool had_ws = *last_inst(block) & QPU_WS;
209
        ASSERTED uint32_t unpack = QPU_GET_FIELD(*last_inst(block), QPU_UNPACK);
210

211
        if (!inst->dst.pack)
212
                return;
213

214
        *last_inst(block) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);
215

216
        if (qir_is_mul(inst)) {
217
                assert(!unpack || had_pm);
218
                *last_inst(block) |= QPU_PM;
219
        } else {
220
                assert(!unpack || !had_pm);
221
                assert(!had_ws); /* dst must be a-file to pack. */
222
        }
223
}
224

225
static void
226
handle_r4_qpu_write(struct qblock *block, struct qinst *qinst,
227
                    struct qpu_reg dst)
228
{
229
        if (dst.mux != QPU_MUX_R4) {
230
                queue(block, qpu_a_MOV(dst, qpu_r4()));
231
                set_last_cond_add(block, qinst->cond);
232
        } else {
233
                assert(qinst->cond == QPU_COND_ALWAYS);
234
                if (qinst->sf)
235
                        queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
236
        }
237
}
238

239
static void
240
vc4_generate_code_block(struct vc4_compile *c,
241
                        struct qblock *block,
242
                        struct qpu_reg *temp_registers)
243
{
244
        int last_vpm_read_index = -1;
245

246
        qir_for_each_inst(qinst, block) {
247
#if 0
248
                fprintf(stderr, "translating qinst to qpu: ");
249
                qir_dump_inst(qinst);
250
                fprintf(stderr, "\n");
251
#endif
252

253
                static const struct {
254
                        uint32_t op;
255
                } translate[] = {
256
#define A(name) [QOP_##name] = {QPU_A_##name}
257
#define M(name) [QOP_##name] = {QPU_M_##name}
258
                        A(FADD),
259
                        A(FSUB),
260
                        A(FMIN),
261
                        A(FMAX),
262
                        A(FMINABS),
263
                        A(FMAXABS),
264
                        A(FTOI),
265
                        A(ITOF),
266
                        A(ADD),
267
                        A(SUB),
268
                        A(SHL),
269
                        A(SHR),
270
                        A(ASR),
271
                        A(MIN),
272
                        A(MAX),
273
                        A(AND),
274
                        A(OR),
275
                        A(XOR),
276
                        A(NOT),
277

278
                        M(FMUL),
279
                        M(V8MULD),
280
                        M(V8MIN),
281
                        M(V8MAX),
282
                        M(V8ADDS),
283
                        M(V8SUBS),
284
                        M(MUL24),
285

286
                        /* If we replicate src[0] out to src[1], this works
287
                         * out the same as a MOV.
288
                         */
289
                        [QOP_MOV] = { QPU_A_OR },
290
                        [QOP_FMOV] = { QPU_A_FMAX },
291
                        [QOP_MMOV] = { QPU_M_V8MIN },
292

293
                        [QOP_MIN_NOIMM] = { QPU_A_MIN },
294
                };
295

296
                uint64_t unpack = 0;
297
                struct qpu_reg src[ARRAY_SIZE(qinst->src)];
298
                for (int i = 0; i < qir_get_nsrc(qinst); i++) {
299
                        int index = qinst->src[i].index;
300
                        switch (qinst->src[i].file) {
301
                        case QFILE_NULL:
302
                        case QFILE_LOAD_IMM:
303
                                src[i] = qpu_rn(0);
304
                                break;
305
                        case QFILE_TEMP:
306
                                src[i] = temp_registers[index];
307
                                if (qinst->src[i].pack) {
308
                                        assert(!unpack ||
309
                                               unpack == qinst->src[i].pack);
310
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
311
                                                               QPU_UNPACK);
312
                                        if (src[i].mux == QPU_MUX_R4)
313
                                                unpack |= QPU_PM;
314
                                }
315
                                break;
316
                        case QFILE_UNIF:
317
                                src[i] = qpu_unif();
318
                                break;
319
                        case QFILE_VARY:
320
                                src[i] = qpu_vary();
321
                                break;
322
                        case QFILE_SMALL_IMM:
323
                                src[i].mux = QPU_MUX_SMALL_IMM;
324
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
325
                                /* This should only have returned a valid
326
                                 * small immediate field, not ~0 for failure.
327
                                 */
328
                                assert(src[i].addr <= 47);
329
                                break;
330
                        case QFILE_VPM:
331
                                setup_for_vpm_read(c, block);
332
                                assert((int)qinst->src[i].index >=
333
                                       last_vpm_read_index);
334
                                (void)last_vpm_read_index;
335
                                last_vpm_read_index = qinst->src[i].index;
336
                                src[i] = qpu_ra(QPU_R_VPM);
337
                                break;
338

339
                        case QFILE_FRAG_X:
340
                                src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD);
341
                                break;
342
                        case QFILE_FRAG_Y:
343
                                src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD);
344
                                break;
345
                        case QFILE_FRAG_REV_FLAG:
346
                                src[i] = qpu_rb(QPU_R_MS_REV_FLAGS);
347
                                break;
348
                        case QFILE_QPU_ELEMENT:
349
                                src[i] = qpu_ra(QPU_R_ELEM_QPU);
350
                                break;
351

352
                        case QFILE_TLB_COLOR_WRITE:
353
                        case QFILE_TLB_COLOR_WRITE_MS:
354
                        case QFILE_TLB_Z_WRITE:
355
                        case QFILE_TLB_STENCIL_SETUP:
356
                        case QFILE_TEX_S:
357
                        case QFILE_TEX_S_DIRECT:
358
                        case QFILE_TEX_T:
359
                        case QFILE_TEX_R:
360
                        case QFILE_TEX_B:
361
                                unreachable("bad qir src file");
362
                        }
363
                }
364

365
                struct qpu_reg dst;
366
                switch (qinst->dst.file) {
367
                case QFILE_NULL:
368
                        dst = qpu_ra(QPU_W_NOP);
369
                        break;
370
                case QFILE_TEMP:
371
                        dst = temp_registers[qinst->dst.index];
372
                        break;
373
                case QFILE_VPM:
374
                        dst = qpu_ra(QPU_W_VPM);
375
                        break;
376

377
                case QFILE_TLB_COLOR_WRITE:
378
                        dst = qpu_tlbc();
379
                        break;
380

381
                case QFILE_TLB_COLOR_WRITE_MS:
382
                        dst = qpu_tlbc_ms();
383
                        break;
384

385
                case QFILE_TLB_Z_WRITE:
386
                        dst = qpu_ra(QPU_W_TLB_Z);
387
                        break;
388

389
                case QFILE_TLB_STENCIL_SETUP:
390
                        dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
391
                        break;
392

393
                case QFILE_TEX_S:
394
                case QFILE_TEX_S_DIRECT:
395
                        dst = qpu_rb(QPU_W_TMU0_S);
396
                        break;
397

398
                case QFILE_TEX_T:
399
                        dst = qpu_rb(QPU_W_TMU0_T);
400
                        break;
401

402
                case QFILE_TEX_R:
403
                        dst = qpu_rb(QPU_W_TMU0_R);
404
                        break;
405

406
                case QFILE_TEX_B:
407
                        dst = qpu_rb(QPU_W_TMU0_B);
408
                        break;
409

410
                case QFILE_VARY:
411
                case QFILE_UNIF:
412
                case QFILE_SMALL_IMM:
413
                case QFILE_LOAD_IMM:
414
                case QFILE_FRAG_X:
415
                case QFILE_FRAG_Y:
416
                case QFILE_FRAG_REV_FLAG:
417
                case QFILE_QPU_ELEMENT:
418
                        assert(!"not reached");
419
                        break;
420
                }
421

422
                ASSERTED bool handled_qinst_cond = false;
423

424
                switch (qinst->op) {
425
                case QOP_RCP:
426
                case QOP_RSQ:
427
                case QOP_EXP2:
428
                case QOP_LOG2:
429
                        switch (qinst->op) {
430
                        case QOP_RCP:
431
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
432
                                                       src[0]) | unpack);
433
                                break;
434
                        case QOP_RSQ:
435
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
436
                                                       src[0]) | unpack);
437
                                break;
438
                        case QOP_EXP2:
439
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
440
                                                       src[0]) | unpack);
441
                                break;
442
                        case QOP_LOG2:
443
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
444
                                                       src[0]) | unpack);
445
                                break;
446
                        default:
447
                                abort();
448
                        }
449

450
                        handle_r4_qpu_write(block, qinst, dst);
451
                        handled_qinst_cond = true;
452

453
                        break;
454

455
                case QOP_LOAD_IMM:
456
                        assert(qinst->src[0].file == QFILE_LOAD_IMM);
457
                        queue(block, qpu_load_imm_ui(dst, qinst->src[0].index));
458
                        break;
459

460
                case QOP_LOAD_IMM_U2:
461
                        queue(block, qpu_load_imm_u2(dst, qinst->src[0].index));
462
                        break;
463

464
                case QOP_LOAD_IMM_I2:
465
                        queue(block, qpu_load_imm_i2(dst, qinst->src[0].index));
466
                        break;
467

468
                case QOP_ROT_MUL:
469
                        /* Rotation at the hardware level occurs on the inputs
470
                         * to the MUL unit, and they must be accumulators in
471
                         * order to have the time necessary to move things.
472
                         */
473
                        assert(src[0].mux <= QPU_MUX_R3);
474

475
                        queue(block,
476
                              qpu_m_rot(dst, src[0], qinst->src[1].index -
477
                                        QPU_SMALL_IMM_MUL_ROT) | unpack);
478
                        set_last_cond_mul(block, qinst->cond);
479
                        handled_qinst_cond = true;
480
                        set_last_dst_pack(block, qinst);
481
                        break;
482

483
                case QOP_MS_MASK:
484
                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
485
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
486
                                             qinst, &unpack);
487
                        queue(block, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
488
                                               src[0], src[1]) | unpack);
489
                        break;
490

491
                case QOP_FRAG_Z:
492
                case QOP_FRAG_W:
493
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
494
                         * the register to the Z/W payload.
495
                         */
496
                        break;
497

498
                case QOP_TLB_COLOR_READ:
499
                        queue(block, qpu_NOP());
500
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
501
                                                        QPU_SIG_COLOR_LOAD);
502
                        handle_r4_qpu_write(block, qinst, dst);
503
                        handled_qinst_cond = true;
504
                        break;
505

506
                case QOP_VARY_ADD_C:
507
                        queue(block, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
508
                        break;
509

510

511
                case QOP_TEX_RESULT:
512
                        queue(block, qpu_NOP());
513
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
514
                                                        QPU_SIG_LOAD_TMU0);
515
                        handle_r4_qpu_write(block, qinst, dst);
516
                        handled_qinst_cond = true;
517
                        break;
518

519
                case QOP_THRSW:
520
                        queue(block, qpu_NOP());
521
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
522
                                                        QPU_SIG_THREAD_SWITCH);
523
                        c->last_thrsw = last_inst(block);
524
                        break;
525

526
                case QOP_BRANCH:
527
                        /* The branch target will be updated at QPU scheduling
528
                         * time.
529
                         */
530
                        queue(block, (qpu_branch(qinst->cond, 0) |
531
                                      QPU_BRANCH_REL));
532
                        handled_qinst_cond = true;
533
                        break;
534

535
                case QOP_UNIFORMS_RESET:
536
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
537
                                             qinst, &unpack);
538

539
                        queue(block, qpu_a_ADD(qpu_ra(QPU_W_UNIFORMS_ADDRESS),
540
                                               src[0], src[1]));
541
                        break;
542

543
                default:
544
                        assert(qinst->op < ARRAY_SIZE(translate));
545
                        assert(translate[qinst->op].op != 0); /* NOPs */
546

547
                        /* Skip emitting the MOV if it's a no-op. */
548
                        if (qir_is_raw_mov(qinst) &&
549
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
550
                                break;
551
                        }
552

553
                        /* If we have only one source, put it in the second
554
                         * argument slot as well so that we don't take up
555
                         * another raddr just to get unused data.
556
                         */
557
                        if (qir_get_non_sideband_nsrc(qinst) == 1)
558
                                src[1] = src[0];
559

560
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
561
                                             qinst, &unpack);
562

563
                        if (qir_is_mul(qinst)) {
564
                                queue(block, qpu_m_alu2(translate[qinst->op].op,
565
                                                        dst,
566
                                                        src[0], src[1]) | unpack);
567
                                set_last_cond_mul(block, qinst->cond);
568
                        } else {
569
                                queue(block, qpu_a_alu2(translate[qinst->op].op,
570
                                                        dst,
571
                                                        src[0], src[1]) | unpack);
572
                                set_last_cond_add(block, qinst->cond);
573
                        }
574
                        handled_qinst_cond = true;
575
                        set_last_dst_pack(block, qinst);
576

577
                        break;
578
                }
579

580
                assert(qinst->cond == QPU_COND_ALWAYS ||
581
                       handled_qinst_cond);
582

583
                if (qinst->sf)
584
                        *last_inst(block) |= QPU_SF;
585
        }
586
}
587

588
void
589
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
590
{
591
        struct qblock *start_block = list_first_entry(&c->blocks,
592
                                                      struct qblock, link);
593

594
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
595
        if (!temp_registers)
596
                return;
597

598
        switch (c->stage) {
599
        case QSTAGE_VERT:
600
        case QSTAGE_COORD:
601
                c->num_inputs_remaining = c->num_inputs;
602
                queue(start_block, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
603
                break;
604
        case QSTAGE_FRAG:
605
                break;
606
        }
607

608
        qir_for_each_block(block, c)
609
                vc4_generate_code_block(c, block, temp_registers);
610

611
        /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW.
612
         *
613
         * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi)
614
         * that ensures that a later thread doesn't try to lock the scoreboard
615
         * and terminate before an earlier-spawned thread on the same QPU, by
616
         * delaying switching back to the later shader until earlier has
617
         * finished.  Otherwise, if the earlier thread was hitting the same
618
         * quad, the scoreboard would deadlock.
619
         */
620
        if (c->last_thrsw) {
621
                assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) ==
622
                       QPU_SIG_THREAD_SWITCH);
623
                *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) |
624
                                  QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH,
625
                                                QPU_SIG));
626
        }
627

628
        uint32_t cycles = qpu_schedule_instructions(c);
629
        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;
630

631
        /* thread end can't have VPM write or read */
632
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
633
                          QPU_WADDR_ADD) == QPU_W_VPM ||
634
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
635
                          QPU_WADDR_MUL) == QPU_W_VPM ||
636
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
637
                          QPU_RADDR_A) == QPU_R_VPM ||
638
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
639
                          QPU_RADDR_B) == QPU_R_VPM) {
640
                qpu_serialize_one_inst(c, qpu_NOP());
641
        }
642

643
        /* thread end can't have uniform read */
644
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
645
                          QPU_RADDR_A) == QPU_R_UNIF ||
646
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
647
                          QPU_RADDR_B) == QPU_R_UNIF) {
648
                qpu_serialize_one_inst(c, qpu_NOP());
649
        }
650

651
        /* thread end can't have TLB operations */
652
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
653
                qpu_serialize_one_inst(c, qpu_NOP());
654

655
        /* Make sure there's no existing signal set (like for a small
656
         * immediate)
657
         */
658
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
659
                          QPU_SIG) != QPU_SIG_NONE) {
660
                qpu_serialize_one_inst(c, qpu_NOP());
661
        }
662

663
        c->qpu_insts[c->qpu_inst_count - 1] =
664
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
665
                            QPU_SIG_PROG_END);
666
        qpu_serialize_one_inst(c, qpu_NOP());
667
        qpu_serialize_one_inst(c, qpu_NOP());
668

669
        switch (c->stage) {
670
        case QSTAGE_VERT:
671
        case QSTAGE_COORD:
672
                break;
673
        case QSTAGE_FRAG:
674
                c->qpu_insts[c->qpu_inst_count - 1] =
675
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
676
                                    QPU_SIG_SCOREBOARD_UNLOCK);
677
                break;
678
        }
679

680
        cycles += c->qpu_inst_count - inst_count_at_schedule_time;
681

682
        if (vc4_debug & VC4_DEBUG_SHADERDB) {
683
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
684
                        qir_get_stage_name(c->stage),
685
                        c->program_id, c->variant_id,
686
                        cycles);
687
        }
688

689
        if (vc4_debug & VC4_DEBUG_QPU)
690
                vc4_dump_program(c);
691

692
        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);
693

694
        free(temp_registers);
695
}
696

697
Product

Resources

Company