GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"
static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
        fprintf(stderr, "\n");
}

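/*
 * Appends one QPU instruction to the block's pending list; the list is
 * serialized and scheduled later, by qpu_schedule_instructions().
 */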
static void
queue(struct qblock *block, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(block, struct queued_qpu_inst);
        q->inst = inst;
        list_addtail(&q->link, &block->qpu_inst_list);
}

static uint64_t *
last_inst(struct qblock *block)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)block->qpu_inst_list.prev;
        return &q->inst;
}

static void
set_last_cond_add(struct qblock *block, uint32_t cond)
{
        *last_inst(block) = qpu_set_cond_add(*last_inst(block), cond);
}

static void
set_last_cond_mul(struct qblock *block, uint32_t cond)
{
        *last_inst(block) = qpu_set_cond_mul(*last_inst(block), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
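/*
 * For example, uniform and varying reads decode to the same raddr in
 * register file A and register file B, so an operand using one of them can
 * be flipped between QPU_MUX_A and QPU_MUX_B without changing its value.
 */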
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * Sets up the VPM read FIFO before we do any VPM read.
 *
 * VPM reads (vertex attribute input) and VPM writes (varyings output) from
 * the QPU reuse the VRI (varying interpolation) block's FIFOs to talk to the
 * VPM block. In the VS/CS (unlike in the FS), the block starts out
 * uninitialized, and you need to emit setup to the block before any VPM
 * reads/writes.
 *
 * VRI has a FIFO in each direction, with each FIFO able to hold four
 * 32-bit-per-vertex values. VPM reads come through the read FIFO and VPM
 * writes go through the write FIFO. The read/write setup values from QPU go
 * through the write FIFO as well, with a sideband signal indicating that
 * they're setup values. Once a read setup reaches the other side of the
 * FIFO, the VPM block will start asynchronously reading vertex attributes
 * and filling the read FIFO -- that way hopefully the QPU doesn't have to
 * block on reads later.
 *
 * VPM read setup can configure 16 32-bit-per-vertex values to be read at a
 * time, which is 4 vec4s. If more than that is being read (since we support
 * 8 vec4 vertex attributes), then multiple read setup writes need to be
 * done.
 *
 * The existence of the FIFO makes it seem like you should be able to emit
 * both setups for the 5-8 attribute cases and then do all the attribute
 * reads. However, once the setup value makes it to the other end of the
 * write FIFO, it will immediately update the VPM block's setup register.
 * That updated setup register would be used for read FIFO fills from then
 * on, breaking whatever remaining VPM values were supposed to be read into
 * the read FIFO from the previous attribute set.
 *
 * As a result, we need to emit the read setup, pull every VPM read value
 * from that setup, and only then emit the second setup if applicable.
 */
static void
setup_for_vpm_read(struct vc4_compile *c, struct qblock *block)
{
        if (c->num_inputs_in_fifo) {
                c->num_inputs_in_fifo--;
                return;
        }

        c->num_inputs_in_fifo = MIN2(c->num_inputs_remaining, 16);
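
        /*
         * A sketch of the read setup word queued below, assuming the field
         * layout from the VideoCore IV 3D architecture reference (the
         * driver encodes it directly as a literal):
         *
         *   bits 23:20  NUM     number of 32-bit vectors to read (0 == 16)
         *   bits 17:12  STRIDE  0x00001000 == stride of one vector
         *   bit  11     HORIZ   0x00000800 == horizontal layout
         *   bits  9:8   SIZE    0x00000200 == 32-bit elements
         *   bits  7:0   ADDR    starting VPM address (c->vpm_read_offset)
         *
         * so 0x00001a00 == STRIDE=1 | HORIZ | SIZE=32-bit.
         */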
        queue(block,
              qpu_load_imm_ui(qpu_vrsetup(),
                              c->vpm_read_offset |
                              0x00001a00 |
                              ((c->num_inputs_in_fifo & 0xf) << 20)));
        c->num_inputs_remaining -= c->num_inputs_in_fifo;
        c->vpm_read_offset += c->num_inputs_in_fifo;

        c->num_inputs_in_fifo--;
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead. We reserve ra14/rb14 for this purpose.
 */
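/*
 * For example (illustrative operands): "FADD dst, ra1, ra2" can't be
 * encoded, since raddr_a holds only one address, so the fixup emits
 * "MOV rb14, ra1" and rewrites the operand, giving "FADD dst, rb14, ra2".
 */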
static void
fixup_raddr_conflict(struct qblock *block,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     struct qinst *inst, uint64_t *unpack)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                /* Make sure we use the same type of MOV as the instruction,
                 * in case of unpacks.
                 */
                if (qir_is_float_input(inst))
                        queue(block, qpu_a_FMAX(qpu_rb(14), *src0, *src0));
                else
                        queue(block, qpu_a_MOV(qpu_rb(14), *src0));

                /* If we had an unpack on this A-file source, we need to put
                 * it into this MOV, not into the later move from regfile B.
                 */
                if (inst->src[0].pack) {
                        *last_inst(block) |= *unpack;
                        *unpack = 0;
                }
                *src0 = qpu_rb(14);
        } else {
                queue(block, qpu_a_MOV(qpu_ra(14), *src0));
                *src0 = qpu_ra(14);
        }
}

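/*
 * Applies the QIR instruction's destination pack mode to the last queued
 * QPU instruction, setting PM for MUL-unit packs and asserting that the
 * pack doesn't conflict with any unpack already encoded in it.
 */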
static void
set_last_dst_pack(struct qblock *block, struct qinst *inst)
{
        ASSERTED bool had_pm = *last_inst(block) & QPU_PM;
        ASSERTED bool had_ws = *last_inst(block) & QPU_WS;
        ASSERTED uint32_t unpack = QPU_GET_FIELD(*last_inst(block), QPU_UNPACK);

        if (!inst->dst.pack)
                return;

        *last_inst(block) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);

        if (qir_is_mul(inst)) {
                assert(!unpack || had_pm);
                *last_inst(block) |= QPU_PM;
        } else {
                assert(!unpack || !had_pm);
                assert(!had_ws); /* dst must be a-file to pack. */
        }
}

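/*
 * Results delivered through the r4 accumulator (SFU ops, TMU loads, TLB
 * color loads) get moved into the allocated destination here, applying the
 * instruction's condition; if r4 itself was the destination, only a
 * flag-setting MOV to the NOP register is needed (and only when SF is set).
 */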
static void
handle_r4_qpu_write(struct qblock *block, struct qinst *qinst,
                    struct qpu_reg dst)
{
        if (dst.mux != QPU_MUX_R4) {
                queue(block, qpu_a_MOV(dst, qpu_r4()));
                set_last_cond_add(block, qinst->cond);
        } else {
                assert(qinst->cond == QPU_COND_ALWAYS);
                if (qinst->sf)
                        queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
        }
}

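/*
 * Translates one block's QIR instructions into unscheduled QPU
 * instructions, mapping QIR register files onto physical registers and
 * special-casing the opcodes that don't fit the ALU translate[] table.
 */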
static void
vc4_generate_code_block(struct vc4_compile *c,
                        struct qblock *block,
                        struct qpu_reg *temp_registers)
{
        int last_vpm_read_index = -1;

        qir_for_each_inst(qinst, block) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },

                        [QOP_MIN_NOIMM] = { QPU_A_MIN },
                };

                uint64_t unpack = 0;
                struct qpu_reg src[ARRAY_SIZE(qinst->src)];
                for (int i = 0; i < qir_get_nsrc(qinst); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                        case QFILE_LOAD_IMM:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                setup_for_vpm_read(c, block);
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;

                        case QFILE_FRAG_X:
                                src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_Y:
                                src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_REV_FLAG:
                                src[i] = qpu_rb(QPU_R_MS_REV_FLAGS);
                                break;
                        case QFILE_QPU_ELEMENT:
                                src[i] = qpu_ra(QPU_R_ELEM_QPU);
                                break;

                        case QFILE_TLB_COLOR_WRITE:
                        case QFILE_TLB_COLOR_WRITE_MS:
                        case QFILE_TLB_Z_WRITE:
                        case QFILE_TLB_STENCIL_SETUP:
                        case QFILE_TEX_S:
                        case QFILE_TEX_S_DIRECT:
                        case QFILE_TEX_T:
                        case QFILE_TEX_R:
                        case QFILE_TEX_B:
                                unreachable("bad qir src file");
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;

                case QFILE_TLB_COLOR_WRITE:
                        dst = qpu_tlbc();
                        break;

                case QFILE_TLB_COLOR_WRITE_MS:
                        dst = qpu_tlbc_ms();
                        break;

                case QFILE_TLB_Z_WRITE:
                        dst = qpu_ra(QPU_W_TLB_Z);
                        break;

                case QFILE_TLB_STENCIL_SETUP:
                        dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
                        break;

                case QFILE_TEX_S:
                case QFILE_TEX_S_DIRECT:
                        dst = qpu_rb(QPU_W_TMU0_S);
                        break;

                case QFILE_TEX_T:
                        dst = qpu_rb(QPU_W_TMU0_T);
                        break;

                case QFILE_TEX_R:
                        dst = qpu_rb(QPU_W_TMU0_R);
                        break;

                case QFILE_TEX_B:
                        dst = qpu_rb(QPU_W_TMU0_B);
                        break;

                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                case QFILE_LOAD_IMM:
                case QFILE_FRAG_X:
                case QFILE_FRAG_Y:
                case QFILE_FRAG_REV_FLAG:
                case QFILE_QPU_ELEMENT:
                        assert(!"not reached");
                        break;
                }

                ASSERTED bool handled_qinst_cond = false;

                switch (qinst->op) {
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                       src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                       src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                       src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                       src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        handle_r4_qpu_write(block, qinst, dst);
                        handled_qinst_cond = true;

                        break;

                case QOP_LOAD_IMM:
                        assert(qinst->src[0].file == QFILE_LOAD_IMM);
                        queue(block, qpu_load_imm_ui(dst, qinst->src[0].index));
                        break;

                case QOP_LOAD_IMM_U2:
                        queue(block, qpu_load_imm_u2(dst, qinst->src[0].index));
                        break;

                case QOP_LOAD_IMM_I2:
                        queue(block, qpu_load_imm_i2(dst, qinst->src[0].index));
                        break;

                case QOP_ROT_MUL:
                        /* Rotation at the hardware level occurs on the inputs
                         * to the MUL unit, and they must be accumulators in
                         * order to have the time necessary to move things.
                         */
                        assert(src[0].mux <= QPU_MUX_R3);

                        queue(block,
                              qpu_m_rot(dst, src[0], qinst->src[1].index -
                                        QPU_SMALL_IMM_MUL_ROT) | unpack);
                        set_last_cond_mul(block, qinst->cond);
                        handled_qinst_cond = true;
                        set_last_dst_pack(block, qinst);
                        break;

                case QOP_MS_MASK:
                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(block, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
                                               src[0], src[1]) | unpack);
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_COLOR_LOAD);
                        handle_r4_qpu_write(block, qinst, dst);
                        handled_qinst_cond = true;
                        break;

                case QOP_VARY_ADD_C:
                        queue(block, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;

                case QOP_TEX_RESULT:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_LOAD_TMU0);
                        handle_r4_qpu_write(block, qinst, dst);
                        handled_qinst_cond = true;
                        break;

                case QOP_THRSW:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_THREAD_SWITCH);
                        c->last_thrsw = last_inst(block);
                        break;

                case QOP_BRANCH:
                        /* The branch target will be updated at QPU scheduling
                         * time.
                         */
                        queue(block, (qpu_branch(qinst->cond, 0) |
                                      QPU_BRANCH_REL));
                        handled_qinst_cond = true;
                        break;

                case QOP_UNIFORMS_RESET:
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        queue(block, qpu_a_ADD(qpu_ra(QPU_W_UNIFORMS_ADDRESS),
                                               src[0], src[1]));
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_non_sideband_nsrc(qinst) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(block, qpu_m_alu2(translate[qinst->op].op,
                                                        dst,
                                                        src[0], src[1]) | unpack);
                                set_last_cond_mul(block, qinst->cond);
                        } else {
                                queue(block, qpu_a_alu2(translate[qinst->op].op,
                                                        dst,
                                                        src[0], src[1]) | unpack);
                                set_last_cond_add(block, qinst->cond);
                        }
                        handled_qinst_cond = true;
                        set_last_dst_pack(block, qinst);

                        break;
                }

                assert(qinst->cond == QPU_COND_ALWAYS ||
                       handled_qinst_cond);

                if (qinst->sf)
                        *last_inst(block) |= QPU_SF;
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qblock *start_block = list_first_entry(&c->blocks,
                                                      struct qblock, link);

        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        if (!temp_registers)
                return;

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                c->num_inputs_remaining = c->num_inputs;
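                /* VPM write setup word: per the VideoCore IV reference this
                 * is the same STRIDE=1 | HORIZ | 32-bit encoding used for
                 * the read setup in setup_for_vpm_read(), starting at VPM
                 * address 0.
                 */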
                queue(start_block, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        qir_for_each_block(block, c)
                vc4_generate_code_block(c, block, temp_registers);

        /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW.
         *
         * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi)
         * that ensures that a later thread doesn't try to lock the scoreboard
         * and terminate before an earlier-spawned thread on the same QPU, by
         * delaying switching back to the later shader until the earlier one
         * has finished. Otherwise, if the earlier thread was hitting the
         * same quad, the scoreboard would deadlock.
         */
        if (c->last_thrsw) {
                assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) ==
                       QPU_SIG_THREAD_SWITCH);
                *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) |
                                  QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH,
                                                QPU_SIG));
        }

        uint32_t cycles = qpu_schedule_instructions(c);
        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;

        /* The thread end can't have a VPM write or read. */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* The thread end can't have a uniform read. */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* The thread end can't have TLB operations. */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        /* Make sure there's no existing signal set (like for a small
         * immediate).
         */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_SIG) != QPU_SIG_NONE) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

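        /* Per the VideoCore IV signal timing, PROG_END has two delay slots
         * before the QPU actually halts, so the two NOPs queued below give
         * it instructions to retire into.
         */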
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        cycles += c->qpu_inst_count - inst_count_at_schedule_time;

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        cycles);
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}