GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/vc4/vc4_qpu_validate.c

/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <stdlib.h>

#include "vc4_qpu.h"
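
/* A QPU instruction is 64 bits wide and, for ALU encodings, packs an add-ALU
 * op and a mul-ALU op together with a signal field (QPU_SIG), two regfile
 * read addresses (QPU_RADDR_A/QPU_RADDR_B), per-operand input muxes, and two
 * write addresses (QPU_WADDR_ADD/QPU_WADDR_MUL).  The helpers below pull
 * those fields out with QPU_GET_FIELD() from vc4_qpu_defines.h.
 */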

static void
fail_instr(uint64_t inst, const char *msg)
{
        fprintf(stderr, "vc4_qpu_validate: %s: ", msg);
        vc4_qpu_disasm(&inst, 1);
        fprintf(stderr, "\n");
        abort();
}

static bool
writes_reg(uint64_t inst, uint32_t w)
{
        return (QPU_GET_FIELD(inst, QPU_WADDR_ADD) == w ||
                QPU_GET_FIELD(inst, QPU_WADDR_MUL) == w);
}
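
/* An ALU operand comes from an input mux: the low mux values pick
 * accumulators r0-r5, while QPU_MUX_A and QPU_MUX_B read the locations named
 * by raddr_a and raddr_b in physical regfiles A and B.  Only the mux values
 * matter below; the addr member of src_regs is never filled in.
 */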
static bool
_reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b)
{
        struct {
                uint32_t mux, addr;
        } src_regs[] = {
                { QPU_GET_FIELD(inst, QPU_ADD_A) },
                { QPU_GET_FIELD(inst, QPU_ADD_B) },
                { QPU_GET_FIELD(inst, QPU_MUL_A) },
                { QPU_GET_FIELD(inst, QPU_MUL_B) },
        };

        /* Branches only reference raddr_a (no mux), and we don't use that
         * feature of branching.
         */
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH)
                return false;

        /* Load immediates don't read any registers. */
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
                return false;

        for (int i = 0; i < ARRAY_SIZE(src_regs); i++) {
                if (!ignore_a &&
                    src_regs[i].mux == QPU_MUX_A &&
                    (QPU_GET_FIELD(inst, QPU_RADDR_A) == r))
                        return true;

                if (!ignore_b &&
                    QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM &&
                    src_regs[i].mux == QPU_MUX_B &&
                    (QPU_GET_FIELD(inst, QPU_RADDR_B) == r))
                        return true;
        }

        return false;
}

static bool
reads_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, false);
}

static bool
reads_a_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, true);
}

static bool
reads_b_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, true, false);
}

static bool
writes_sfu(uint64_t inst)
{
        return (writes_reg(inst, QPU_W_SFU_RECIP) ||
                writes_reg(inst, QPU_W_SFU_RECIPSQRT) ||
                writes_reg(inst, QPU_W_SFU_EXP) ||
                writes_reg(inst, QPU_W_SFU_LOG));
}
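
/* An SFU lookup is started by writing one of the four SFU registers above;
 * the result lands in accumulator r4 two instructions later, which is why
 * writes_sfu() feeds the r4-hazard check in vc4_qpu_validate().
 */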

/**
 * Checks for the instruction restrictions from page 37 ("Summary of
 * Instruction Restrictions").
 */
void
vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
{
        bool scoreboard_locked = false;
        bool threaded = false;

        /* We don't want to do validation in release builds, but we want to
         * keep compiling the validation code to make sure it doesn't get
         * broken.
         */
#ifndef DEBUG
        return;
#endif

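        /* Each restriction below gets its own linear scan over the
         * instruction stream; any violation calls fail_instr(), which
         * disassembles the offending instruction and aborts.
         */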
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                if (sig != QPU_SIG_PROG_END) {
                        if (qpu_inst_is_tlb(inst))
                                scoreboard_locked = true;

                        if (sig == QPU_SIG_THREAD_SWITCH ||
                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
                                threaded = true;
                        }

                        continue;
                }

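                /* Write addresses below 32 name locations in physical
                 * regfile A or B; 32 and up are accumulators and
                 * special-purpose registers (the QPU_W_* values in
                 * vc4_qpu_defines.h).
                 */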
/* "The Thread End instruction must not write to either physical
145
* regfile A or B."
146
*/
147
if (QPU_GET_FIELD(inst, QPU_WADDR_ADD) < 32 ||
148
QPU_GET_FIELD(inst, QPU_WADDR_MUL) < 32) {
149
fail_instr(inst, "write to phys reg in thread end");
150
}
151
152
/* Can't trigger an implicit wait on scoreboard in the program
153
* end instruction.
154
*/
155
if (qpu_inst_is_tlb(inst) && !scoreboard_locked)
156
fail_instr(inst, "implicit sb wait in program end");
157
158
                /* Two delay slots will be executed after the Thread End, so
                 * insts[i + 1] and insts[i + 2] must both exist.
                 */
                assert(i + 2 < num_inst);

                for (int j = i; j <= i + 2; j++) {
                        /* "The last three instructions of any program
                         * (Thread End plus the following two delay-slot
                         * instructions) must not do varyings read, uniforms
                         * read or any kind of VPM, VDR, or VDW read or
                         * write."
                         */
                        if (writes_reg(insts[j], QPU_W_VPM) ||
                            reads_reg(insts[j], QPU_R_VARY) ||
                            reads_reg(insts[j], QPU_R_UNIF) ||
                            reads_reg(insts[j], QPU_R_VPM)) {
                                fail_instr(insts[j], "last 3 instructions "
                                           "using fixed functions");
                        }

                        /* "The Thread End instruction and the following two
                         * delay slot instructions must not write or read
                         * address 14 in either regfile A or B."
                         */
                        if (writes_reg(insts[j], 14) ||
                            reads_reg(insts[j], 14)) {
                                fail_instr(insts[j], "last 3 instructions "
                                           "must not use r14");
                        }
                }

                /* "The final program instruction (the second delay slot
                 * instruction) must not do a TLB Z write."
                 */
                if (writes_reg(insts[i + 2], QPU_W_TLB_Z)) {
                        fail_instr(insts[i + 2], "final instruction doing "
                                   "Z write");
                }
        }

/* "A scoreboard wait must not occur in the first two instructions of
197
* a fragment shader. This is either the explicit Wait for Scoreboard
198
* signal or an implicit wait with the first tile-buffer read or
199
* write instruction."
200
*/
201
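        /* qpu_inst_is_tlb() (in vc4_qpu.c) flags any instruction that reads
         * or writes the tile buffer, i.e. exactly the instructions that can
         * trigger the implicit scoreboard wait.
         */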
        for (int i = 0; i < 2; i++) {
                uint64_t inst = insts[i];

                if (qpu_inst_is_tlb(inst))
                        fail_instr(inst, "sb wait in first two insts");
        }

/* "If TMU_NOSWAP is written, the write must be three instructions
209
* before the first TMU write instruction. For example, if
210
* TMU_NOSWAP is written in the first shader instruction, the first
211
* TMU write cannot occur before the 4th shader instruction."
212
*/
213
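        /* last_tmu_noswap starts far enough in the past that the distance
         * check can't fire before the first TMU_NOSWAP write is seen.  With
         * a TMU_NOSWAP at i == 0, TMU writes at i == 1..3 fail the check and
         * i == 4 is the first one allowed, matching the spec's example.
         */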
        int last_tmu_noswap = -10;
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];

                if ((i - last_tmu_noswap) <= 3 &&
                    (writes_reg(inst, QPU_W_TMU0_S) ||
                     writes_reg(inst, QPU_W_TMU1_S))) {
                        fail_instr(inst, "TMU write too soon after TMU_NOSWAP");
                }

                if (writes_reg(inst, QPU_W_TMU_NOSWAP))
                        last_tmu_noswap = i;
        }

/* "An instruction must not read from a location in physical regfile A
228
* or B that was written to by the previous instruction."
229
*/
230
for (int i = 0; i < num_inst - 1; i++) {
231
uint64_t inst = insts[i];
232
uint32_t add_waddr = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
233
uint32_t mul_waddr = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
234
uint32_t waddr_a, waddr_b;
235
236
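                /* The WS ("write swap") bit flips which regfile each ALU's
                 * waddr names: with WS clear the add ALU writes regfile A
                 * and the mul ALU writes regfile B; with WS set it's the
                 * other way around.
                 */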
                if (inst & QPU_WS) {
                        waddr_b = add_waddr;
                        waddr_a = mul_waddr;
                } else {
                        waddr_a = add_waddr;
                        waddr_b = mul_waddr;
                }

                if ((waddr_a < 32 && reads_a_reg(insts[i + 1], waddr_a)) ||
                    (waddr_b < 32 && reads_b_reg(insts[i + 1], waddr_b))) {
                        fail_instr(insts[i + 1],
                                   "Reads physical reg too soon after write");
                }
        }

/* "After an SFU lookup instruction, accumulator r4 must not be read
252
* in the following two instructions. Any other instruction that
253
* results in r4 being written (that is, TMU read, TLB read, SFU
254
* lookup) cannot occur in the two instructions following an SFU
255
* lookup."
256
*/
257
int last_sfu_inst = -10;
258
for (int i = 0; i < num_inst - 1; i++) {
259
uint64_t inst = insts[i];
260
uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
261
262
if (i - last_sfu_inst <= 2 &&
263
(writes_sfu(inst) ||
264
sig == QPU_SIG_LOAD_TMU0 ||
265
sig == QPU_SIG_LOAD_TMU1 ||
266
sig == QPU_SIG_COLOR_LOAD)) {
267
fail_instr(inst, "R4 write too soon after SFU write");
268
}
269
270
if (writes_sfu(inst))
271
last_sfu_inst = i;
272
}
273
274
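        /* The QPU_SIG_SMALL_IMM signal replaces the raddr_b field with a
         * small immediate.  Encodings at or above QPU_SMALL_IMM_MUL_ROT
         * request a mul-ALU vector rotate instead of a plain immediate:
         * QPU_SMALL_IMM_MUL_ROT itself rotates by r5, and the following
         * fifteen values rotate by 1-15 elements.
         */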
        /* Start at i = 1: the rotate checks below compare against
         * insts[i - 1], and instruction 0 has no predecessor.
         */
        for (int i = 1; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM &&
                    QPU_GET_FIELD(inst, QPU_SMALL_IMM) >=
                    QPU_SMALL_IMM_MUL_ROT) {
                        uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
                        uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);

                        /* "The full horizontal vector rotate is only
                         * available when both of the mul ALU input arguments
                         * are taken from accumulators r0-r3."
                         */
                        if (mux_a > QPU_MUX_R3 || mux_b > QPU_MUX_R3) {
                                fail_instr(inst,
                                           "MUL rotate using non-accumulator "
                                           "input");
                        }

                        if (QPU_GET_FIELD(inst, QPU_SMALL_IMM) ==
                            QPU_SMALL_IMM_MUL_ROT) {
                                /* "An instruction that does a vector rotate
                                 * by r5 must not immediately follow an
                                 * instruction that writes to r5."
                                 */
                                if (writes_reg(insts[i - 1], QPU_W_ACC5)) {
                                        fail_instr(inst,
                                                   "vector rotate by r5 "
                                                   "immediately after r5 write");
                                }
                        }

                        /* "An instruction that does a vector rotate must not
                         * immediately follow an instruction that writes to the
                         * accumulator that is being rotated."
                         */
                        if (writes_reg(insts[i - 1], QPU_W_ACC0 + mux_a) ||
                            writes_reg(insts[i - 1], QPU_W_ACC0 + mux_b)) {
                                fail_instr(inst,
                                           "vector rotate of value "
                                           "written in previous instruction");
                        }
                }
        }

/* "After an instruction that does a TLB Z write, the multisample mask
327
* must not be read as an instruction input argument in the following
328
* two instruction. The TLB Z write instruction can, however, be
329
* followed immediately by a TLB color write."
330
*/
331
for (int i = 0; i < num_inst - 1; i++) {
332
uint64_t inst = insts[i];
333
if (writes_reg(inst, QPU_W_TLB_Z) &&
334
(reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS) ||
335
reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS))) {
336
fail_instr(inst, "TLB Z write followed by MS mask read");
337
}
338
}

        /*
         * "A single instruction can only perform a maximum of one of the
         * following closely coupled peripheral accesses in a single
         * instruction: TMU write, TMU read, TLB write, TLB read, TLB
         * combined color read and write, SFU write, Mutex read or Semaphore
         * access."
         */
        for (int i = 0; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];

                if (qpu_num_sf_accesses(inst) > 1)
                        fail_instr(inst, "Single instruction does multiple "
                                   "peripheral accesses");
        }

        /* "The uniform base pointer can be written (from SIMD element 0) by
         * the processor to reset the stream. There must be at least two
         * nonuniform-accessing instructions following a pointer change
         * before uniforms can be accessed once more."
         */
        int last_unif_pointer_update = -3;
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];
                uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
                uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

                if (reads_reg(inst, QPU_R_UNIF) &&
                    i - last_unif_pointer_update <= 2) {
                        fail_instr(inst,
                                   "uniform read too soon after pointer update");
                }

                if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
                    waddr_mul == QPU_W_UNIFORMS_ADDRESS)
                        last_unif_pointer_update = i;
        }

        if (threaded) {
                bool last_thrsw_found = false;
                bool scoreboard_locked = false;
                int tex_samples_outstanding = 0;
                int last_tex_samples_outstanding = 0;
                int thrsw_ip = -1;

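                /* thrsw_ip models when a pending thread switch takes effect:
                 * it's set to i + 3 when a THRSW signal is seen, and once
                 * execution reaches that instruction the texture-FIFO
                 * accounting rolls over to the other thread.
                 */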
                for (int i = 0; i < num_inst; i++) {
                        uint64_t inst = insts[i];
                        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                        if (i == thrsw_ip) {
                                /* In order to get texture results back in the
                                 * correct order, before a new thrsw we have
                                 * to read all the texture results from before
                                 * the previous thrsw.
                                 *
                                 * FIXME: Is collecting the remaining results
                                 * during the delay slots OK, or should we do
                                 * this at THRSW signal time?
                                 */
                                if (last_tex_samples_outstanding != 0) {
                                        fail_instr(inst, "THRSW with texture "
                                                   "results from the previous "
                                                   "THRSW still in the FIFO.");
                                }

                                last_tex_samples_outstanding =
                                        tex_samples_outstanding;
                                tex_samples_outstanding = 0;
                        }

                        if (qpu_inst_is_tlb(inst))
                                scoreboard_locked = true;

                        switch (sig) {
                        case QPU_SIG_THREAD_SWITCH:
                        case QPU_SIG_LAST_THREAD_SWITCH:
                                /* No thread switching with the scoreboard
                                 * locked.  Doing so means we may deadlock
                                 * when the other thread tries to lock the
                                 * scoreboard.
                                 */
                                if (scoreboard_locked) {
                                        fail_instr(inst, "THRSW with the "
                                                   "scoreboard locked.");
                                }

                                /* No thread switching after lthrsw, since
                                 * lthrsw means that we get delayed until the
                                 * other shader is ready for us to terminate.
                                 */
                                if (last_thrsw_found) {
                                        fail_instr(inst, "THRSW after a "
                                                   "previous LTHRSW");
                                }

                                if (sig == QPU_SIG_LAST_THREAD_SWITCH)
                                        last_thrsw_found = true;

                                /* No THRSW while we already have a THRSW
                                 * queued.
                                 */
                                if (i < thrsw_ip) {
                                        fail_instr(inst,
                                                   "THRSW with a THRSW queued.");
                                }

                                thrsw_ip = i + 3;
                                break;

                        case QPU_SIG_LOAD_TMU0:
                        case QPU_SIG_LOAD_TMU1:
                                if (last_tex_samples_outstanding == 0) {
                                        fail_instr(inst, "TMU load with nothing "
                                                   "in the results fifo from "
                                                   "the previous THRSW.");
                                }

                                last_tex_samples_outstanding--;
                                break;
                        }

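                        /* Writing a texture unit's S register is what kicks
                         * off a lookup, so each such write adds one sample
                         * to this thread's outstanding-results count.
                         */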
                        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
                        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
                        if (waddr_add == QPU_W_TMU0_S ||
                            waddr_add == QPU_W_TMU1_S ||
                            waddr_mul == QPU_W_TMU0_S ||
                            waddr_mul == QPU_W_TMU1_S) {
                                tex_samples_outstanding++;
                        }
                }
        }
}