CoCalc -- vc4_qpu

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/vc4/vc4_qpu_validate.c
4570 views
1

2
/*
3
 * Copyright © 2014 Broadcom
4
 *
5
 * Permission is hereby granted, free of charge, to any person obtaining a
6
 * copy of this software and associated documentation files (the "Software"),
7
 * to deal in the Software without restriction, including without limitation
8
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
 * and/or sell copies of the Software, and to permit persons to whom the
10
 * Software is furnished to do so, subject to the following conditions:
11
 *
12
 * The above copyright notice and this permission notice (including the next
13
 * paragraph) shall be included in all copies or substantial portions of the
14
 * Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22
 * IN THE SOFTWARE.
23
 */
24

25
#include <stdlib.h>
26

27
#include "vc4_qpu.h"
28

29
/**
 * Reports a validation failure and terminates the process.
 *
 * Writes the failure reason to stderr, hands the offending instruction to
 * vc4_qpu_disasm(), terminates the output line and then calls abort().
 */
static void
fail_instr(uint64_t inst, const char *msg)
{
        fprintf(stderr, "vc4_qpu_validate: %s: ", msg);
        vc4_qpu_disasm(&inst, 1);
        fputs("\n", stderr);

        abort();
}
37

38
static bool
39
writes_reg(uint64_t inst, uint32_t w)
40
{
41
        return (QPU_GET_FIELD(inst, QPU_WADDR_ADD) == w ||
42
                QPU_GET_FIELD(inst, QPU_WADDR_MUL) == w);
43
}
44

45
static bool
46
_reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b)
47
{
48
        struct {
49
                uint32_t mux, addr;
50
        } src_regs[] = {
51
                { QPU_GET_FIELD(inst, QPU_ADD_A) },
52
                { QPU_GET_FIELD(inst, QPU_ADD_B) },
53
                { QPU_GET_FIELD(inst, QPU_MUL_A) },
54
                { QPU_GET_FIELD(inst, QPU_MUL_B) },
55
        };
56

57
        /* Branches only reference raddr_a (no mux), and we don't use that
58
         * feature of branching.
59
         */
60
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH)
61
                return false;
62

63
        /* Load immediates don't read any registers. */
64
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
65
                return false;
66

67
        for (int i = 0; i < ARRAY_SIZE(src_regs); i++) {
68
                if (!ignore_a &&
69
                    src_regs[i].mux == QPU_MUX_A &&
70
                    (QPU_GET_FIELD(inst, QPU_RADDR_A) == r))
71
                        return true;
72

73
                if (!ignore_b &&
74
                    QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM &&
75
                    src_regs[i].mux == QPU_MUX_B &&
76
                    (QPU_GET_FIELD(inst, QPU_RADDR_B) == r))
77
                        return true;
78
        }
79

80
        return false;
81
}
82

83
/** Returns true when the instruction reads address r from regfile A or B. */
static bool
reads_reg(uint64_t inst, uint32_t r)
{
        const bool skip_regfile_a = false;
        const bool skip_regfile_b = false;

        return _reads_reg(inst, r, skip_regfile_a, skip_regfile_b);
}
88

89
/** Returns true when the instruction reads address r from regfile A. */
static bool
reads_a_reg(uint64_t inst, uint32_t r)
{
        const bool skip_regfile_a = false;
        const bool skip_regfile_b = true;

        return _reads_reg(inst, r, skip_regfile_a, skip_regfile_b);
}
94

95
/** Returns true when the instruction reads address r from regfile B. */
static bool
reads_b_reg(uint64_t inst, uint32_t r)
{
        const bool skip_regfile_a = true;
        const bool skip_regfile_b = false;

        return _reads_reg(inst, r, skip_regfile_a, skip_regfile_b);
}
100

101
static bool
102
writes_sfu(uint64_t inst)
103
{
104
        return (writes_reg(inst, QPU_W_SFU_RECIP) ||
105
                writes_reg(inst, QPU_W_SFU_RECIPSQRT) ||
106
                writes_reg(inst, QPU_W_SFU_EXP) ||
107
                writes_reg(inst, QPU_W_SFU_LOG));
108
}
109

110
/**
111
 * Checks for the instruction restrictions from page 37 ("Summary of
112
 * Instruction Restrictions").
113
 */
114
void
115
vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
116
{
117
        bool scoreboard_locked = false;
118
        bool threaded = false;
119

120
        /* We don't want to do validation in release builds, but we want to
121
         * keep compiling the validation code to make sure it doesn't get
122
         * broken.
123
         */
124
#ifndef DEBUG
125
        return;
126
#endif
127

128
        for (int i = 0; i < num_inst; i++) {
129
                uint64_t inst = insts[i];
130
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
131

132
                if (sig != QPU_SIG_PROG_END) {
133
                        if (qpu_inst_is_tlb(inst))
134
                                scoreboard_locked = true;
135

136
                        if (sig == QPU_SIG_THREAD_SWITCH ||
137
                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
138
                                threaded = true;
139
                        }
140

141
                        continue;
142
                }
143

144
                /* "The Thread End instruction must not write to either physical
145
                 *  regfile A or B."
146
                 */
147
                if (QPU_GET_FIELD(inst, QPU_WADDR_ADD) < 32 ||
148
                    QPU_GET_FIELD(inst, QPU_WADDR_MUL) < 32) {
149
                        fail_instr(inst, "write to phys reg in thread end");
150
                }
151

152
                /* Can't trigger an implicit wait on scoreboard in the program
153
                 * end instruction.
154
                 */
155
                if (qpu_inst_is_tlb(inst) && !scoreboard_locked)
156
                        fail_instr(inst, "implicit sb wait in program end");
157

158
                /* Two delay slots will be executed. */
159
                assert(i + 2 <= num_inst);
160

161
                 for (int j = i; j < i + 2; j++) {
162
                         /* "The last three instructions of any program
163
                          *  (Thread End plus the following two delay-slot
164
                          *  instructions) must not do varyings read, uniforms
165
                          *  read or any kind of VPM, VDR, or VDW read or
166
                          *  write."
167
                          */
168
                         if (writes_reg(insts[j], QPU_W_VPM) ||
169
                             reads_reg(insts[j], QPU_R_VARY) ||
170
                             reads_reg(insts[j], QPU_R_UNIF) ||
171
                             reads_reg(insts[j], QPU_R_VPM)) {
172
                                 fail_instr(insts[j], "last 3 instructions "
173
                                            "using fixed functions");
174
                         }
175

176
                         /* "The Thread End instruction and the following two
177
                          *  delay slot instructions must not write or read
178
                          *  address 14 in either regfile A or B."
179
                          */
180
                         if (writes_reg(insts[j], 14) ||
181
                             reads_reg(insts[j], 14)) {
182
                                 fail_instr(insts[j], "last 3 instructions "
183
                                            "must not use r14");
184
                         }
185
                 }
186

187
                 /* "The final program instruction (the second delay slot
188
                  *  instruction) must not do a TLB Z write."
189
                  */
190
                 if (writes_reg(insts[i + 2], QPU_W_TLB_Z)) {
191
                         fail_instr(insts[i + 2], "final instruction doing "
192
                                    "Z write");
193
                 }
194
        }
195

196
        /* "A scoreboard wait must not occur in the first two instructions of
197
         *  a fragment shader. This is either the explicit Wait for Scoreboard
198
         *  signal or an implicit wait with the first tile-buffer read or
199
         *  write instruction."
200
         */
201
        for (int i = 0; i < 2; i++) {
202
                uint64_t inst = insts[i];
203

204
                if (qpu_inst_is_tlb(inst))
205
                        fail_instr(inst, "sb wait in first two insts");
206
        }
207

208
        /* "If TMU_NOSWAP is written, the write must be three instructions
209
         *  before the first TMU write instruction.  For example, if
210
         *  TMU_NOSWAP is written in the first shader instruction, the first
211
         *  TMU write cannot occur before the 4th shader instruction."
212
         */
213
        int last_tmu_noswap = -10;
214
        for (int i = 0; i < num_inst; i++) {
215
                uint64_t inst = insts[i];
216

217
                if ((i - last_tmu_noswap) <= 3 &&
218
                    (writes_reg(inst, QPU_W_TMU0_S) ||
219
                     writes_reg(inst, QPU_W_TMU1_S))) {
220
                        fail_instr(inst, "TMU write too soon after TMU_NOSWAP");
221
                }
222

223
                if (writes_reg(inst, QPU_W_TMU_NOSWAP))
224
                    last_tmu_noswap = i;
225
        }
226

227
        /* "An instruction must not read from a location in physical regfile A
228
         *  or B that was written to by the previous instruction."
229
         */
230
        for (int i = 0; i < num_inst - 1; i++) {
231
                uint64_t inst = insts[i];
232
                uint32_t add_waddr = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
233
                uint32_t mul_waddr = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
234
                uint32_t waddr_a, waddr_b;
235

236
                if (inst & QPU_WS) {
237
                        waddr_b = add_waddr;
238
                        waddr_a = mul_waddr;
239
                } else {
240
                        waddr_a = add_waddr;
241
                        waddr_b = mul_waddr;
242
                }
243

244
                if ((waddr_a < 32 && reads_a_reg(insts[i + 1], waddr_a)) ||
245
                    (waddr_b < 32 && reads_b_reg(insts[i + 1], waddr_b))) {
246
                        fail_instr(insts[i + 1],
247
                                   "Reads physical reg too soon after write");
248
                }
249
        }
250

251
        /* "After an SFU lookup instruction, accumulator r4 must not be read
252
         *  in the following two instructions. Any other instruction that
253
         *  results in r4 being written (that is, TMU read, TLB read, SFU
254
         *  lookup) cannot occur in the two instructions following an SFU
255
         *  lookup."
256
         */
257
        int last_sfu_inst = -10;
258
        for (int i = 0; i < num_inst - 1; i++) {
259
                uint64_t inst = insts[i];
260
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
261

262
                if (i - last_sfu_inst <= 2 &&
263
                    (writes_sfu(inst) ||
264
                     sig == QPU_SIG_LOAD_TMU0 ||
265
                     sig == QPU_SIG_LOAD_TMU1 ||
266
                     sig == QPU_SIG_COLOR_LOAD)) {
267
                        fail_instr(inst, "R4 write too soon after SFU write");
268
                }
269

270
                if (writes_sfu(inst))
271
                        last_sfu_inst = i;
272
        }
273

274
        for (int i = 0; i < num_inst - 1; i++) {
275
                uint64_t inst = insts[i];
276

277
                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM &&
278
                    QPU_GET_FIELD(inst, QPU_SMALL_IMM) >=
279
                    QPU_SMALL_IMM_MUL_ROT) {
280
                        uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
281
                        uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);
282

283
                        /* "The full horizontal vector rotate is only
284
                         *  available when both of the mul ALU input arguments
285
                         *  are taken from accumulators r0-r3."
286
                         */
287
                        if (mux_a > QPU_MUX_R3 || mux_b > QPU_MUX_R3) {
288
                                fail_instr(inst,
289
                                           "MUL rotate using non-accumulator "
290
                                           "input");
291
                        }
292

293
                        if (QPU_GET_FIELD(inst, QPU_SMALL_IMM) ==
294
                            QPU_SMALL_IMM_MUL_ROT) {
295
                                /* "An instruction that does a vector rotate
296
                                 *  by r5 must not immediately follow an
297
                                 *  instruction that writes to r5."
298
                                 */
299
                                if (writes_reg(insts[i - 1], QPU_W_ACC5)) {
300
                                        fail_instr(inst,
301
                                                   "vector rotate by r5 "
302
                                                   "immediately after r5 write");
303
                                }
304
                        }
305

306
                        /* "An instruction that does a vector rotate must not
307
                         *  immediately follow an instruction that writes to the
308
                         *  accumulator that is being rotated."
309
                         */
310
                        if (writes_reg(insts[i - 1], QPU_W_ACC0 + mux_a) ||
311
                            writes_reg(insts[i - 1], QPU_W_ACC0 + mux_b)) {
312
                                fail_instr(inst,
313
                                           "vector rotate of value "
314
                                           "written in previous instruction");
315
                        }
316
                }
317
        }
318

319
        /* "An instruction that does a vector rotate must not immediately
320
         *  follow an instruction that writes to the accumulator that is being
321
         *  rotated.
322
         *
323
         * XXX: TODO.
324
         */
325

326
        /* "After an instruction that does a TLB Z write, the multisample mask
327
         *  must not be read as an instruction input argument in the following
328
         *  two instruction. The TLB Z write instruction can, however, be
329
         *  followed immediately by a TLB color write."
330
         */
331
        for (int i = 0; i < num_inst - 1; i++) {
332
                uint64_t inst = insts[i];
333
                if (writes_reg(inst, QPU_W_TLB_Z) &&
334
                    (reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS) ||
335
                     reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS))) {
336
                        fail_instr(inst, "TLB Z write followed by MS mask read");
337
                }
338
        }
339

340
        /*
341
         * "A single instruction can only perform a maximum of one of the
342
         *  following closely coupled peripheral accesses in a single
343
         *  instruction: TMU write, TMU read, TLB write, TLB read, TLB
344
         *  combined color read and write, SFU write, Mutex read or Semaphore
345
         *  access."
346
         */
347
        for (int i = 0; i < num_inst - 1; i++) {
348
                uint64_t inst = insts[i];
349

350
                if (qpu_num_sf_accesses(inst) > 1)
351
                        fail_instr(inst, "Single instruction writes SFU twice");
352
        }
353

354
        /* "The uniform base pointer can be written (from SIMD element 0) by
355
         *  the processor to reset the stream, there must be at least two
356
         *  nonuniform-accessing instructions following a pointer change
357
         *  before uniforms can be accessed once more."
358
         */
359
        int last_unif_pointer_update = -3;
360
        for (int i = 0; i < num_inst; i++) {
361
                uint64_t inst = insts[i];
362
                uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
363
                uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
364

365
                if (reads_reg(inst, QPU_R_UNIF) &&
366
                    i - last_unif_pointer_update <= 2) {
367
                        fail_instr(inst,
368
                                   "uniform read too soon after pointer update");
369
                }
370

371
                if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
372
                    waddr_mul == QPU_W_UNIFORMS_ADDRESS)
373
                        last_unif_pointer_update = i;
374
        }
375

376
        if (threaded) {
377
                bool last_thrsw_found = false;
378
                bool scoreboard_locked = false;
379
                int tex_samples_outstanding = 0;
380
                int last_tex_samples_outstanding = 0;
381
                int thrsw_ip = -1;
382

383
                for (int i = 0; i < num_inst; i++) {
384
                        uint64_t inst = insts[i];
385
                        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
386

387
                        if (i == thrsw_ip) {
388
                                /* In order to get texture results back in the
389
                                 * correct order, before a new thrsw we have
390
                                 * to read all the texture results from before
391
                                 * the previous thrsw.
392
                                 *
393
                                 * FIXME: Is collecting the remaining results
394
                                 * during the delay slots OK, or should we do
395
                                 * this at THRSW signal time?
396
                                 */
397
                                if (last_tex_samples_outstanding != 0) {
398
                                        fail_instr(inst, "THRSW with texture "
399
                                                   "results from the previous "
400
                                                   "THRSW still in the FIFO.");
401
                                }
402

403
                                last_tex_samples_outstanding =
404
                                        tex_samples_outstanding;
405
                                tex_samples_outstanding = 0;
406
                        }
407

408
                        if (qpu_inst_is_tlb(inst))
409
                                scoreboard_locked = true;
410

411
                        switch (sig) {
412
                        case QPU_SIG_THREAD_SWITCH:
413
                        case QPU_SIG_LAST_THREAD_SWITCH:
414
                                /* No thread switching with the scoreboard
415
                                 * locked.  Doing so means we may deadlock
416
                                 * when the other thread tries to lock
417
                                 * scoreboard.
418
                                 */
419
                                if (scoreboard_locked) {
420
                                        fail_instr(inst, "THRSW with the "
421
                                                   "scoreboard locked.");
422
                                }
423

424
                                /* No thread switching after lthrsw, since
425
                                 * lthrsw means that we get delayed until the
426
                                 * other shader is ready for us to terminate.
427
                                 */
428
                                if (last_thrsw_found) {
429
                                        fail_instr(inst, "THRSW after a "
430
                                                   "previous LTHRSW");
431
                                }
432

433
                                if (sig == QPU_SIG_LAST_THREAD_SWITCH)
434
                                        last_thrsw_found = true;
435

436
                                /* No THRSW while we already have a THRSW
437
                                 * queued.
438
                                 */
439
                                if (i < thrsw_ip) {
440
                                        fail_instr(inst,
441
                                                   "THRSW with a THRSW queued.");
442
                                }
443

444
                                thrsw_ip = i + 3;
445
                                break;
446

447
                        case QPU_SIG_LOAD_TMU0:
448
                        case QPU_SIG_LOAD_TMU1:
449
                                if (last_tex_samples_outstanding == 0) {
450
                                        fail_instr(inst, "TMU load with nothing "
451
                                                   "in the results fifo from "
452
                                                   "the previous THRSW.");
453
                                }
454

455
                                last_tex_samples_outstanding--;
456
                                break;
457
                        }
458

459
                        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
460
                        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
461
                        if (waddr_add == QPU_W_TMU0_S ||
462
                            waddr_add == QPU_W_TMU1_S ||
463
                            waddr_mul == QPU_W_TMU0_S ||
464
                            waddr_mul == QPU_W_TMU1_S) {
465
                                tex_samples_outstanding++;
466
                        }
467
                }
468
        }
469
}
470

471
Product

Resources

Company