GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/broadcom/compiler/nir_to_vir.c
1
/*
2
* Copyright © 2016 Broadcom
3
*
4
* Permission is hereby granted, free of charge, to any person obtaining a
5
* copy of this software and associated documentation files (the "Software"),
6
* to deal in the Software without restriction, including without limitation
7
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
* and/or sell copies of the Software, and to permit persons to whom the
9
* Software is furnished to do so, subject to the following conditions:
10
*
11
* The above copyright notice and this permission notice (including the next
12
* paragraph) shall be included in all copies or substantial portions of the
13
* Software.
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
* IN THE SOFTWARE.
22
*/
23
24
#include <inttypes.h>
25
#include "util/format/u_format.h"
26
#include "util/u_helpers.h"
27
#include "util/u_math.h"
28
#include "util/u_memory.h"
29
#include "util/ralloc.h"
30
#include "util/hash_table.h"
31
#include "compiler/nir/nir.h"
32
#include "compiler/nir/nir_builder.h"
33
#include "common/v3d_device_info.h"
34
#include "v3d_compiler.h"
35
36
/* We don't do any address packing. */
37
#define __gen_user_data void
38
#define __gen_address_type uint32_t
39
#define __gen_address_offset(reloc) (*reloc)
40
#define __gen_emit_reloc(cl, reloc)
41
#include "cle/v3d_packet_v41_pack.h"
42
43
#define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7)
44
#define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7)
45
#define GENERAL_TMU_LOOKUP_TYPE_8BIT_I (0 << 0)
46
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_I (1 << 0)
47
#define GENERAL_TMU_LOOKUP_TYPE_VEC2 (2 << 0)
48
#define GENERAL_TMU_LOOKUP_TYPE_VEC3 (3 << 0)
49
#define GENERAL_TMU_LOOKUP_TYPE_VEC4 (4 << 0)
50
#define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI (5 << 0)
51
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI (6 << 0)
52
#define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI (7 << 0)
53
54
#define V3D_TSY_SET_QUORUM 0
55
#define V3D_TSY_INC_WAITERS 1
56
#define V3D_TSY_DEC_WAITERS 2
57
#define V3D_TSY_INC_QUORUM 3
58
#define V3D_TSY_DEC_QUORUM 4
59
#define V3D_TSY_FREE_ALL 5
60
#define V3D_TSY_RELEASE 6
61
#define V3D_TSY_ACQUIRE 7
62
#define V3D_TSY_WAIT 8
63
#define V3D_TSY_WAIT_INC 9
64
#define V3D_TSY_WAIT_CHECK 10
65
#define V3D_TSY_WAIT_INC_CHECK 11
66
#define V3D_TSY_WAIT_CV 12
67
#define V3D_TSY_INC_SEMAPHORE 13
68
#define V3D_TSY_DEC_SEMAPHORE 14
69
#define V3D_TSY_SET_QUORUM_FREE_ALL 15
70
71
enum v3d_tmu_op_type
72
{
73
V3D_TMU_OP_TYPE_REGULAR,
74
V3D_TMU_OP_TYPE_ATOMIC,
75
V3D_TMU_OP_TYPE_CACHE
76
};
77
78
static enum v3d_tmu_op_type
79
v3d_tmu_get_type_from_op(uint32_t tmu_op, bool is_write)
80
{
81
switch(tmu_op) {
82
case V3D_TMU_OP_WRITE_ADD_READ_PREFETCH:
83
case V3D_TMU_OP_WRITE_SUB_READ_CLEAR:
84
case V3D_TMU_OP_WRITE_XCHG_READ_FLUSH:
85
case V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH:
86
case V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR:
87
return is_write ? V3D_TMU_OP_TYPE_ATOMIC : V3D_TMU_OP_TYPE_CACHE;
88
case V3D_TMU_OP_WRITE_UMAX:
89
case V3D_TMU_OP_WRITE_SMIN:
90
case V3D_TMU_OP_WRITE_SMAX:
91
assert(is_write);
92
FALLTHROUGH;
93
case V3D_TMU_OP_WRITE_AND_READ_INC:
94
case V3D_TMU_OP_WRITE_OR_READ_DEC:
95
case V3D_TMU_OP_WRITE_XOR_READ_NOT:
96
return V3D_TMU_OP_TYPE_ATOMIC;
97
case V3D_TMU_OP_REGULAR:
98
return V3D_TMU_OP_TYPE_REGULAR;
99
100
default:
101
unreachable("Unknown tmu_op\n");
102
}
103
}
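/* Reader note (added, not part of the original source): the is_write flag
 * matters because several TMU ops are overloaded. For example,
 * V3D_TMU_OP_WRITE_ADD_READ_PREFETCH acts as an atomic add on the write
 * path but as a cache prefetch on the read path, which is why the first
 * group of cases above returns ATOMIC or CACHE depending on is_write.
 */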
104
static void
105
ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
106
107
static void
108
resize_qreg_array(struct v3d_compile *c,
109
struct qreg **regs,
110
uint32_t *size,
111
uint32_t decl_size)
112
{
113
if (*size >= decl_size)
114
return;
115
116
uint32_t old_size = *size;
117
*size = MAX2(*size * 2, decl_size);
118
*regs = reralloc(c, *regs, struct qreg, *size);
119
if (!*regs) {
120
fprintf(stderr, "Malloc failure\n");
121
abort();
122
}
123
124
for (uint32_t i = old_size; i < *size; i++)
125
(*regs)[i] = c->undef;
126
}
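/* Illustrative example (added note): with *size == 4 and decl_size == 5 the
 * array grows to MAX2(8, 5) == 8 entries, and the new entries 4..7 are
 * initialized to c->undef.
 */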
127
128
static void
129
resize_interp_array(struct v3d_compile *c,
130
struct v3d_interp_input **regs,
131
uint32_t *size,
132
uint32_t decl_size)
133
{
134
if (*size >= decl_size)
135
return;
136
137
uint32_t old_size = *size;
138
*size = MAX2(*size * 2, decl_size);
139
*regs = reralloc(c, *regs, struct v3d_interp_input, *size);
140
if (!*regs) {
141
fprintf(stderr, "Malloc failure\n");
142
abort();
143
}
144
145
for (uint32_t i = old_size; i < *size; i++) {
146
(*regs)[i].vp = c->undef;
147
(*regs)[i].C = c->undef;
148
}
149
}
150
151
void
152
vir_emit_thrsw(struct v3d_compile *c)
153
{
154
if (c->threads == 1)
155
return;
156
157
/* Always thread switch after each texture operation for now.
158
*
159
* We could do better by batching a bunch of texture fetches up and
160
* then doing one thread switch and collecting all their results
161
* afterward.
162
*/
163
c->last_thrsw = vir_NOP(c);
164
c->last_thrsw->qpu.sig.thrsw = true;
165
c->last_thrsw_at_top_level = !c->in_control_flow;
166
167
/* We need to lock the scoreboard before any TLB access happens. If this
168
* thread switch comes after we have emitted a tlb load, then it means
169
* that we can't lock on the last thread switch any more.
170
*/
171
if (c->emitted_tlb_load)
172
c->lock_scoreboard_on_first_thrsw = true;
173
}
174
175
uint32_t
176
v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
177
{
178
if (nir_src_is_const(instr->src[src])) {
179
int64_t add_val = nir_src_as_int(instr->src[src]);
180
if (add_val == 1)
181
return V3D_TMU_OP_WRITE_AND_READ_INC;
182
else if (add_val == -1)
183
return V3D_TMU_OP_WRITE_OR_READ_DEC;
184
}
185
186
return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
187
}
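/* Worked example (added note): for nir_intrinsic_ssbo_atomic_add with a
 * constant operand of 1 or -1 this returns the dedicated increment or
 * decrement op, and the caller can then skip sending the addend through
 * TMUD (see atomic_add_replaced in ntq_emit_tmu_general below); any other
 * addend falls back to the generic WRITE_ADD op.
 */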
188
189
static uint32_t
190
v3d_general_tmu_op(nir_intrinsic_instr *instr)
191
{
192
switch (instr->intrinsic) {
193
case nir_intrinsic_load_ssbo:
194
case nir_intrinsic_load_ubo:
195
case nir_intrinsic_load_uniform:
196
case nir_intrinsic_load_shared:
197
case nir_intrinsic_load_scratch:
198
case nir_intrinsic_store_ssbo:
199
case nir_intrinsic_store_shared:
200
case nir_intrinsic_store_scratch:
201
return V3D_TMU_OP_REGULAR;
202
case nir_intrinsic_ssbo_atomic_add:
203
return v3d_get_op_for_atomic_add(instr, 2);
204
case nir_intrinsic_shared_atomic_add:
205
return v3d_get_op_for_atomic_add(instr, 1);
206
case nir_intrinsic_ssbo_atomic_imin:
207
case nir_intrinsic_shared_atomic_imin:
208
return V3D_TMU_OP_WRITE_SMIN;
209
case nir_intrinsic_ssbo_atomic_umin:
210
case nir_intrinsic_shared_atomic_umin:
211
return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
212
case nir_intrinsic_ssbo_atomic_imax:
213
case nir_intrinsic_shared_atomic_imax:
214
return V3D_TMU_OP_WRITE_SMAX;
215
case nir_intrinsic_ssbo_atomic_umax:
216
case nir_intrinsic_shared_atomic_umax:
217
return V3D_TMU_OP_WRITE_UMAX;
218
case nir_intrinsic_ssbo_atomic_and:
219
case nir_intrinsic_shared_atomic_and:
220
return V3D_TMU_OP_WRITE_AND_READ_INC;
221
case nir_intrinsic_ssbo_atomic_or:
222
case nir_intrinsic_shared_atomic_or:
223
return V3D_TMU_OP_WRITE_OR_READ_DEC;
224
case nir_intrinsic_ssbo_atomic_xor:
225
case nir_intrinsic_shared_atomic_xor:
226
return V3D_TMU_OP_WRITE_XOR_READ_NOT;
227
case nir_intrinsic_ssbo_atomic_exchange:
228
case nir_intrinsic_shared_atomic_exchange:
229
return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
230
case nir_intrinsic_ssbo_atomic_comp_swap:
231
case nir_intrinsic_shared_atomic_comp_swap:
232
return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
233
default:
234
unreachable("unknown intrinsic op");
235
}
236
}
237
238
/**
239
* Checks if pipelining a new TMU operation requiring 'components' LDTMUs
240
* would overflow the Output TMU fifo.
241
*
242
* It is not allowed to overflow the Output fifo; however, we can overflow
243
* Input and Config fifos. Doing that makes the shader stall, but only for as
244
* long as it needs to be able to continue so it is better for pipelining to
245
* let the QPU stall on these if needed than trying to emit TMU flushes in the
246
* driver.
247
*/
248
bool
249
ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components)
250
{
251
if (c->tmu.flush_count >= MAX_TMU_QUEUE_SIZE)
252
return true;
253
254
return components > 0 &&
255
c->tmu.output_fifo_size + components > 16 / c->threads;
256
}
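/* Illustrative arithmetic (added note): the output fifo budget is
 * 16 / c->threads entries, so with c->threads == 4 a pending 4-component
 * LDTMU sequence only fits while c->tmu.output_fifo_size is still 0.
 */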
257
258
/**
259
* Emits the thread switch and LDTMU/TMUWT for all outstanding TMU operations,
260
* popping all TMU fifo entries.
261
*/
262
void
263
ntq_flush_tmu(struct v3d_compile *c)
264
{
265
if (c->tmu.flush_count == 0)
266
return;
267
268
vir_emit_thrsw(c);
269
270
bool emitted_tmuwt = false;
271
for (int i = 0; i < c->tmu.flush_count; i++) {
272
if (c->tmu.flush[i].component_mask > 0) {
273
nir_dest *dest = c->tmu.flush[i].dest;
274
assert(dest);
275
276
for (int j = 0; j < 4; j++) {
277
if (c->tmu.flush[i].component_mask & (1 << j)) {
278
ntq_store_dest(c, dest, j,
279
vir_MOV(c, vir_LDTMU(c)));
280
}
281
}
282
} else if (!emitted_tmuwt) {
283
vir_TMUWT(c);
284
emitted_tmuwt = true;
285
}
286
}
287
288
c->tmu.output_fifo_size = 0;
289
c->tmu.flush_count = 0;
290
_mesa_set_clear(c->tmu.outstanding_regs, NULL);
291
}
292
293
/**
294
* Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller
295
* is responsible for ensuring that doing this doesn't overflow the TMU fifos,
296
* and more specifically, the output fifo, since that can't stall.
297
*/
298
void
299
ntq_add_pending_tmu_flush(struct v3d_compile *c,
300
nir_dest *dest,
301
uint32_t component_mask)
302
{
303
const uint32_t num_components = util_bitcount(component_mask);
304
assert(!ntq_tmu_fifo_overflow(c, num_components));
305
306
if (num_components > 0) {
307
c->tmu.output_fifo_size += num_components;
308
if (!dest->is_ssa)
309
_mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg);
310
}
311
312
c->tmu.flush[c->tmu.flush_count].dest = dest;
313
c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
314
c->tmu.flush_count++;
315
316
if (c->disable_tmu_pipelining)
317
ntq_flush_tmu(c);
318
else if (c->tmu.flush_count > 1)
319
c->pipelined_any_tmu = true;
320
}
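/* Minimal caller sketch (added illustration, mirroring the real sequence in
 * ntq_emit_tmu_general below; not compiled):
 */
#if 0
if (ntq_tmu_fifo_overflow(c, dest_components))
        ntq_flush_tmu(c);

/* ...emit the TMUD/TMUA register writes for the operation... */

ntq_add_pending_tmu_flush(c, &instr->dest,
                          (1 << dest_components) - 1);
#endif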
321
322
enum emit_mode {
323
MODE_COUNT = 0,
324
MODE_EMIT,
325
MODE_LAST,
326
};
327
328
/**
329
* For a TMU general store instruction:
330
*
331
* In MODE_COUNT mode, records the number of TMU writes required and flushes
332
* any outstanding TMU operations the instruction depends on, but it doesn't
333
* emit any actual register writes.
334
*
335
* In MODE_EMIT mode, emits the data register writes required by the
336
* instruction.
337
*/
338
static void
339
emit_tmu_general_store_writes(struct v3d_compile *c,
340
enum emit_mode mode,
341
nir_intrinsic_instr *instr,
342
uint32_t base_const_offset,
343
uint32_t *writemask,
344
uint32_t *const_offset,
345
uint32_t *tmu_writes)
346
{
347
struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
348
349
/* Find the first set of consecutive components that
350
* are enabled in the writemask and emit the TMUD
351
* instructions for them.
352
*/
353
assert(*writemask != 0);
354
uint32_t first_component = ffs(*writemask) - 1;
355
uint32_t last_component = first_component;
356
while (*writemask & BITFIELD_BIT(last_component + 1))
357
last_component++;
358
359
assert(first_component <= last_component &&
360
last_component < instr->num_components);
361
362
for (int i = first_component; i <= last_component; i++) {
363
struct qreg data = ntq_get_src(c, instr->src[0], i);
364
if (mode == MODE_COUNT)
365
(*tmu_writes)++;
366
else
367
vir_MOV_dest(c, tmud, data);
368
}
369
370
if (mode == MODE_EMIT) {
371
/* Update the offset for the TMU write based on the
372
* first component we are writing.
373
*/
374
*const_offset = base_const_offset + first_component * 4;
375
376
/* Clear these components from the writemask */
377
uint32_t written_mask =
378
BITFIELD_RANGE(first_component, *tmu_writes);
379
(*writemask) &= ~written_mask;
380
}
381
}
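/* Worked example (added note): with writemask 0b0110 the consecutive run is
 * components 1..2, so MODE_COUNT counts two TMUD writes, and MODE_EMIT emits
 * the two data moves and sets *const_offset to
 * base_const_offset + first_component * 4.
 */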
382
383
/**
384
* For a TMU general atomic instruction:
385
*
386
* In MODE_COUNT mode, records the number of TMU writes required and flushes
387
* any outstanding TMU operations the instruction depends on, but it doesn't
388
* emit any actual register writes.
389
*
390
* In MODE_EMIT mode, emits the data register writes required by the
391
* instruction.
392
*/
393
static void
394
emit_tmu_general_atomic_writes(struct v3d_compile *c,
395
enum emit_mode mode,
396
nir_intrinsic_instr *instr,
397
uint32_t tmu_op,
398
bool has_index,
399
uint32_t *tmu_writes)
400
{
401
struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
402
403
struct qreg data = ntq_get_src(c, instr->src[1 + has_index], 0);
404
if (mode == MODE_COUNT)
405
(*tmu_writes)++;
406
else
407
vir_MOV_dest(c, tmud, data);
408
409
if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
410
data = ntq_get_src(c, instr->src[2 + has_index], 0);
411
if (mode == MODE_COUNT)
412
(*tmu_writes)++;
413
else
414
vir_MOV_dest(c, tmud, data);
415
}
416
}
417
418
/**
419
* For any TMU general instruction:
420
*
421
* In MODE_COUNT mode, records the number of TMU writes required to emit the
422
* address parameter and flushes any outstanding TMU operations the instruction
423
* depends on, but it doesn't emit any actual register writes.
424
*
425
* In MODE_EMIT mode, emits register writes required to emit the address.
426
*/
427
static void
428
emit_tmu_general_address_write(struct v3d_compile *c,
429
enum emit_mode mode,
430
nir_intrinsic_instr *instr,
431
uint32_t config,
432
bool dynamic_src,
433
int offset_src,
434
struct qreg base_offset,
435
uint32_t const_offset,
436
uint32_t *tmu_writes)
437
{
438
if (mode == MODE_COUNT) {
439
(*tmu_writes)++;
440
if (dynamic_src)
441
ntq_get_src(c, instr->src[offset_src], 0);
442
return;
443
}
444
445
if (vir_in_nonuniform_control_flow(c)) {
446
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
447
V3D_QPU_PF_PUSHZ);
448
}
449
450
struct qreg tmua;
451
if (config == ~0)
452
tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
453
else
454
tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
455
456
struct qinst *tmu;
457
if (dynamic_src) {
458
struct qreg offset = base_offset;
459
if (const_offset != 0) {
460
offset = vir_ADD(c, offset,
461
vir_uniform_ui(c, const_offset));
462
}
463
struct qreg data = ntq_get_src(c, instr->src[offset_src], 0);
464
tmu = vir_ADD_dest(c, tmua, offset, data);
465
} else {
466
if (const_offset != 0) {
467
tmu = vir_ADD_dest(c, tmua, base_offset,
468
vir_uniform_ui(c, const_offset));
469
} else {
470
tmu = vir_MOV_dest(c, tmua, base_offset);
471
}
472
}
473
474
if (config != ~0) {
475
tmu->uniform =
476
vir_get_uniform_index(c, QUNIFORM_CONSTANT, config);
477
}
478
479
if (vir_in_nonuniform_control_flow(c))
480
vir_set_cond(tmu, V3D_QPU_COND_IFA);
481
}
482
483
/**
484
* Implements indirect uniform loads and SSBO accesses through the TMU general
485
* memory access interface.
486
*/
487
static void
488
ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
489
bool is_shared_or_scratch)
490
{
491
uint32_t tmu_op = v3d_general_tmu_op(instr);
492
493
/* If we were able to replace atomic_add with an inc/dec, then we
494
* need (and can) do things slightly differently, like not loading the
495
* amount to add/sub, as that is implicit.
496
*/
497
bool atomic_add_replaced =
498
((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
499
instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
500
(tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
501
tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));
502
503
bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
504
instr->intrinsic == nir_intrinsic_store_scratch ||
505
instr->intrinsic == nir_intrinsic_store_shared);
506
507
bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
508
instr->intrinsic == nir_intrinsic_load_ubo ||
509
instr->intrinsic == nir_intrinsic_load_ssbo ||
510
instr->intrinsic == nir_intrinsic_load_scratch ||
511
instr->intrinsic == nir_intrinsic_load_shared);
512
513
if (!is_load)
514
c->tmu_dirty_rcl = true;
515
516
bool has_index = !is_shared_or_scratch;
517
518
int offset_src;
519
if (instr->intrinsic == nir_intrinsic_load_uniform) {
520
offset_src = 0;
521
} else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
522
instr->intrinsic == nir_intrinsic_load_ubo ||
523
instr->intrinsic == nir_intrinsic_load_scratch ||
524
instr->intrinsic == nir_intrinsic_load_shared ||
525
atomic_add_replaced) {
526
offset_src = 0 + has_index;
527
} else if (is_store) {
528
offset_src = 1 + has_index;
529
} else {
530
offset_src = 0 + has_index;
531
}
532
533
bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
534
uint32_t const_offset = 0;
535
if (!dynamic_src)
536
const_offset = nir_src_as_uint(instr->src[offset_src]);
537
538
struct qreg base_offset;
539
if (instr->intrinsic == nir_intrinsic_load_uniform) {
540
const_offset += nir_intrinsic_base(instr);
541
base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
542
v3d_unit_data_create(0, const_offset));
543
const_offset = 0;
544
} else if (instr->intrinsic == nir_intrinsic_load_ubo) {
545
uint32_t index = nir_src_as_uint(instr->src[0]);
546
/* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index
547
* shifted up by 1 (0 is gallium's constant buffer 0).
548
*/
549
if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
550
index++;
551
552
base_offset =
553
vir_uniform(c, QUNIFORM_UBO_ADDR,
554
v3d_unit_data_create(index, const_offset));
555
const_offset = 0;
556
} else if (is_shared_or_scratch) {
557
/* Shared and scratch variables have no buffer index, and all
558
* start from a common base that we set up at the start of
559
* dispatch.
560
*/
561
if (instr->intrinsic == nir_intrinsic_load_scratch ||
562
instr->intrinsic == nir_intrinsic_store_scratch) {
563
base_offset = c->spill_base;
564
} else {
565
base_offset = c->cs_shared_offset;
566
const_offset += nir_intrinsic_base(instr);
567
}
568
} else {
569
base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
570
nir_src_as_uint(instr->src[is_store ?
571
1 : 0]));
572
}
573
574
/* We are ready to emit TMU register writes now, but before we actually
575
* emit them we need to flush outstanding TMU operations if any of our
576
* writes reads from the result of an outstanding TMU operation before
577
* we start the TMU sequence for this operation, since otherwise the
578
* flush could happen in the middle of the TMU sequence we are about to
579
* emit, which is illegal. To do this we run this logic twice, the
580
* first time it will count required register writes and flush pending
581
* TMU requests if necessary due to a dependency, and the second one
582
* will emit the actual TMU writes.
583
*/
584
const uint32_t dest_components = nir_intrinsic_dest_components(instr);
585
uint32_t base_const_offset = const_offset;
586
uint32_t writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
587
uint32_t tmu_writes = 0;
588
for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
589
assert(mode == MODE_COUNT || tmu_writes > 0);
590
591
if (is_store) {
592
emit_tmu_general_store_writes(c, mode, instr,
593
base_const_offset,
594
&writemask,
595
&const_offset,
596
&tmu_writes);
597
} else if (!is_load && !atomic_add_replaced) {
598
emit_tmu_general_atomic_writes(c, mode, instr,
599
tmu_op, has_index,
600
&tmu_writes);
601
}
602
603
/* For atomics we use 32bit, except for CMPXCHG, which needs
604
* to use VEC2. For the rest of the cases we use the number of
605
* tmud writes we did to decide the type. For cache operations
606
* the type is ignored.
607
*/
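/* Added worked example: a two-component load outside non-uniform control
 * flow ends up with perquad == GENERAL_TMU_LOOKUP_PER_QUAD and the type
 * bits set to GENERAL_TMU_LOOKUP_TYPE_VEC2, i.e.
 * config = 0xffffff00 | (tmu_op << 3) | perquad | GENERAL_TMU_LOOKUP_TYPE_VEC2.
 */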
608
uint32_t config = 0;
609
if (mode == MODE_EMIT) {
610
uint32_t num_components;
611
if (is_load || atomic_add_replaced) {
612
num_components = instr->num_components;
613
} else {
614
assert(tmu_writes > 0);
615
num_components = tmu_writes - 1;
616
}
617
bool is_atomic =
618
v3d_tmu_get_type_from_op(tmu_op, !is_load) ==
619
V3D_TMU_OP_TYPE_ATOMIC;
620
621
uint32_t perquad =
622
is_load && !vir_in_nonuniform_control_flow(c)
623
? GENERAL_TMU_LOOKUP_PER_QUAD
624
: GENERAL_TMU_LOOKUP_PER_PIXEL;
625
config = 0xffffff00 | tmu_op << 3 | perquad;
626
627
if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
628
config |= GENERAL_TMU_LOOKUP_TYPE_VEC2;
629
} else if (is_atomic || num_components == 1) {
630
config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
631
} else {
632
config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
633
num_components - 2;
634
}
635
}
636
637
emit_tmu_general_address_write(c, mode, instr, config,
638
dynamic_src, offset_src,
639
base_offset, const_offset,
640
&tmu_writes);
641
642
assert(tmu_writes > 0);
643
if (mode == MODE_COUNT) {
644
/* Make sure we won't exceed the 16-entry TMU
645
* fifo if each thread is storing at the same
646
* time.
647
*/
648
while (tmu_writes > 16 / c->threads)
649
c->threads /= 2;
650
651
/* If pipelining this TMU operation would
652
* overflow TMU fifos, we need to flush.
653
*/
654
if (ntq_tmu_fifo_overflow(c, dest_components))
655
ntq_flush_tmu(c);
656
} else {
657
/* Delay emission of the thread switch and
658
* LDTMU/TMUWT until we really need to do it to
659
* improve pipelining.
660
*/
661
const uint32_t component_mask =
662
(1 << dest_components) - 1;
663
ntq_add_pending_tmu_flush(c, &instr->dest,
664
component_mask);
665
}
666
}
667
668
/* nir_lower_wrmasks should've ensured that any writemask on a store
669
* operation only has consecutive bits set, in which case we should've
670
* processed the full writemask above.
671
*/
672
assert(writemask == 0);
673
}
674
675
static struct qreg *
676
ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
677
{
678
struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
679
def->num_components);
680
_mesa_hash_table_insert(c->def_ht, def, qregs);
681
return qregs;
682
}
683
684
static bool
685
is_ld_signal(const struct v3d_qpu_sig *sig)
686
{
687
return (sig->ldunif ||
688
sig->ldunifa ||
689
sig->ldunifrf ||
690
sig->ldunifarf ||
691
sig->ldtmu ||
692
sig->ldvary ||
693
sig->ldvpm ||
694
sig->ldtlb ||
695
sig->ldtlbu);
696
}
697
698
static inline bool
699
is_ldunif_signal(const struct v3d_qpu_sig *sig)
700
{
701
return sig->ldunif || sig->ldunifrf;
702
}
703
704
/**
705
* This function is responsible for getting VIR results into the associated
706
* storage for a NIR instruction.
707
*
708
* If it's a NIR SSA def, then we just set the associated hash table entry to
709
* the new result.
710
*
711
* If it's a NIR reg, then we need to update the existing qreg assigned to the
712
* NIR destination with the incoming value. To do that without introducing
713
* new MOVs, we require that the incoming qreg either be a uniform, or be
714
* SSA-defined by the previous VIR instruction in the block and rewritable by
715
* this function. That lets us sneak ahead and insert the SF flag beforehand
716
* (knowing that the previous instruction doesn't depend on flags) and rewrite
717
* its destination to be the NIR reg's destination
718
*/
719
void
720
ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
721
struct qreg result)
722
{
723
struct qinst *last_inst = NULL;
724
if (!list_is_empty(&c->cur_block->instructions))
725
last_inst = (struct qinst *)c->cur_block->instructions.prev;
726
727
bool is_reused_uniform =
728
is_ldunif_signal(&c->defs[result.index]->qpu.sig) &&
729
last_inst != c->defs[result.index];
730
731
assert(result.file == QFILE_TEMP && last_inst &&
732
(last_inst == c->defs[result.index] || is_reused_uniform));
733
734
if (dest->is_ssa) {
735
assert(chan < dest->ssa.num_components);
736
737
struct qreg *qregs;
738
struct hash_entry *entry =
739
_mesa_hash_table_search(c->def_ht, &dest->ssa);
740
741
if (entry)
742
qregs = entry->data;
743
else
744
qregs = ntq_init_ssa_def(c, &dest->ssa);
745
746
qregs[chan] = result;
747
} else {
748
nir_register *reg = dest->reg.reg;
749
assert(dest->reg.base_offset == 0);
750
assert(reg->num_array_elems == 0);
751
struct hash_entry *entry =
752
_mesa_hash_table_search(c->def_ht, reg);
753
struct qreg *qregs = entry->data;
754
755
/* If the previous instruction can't be predicated for
756
* the store into the nir_register, then emit a MOV
757
* that can be.
758
*/
759
if (is_reused_uniform ||
760
(vir_in_nonuniform_control_flow(c) &&
761
is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig))) {
762
result = vir_MOV(c, result);
763
last_inst = c->defs[result.index];
764
}
765
766
/* We know they're both temps, so just rewrite index. */
767
c->defs[last_inst->dst.index] = NULL;
768
last_inst->dst.index = qregs[chan].index;
769
770
/* If we're in control flow, then make this update of the reg
771
* conditional on the execution mask.
772
*/
773
if (vir_in_nonuniform_control_flow(c)) {
774
last_inst->dst.index = qregs[chan].index;
775
776
/* Set the flags to the current exec mask.
777
*/
778
c->cursor = vir_before_inst(last_inst);
779
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
780
V3D_QPU_PF_PUSHZ);
781
c->cursor = vir_after_inst(last_inst);
782
783
vir_set_cond(last_inst, V3D_QPU_COND_IFA);
784
}
785
}
786
}
787
788
/**
789
* This looks up the qreg associated with a particular ssa/reg used as a source
790
* in any instruction.
791
*
792
* It is expected that the definition for any NIR value read as a source has
793
* been emitted by a previous instruction, however, in the case of TMU
794
* operations we may have postponed emission of the thread switch and LDTMUs
795
* required to read the TMU results until the results are actually used to
796
* improve pipelining, which then would lead to us not finding them here
797
* (for SSA defs) or finding them in the list of registers awaiting a TMU flush
798
* (for registers), meaning that we need to flush outstanding TMU operations
799
* to read the correct value.
800
*/
801
struct qreg
802
ntq_get_src(struct v3d_compile *c, nir_src src, int i)
803
{
804
struct hash_entry *entry;
805
if (src.is_ssa) {
806
assert(i < src.ssa->num_components);
807
808
entry = _mesa_hash_table_search(c->def_ht, src.ssa);
809
if (!entry) {
810
ntq_flush_tmu(c);
811
entry = _mesa_hash_table_search(c->def_ht, src.ssa);
812
}
813
} else {
814
nir_register *reg = src.reg.reg;
815
assert(reg->num_array_elems == 0);
816
assert(src.reg.base_offset == 0);
817
assert(i < reg->num_components);
818
819
if (_mesa_set_search(c->tmu.outstanding_regs, reg))
820
ntq_flush_tmu(c);
821
entry = _mesa_hash_table_search(c->def_ht, reg);
822
}
823
assert(entry);
824
825
struct qreg *qregs = entry->data;
826
return qregs[i];
827
}
828
829
static struct qreg
830
ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
831
unsigned src)
832
{
833
assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
834
unsigned chan = ffs(instr->dest.write_mask) - 1;
835
struct qreg r = ntq_get_src(c, instr->src[src].src,
836
instr->src[src].swizzle[chan]);
837
838
assert(!instr->src[src].abs);
839
assert(!instr->src[src].negate);
840
841
return r;
842
};
843
844
static struct qreg
845
ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
846
{
847
return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
848
}
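/* Added note: e.g. a 16-texel dimension queried at LOD 5 yields
 * MAX(16 >> 5, 1) == 1, matching the usual mip-chain clamp to one texel.
 */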
849
850
static void
851
ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
852
{
853
unsigned unit = instr->texture_index;
854
int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
855
int dest_size = nir_tex_instr_dest_size(instr);
856
857
struct qreg lod = c->undef;
858
if (lod_index != -1)
859
lod = ntq_get_src(c, instr->src[lod_index].src, 0);
860
861
for (int i = 0; i < dest_size; i++) {
862
assert(i < 3);
863
enum quniform_contents contents;
864
865
if (instr->is_array && i == dest_size - 1)
866
contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
867
else
868
contents = QUNIFORM_TEXTURE_WIDTH + i;
869
870
struct qreg size = vir_uniform(c, contents, unit);
871
872
switch (instr->sampler_dim) {
873
case GLSL_SAMPLER_DIM_1D:
874
case GLSL_SAMPLER_DIM_2D:
875
case GLSL_SAMPLER_DIM_MS:
876
case GLSL_SAMPLER_DIM_3D:
877
case GLSL_SAMPLER_DIM_CUBE:
878
case GLSL_SAMPLER_DIM_BUF:
879
/* Don't minify the array size. */
880
if (!(instr->is_array && i == dest_size - 1)) {
881
size = ntq_minify(c, size, lod);
882
}
883
break;
884
885
case GLSL_SAMPLER_DIM_RECT:
886
/* There's no LOD field for rects */
887
break;
888
889
default:
890
unreachable("Bad sampler type");
891
}
892
893
ntq_store_dest(c, &instr->dest, i, size);
894
}
895
}
896
897
static void
898
ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
899
{
900
unsigned unit = instr->texture_index;
901
902
/* Since each texture sampling op requires uploading uniforms to
903
* reference the texture, there's no HW support for texture size and
904
* you just upload uniforms containing the size.
905
*/
906
switch (instr->op) {
907
case nir_texop_query_levels:
908
ntq_store_dest(c, &instr->dest, 0,
909
vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
910
return;
911
case nir_texop_texture_samples:
912
ntq_store_dest(c, &instr->dest, 0,
913
vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit));
914
return;
915
case nir_texop_txs:
916
ntq_emit_txs(c, instr);
917
return;
918
default:
919
break;
920
}
921
922
if (c->devinfo->ver >= 40)
923
v3d40_vir_emit_tex(c, instr);
924
else
925
v3d33_vir_emit_tex(c, instr);
926
}
927
928
static struct qreg
929
ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
930
{
931
struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
932
if (is_cos)
933
input = vir_FADD(c, input, vir_uniform_f(c, 0.5));
934
935
struct qreg periods = vir_FROUND(c, input);
936
struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods));
937
return vir_XOR(c, sin_output, vir_SHL(c,
938
vir_FTOIN(c, periods),
939
vir_uniform_ui(c, -1)));
940
}
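/* Added worked example of the range reduction above: sin(x) is evaluated as
 * sin(pi * t) with t = x / pi. For x = 3*pi/4, t = 0.75, periods = 1, and
 * SIN(t - periods) gives sin(-pi/4) = -sqrt(2)/2. XORing in the shifted
 * (odd) period count flips the sign back to +sqrt(2)/2, which is sin(3*pi/4).
 * The SHL by -1 is presumably equivalent to a shift by 31 once the hardware
 * masks the shift amount, moving bit 0 of the integer period count into the
 * float sign bit.
 */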
941
942
static struct qreg
943
ntq_fsign(struct v3d_compile *c, struct qreg src)
944
{
945
struct qreg t = vir_get_temp(c);
946
947
vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
948
vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ);
949
vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
950
vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN);
951
vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
952
return vir_MOV(c, t);
953
}
954
955
static void
956
emit_fragcoord_input(struct v3d_compile *c, int attr)
957
{
958
c->inputs[attr * 4 + 0] = vir_FXCD(c);
959
c->inputs[attr * 4 + 1] = vir_FYCD(c);
960
c->inputs[attr * 4 + 2] = c->payload_z;
961
c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
962
}
963
964
static struct qreg
965
emit_smooth_varying(struct v3d_compile *c,
966
struct qreg vary, struct qreg w, struct qreg r5)
967
{
968
return vir_FADD(c, vir_FMUL(c, vary, w), r5);
969
}
970
971
static struct qreg
972
emit_noperspective_varying(struct v3d_compile *c,
973
struct qreg vary, struct qreg r5)
974
{
975
return vir_FADD(c, vir_MOV(c, vary), r5);
976
}
977
978
static struct qreg
979
emit_flat_varying(struct v3d_compile *c,
980
struct qreg vary, struct qreg r5)
981
{
982
vir_MOV_dest(c, c->undef, vary);
983
return vir_MOV(c, r5);
984
}
985
986
static struct qreg
987
emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
988
int8_t input_idx, uint8_t swizzle, int array_index)
989
{
990
struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
991
struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
992
993
struct qinst *ldvary = NULL;
994
struct qreg vary;
995
if (c->devinfo->ver >= 41) {
996
ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
997
c->undef, c->undef);
998
ldvary->qpu.sig.ldvary = true;
999
vary = vir_emit_def(c, ldvary);
1000
} else {
1001
vir_NOP(c)->qpu.sig.ldvary = true;
1002
vary = r3;
1003
}
1004
1005
/* Store the input value before interpolation so we can implement
1006
* GLSL's interpolateAt functions if the shader uses them.
1007
*/
1008
if (input_idx >= 0) {
1009
assert(var);
1010
c->interp[input_idx].vp = vary;
1011
c->interp[input_idx].C = vir_MOV(c, r5);
1012
c->interp[input_idx].mode = var->data.interpolation;
1013
}
1014
1015
/* For gl_PointCoord input or distance along a line, we'll be called
1016
* with no nir_variable, and we don't count toward VPM size so we
1017
* don't track an input slot.
1018
*/
1019
if (!var) {
1020
assert(input_idx < 0);
1021
return emit_smooth_varying(c, vary, c->payload_w, r5);
1022
}
1023
1024
int i = c->num_inputs++;
1025
c->input_slots[i] =
1026
v3d_slot_from_slot_and_component(var->data.location +
1027
array_index, swizzle);
1028
1029
struct qreg result;
1030
switch (var->data.interpolation) {
1031
case INTERP_MODE_NONE:
1032
case INTERP_MODE_SMOOTH:
1033
if (var->data.centroid) {
1034
BITSET_SET(c->centroid_flags, i);
1035
result = emit_smooth_varying(c, vary,
1036
c->payload_w_centroid, r5);
1037
} else {
1038
result = emit_smooth_varying(c, vary, c->payload_w, r5);
1039
}
1040
break;
1041
1042
case INTERP_MODE_NOPERSPECTIVE:
1043
BITSET_SET(c->noperspective_flags, i);
1044
result = emit_noperspective_varying(c, vary, r5);
1045
break;
1046
1047
case INTERP_MODE_FLAT:
1048
BITSET_SET(c->flat_shade_flags, i);
1049
result = emit_flat_varying(c, vary, r5);
1050
break;
1051
1052
default:
1053
unreachable("Bad interp mode");
1054
}
1055
1056
if (input_idx >= 0)
1057
c->inputs[input_idx] = result;
1058
return result;
1059
}
1060
1061
static void
1062
emit_fragment_input(struct v3d_compile *c, int base_attr, nir_variable *var,
1063
int array_index, unsigned nelem)
1064
{
1065
for (int i = 0; i < nelem ; i++) {
1066
int chan = var->data.location_frac + i;
1067
int input_idx = (base_attr + array_index) * 4 + chan;
1068
emit_fragment_varying(c, var, input_idx, chan, array_index);
1069
}
1070
}
1071
1072
static void
1073
emit_compact_fragment_input(struct v3d_compile *c, int attr, nir_variable *var,
1074
int array_index)
1075
{
1076
/* Compact variables are scalar arrays where each set of 4 elements
1077
* consumes a single location.
1078
*/
1079
int loc_offset = array_index / 4;
1080
int chan = var->data.location_frac + array_index % 4;
1081
int input_idx = (attr + loc_offset) * 4 + chan;
1082
emit_fragment_varying(c, var, input_idx, chan, loc_offset);
1083
}
1084
1085
static void
1086
add_output(struct v3d_compile *c,
1087
uint32_t decl_offset,
1088
uint8_t slot,
1089
uint8_t swizzle)
1090
{
1091
uint32_t old_array_size = c->outputs_array_size;
1092
resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
1093
decl_offset + 1);
1094
1095
if (old_array_size != c->outputs_array_size) {
1096
c->output_slots = reralloc(c,
1097
c->output_slots,
1098
struct v3d_varying_slot,
1099
c->outputs_array_size);
1100
}
1101
1102
c->output_slots[decl_offset] =
1103
v3d_slot_from_slot_and_component(slot, swizzle);
1104
}
1105
1106
/**
1107
* If compare_instr is a valid comparison instruction, emits the
1108
* compare_instr's comparison and returns the sel_instr's return value based
1109
* on the compare_instr's result.
1110
*/
1111
static bool
1112
ntq_emit_comparison(struct v3d_compile *c,
1113
nir_alu_instr *compare_instr,
1114
enum v3d_qpu_cond *out_cond)
1115
{
1116
struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
1117
struct qreg src1;
1118
if (nir_op_infos[compare_instr->op].num_inputs > 1)
1119
src1 = ntq_get_alu_src(c, compare_instr, 1);
1120
bool cond_invert = false;
1121
struct qreg nop = vir_nop_reg();
1122
1123
switch (compare_instr->op) {
1124
case nir_op_feq32:
1125
case nir_op_seq:
1126
vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1127
break;
1128
case nir_op_ieq32:
1129
vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1130
break;
1131
1132
case nir_op_fneu32:
1133
case nir_op_sne:
1134
vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1135
cond_invert = true;
1136
break;
1137
case nir_op_ine32:
1138
vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1139
cond_invert = true;
1140
break;
1141
1142
case nir_op_fge32:
1143
case nir_op_sge:
1144
vir_set_pf(c, vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1145
break;
1146
case nir_op_ige32:
1147
vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1148
cond_invert = true;
1149
break;
1150
case nir_op_uge32:
1151
vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
1152
cond_invert = true;
1153
break;
1154
1155
case nir_op_slt:
1156
case nir_op_flt32:
1157
vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN);
1158
break;
1159
case nir_op_ilt32:
1160
vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1161
break;
1162
case nir_op_ult32:
1163
vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
1164
break;
1165
1166
case nir_op_i2b32:
1167
vir_set_pf(c, vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
1168
cond_invert = true;
1169
break;
1170
1171
case nir_op_f2b32:
1172
vir_set_pf(c, vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
1173
cond_invert = true;
1174
break;
1175
1176
default:
1177
return false;
1178
}
1179
1180
*out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA;
1181
1182
return true;
1183
}
1184
1185
/* Finds an ALU instruction that generates our src value that could
1186
* (potentially) be greedily emitted in the consuming instruction.
1187
*/
1188
static struct nir_alu_instr *
1189
ntq_get_alu_parent(nir_src src)
1190
{
1191
if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu)
1192
return NULL;
1193
nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
1194
if (!instr)
1195
return NULL;
1196
1197
/* If the ALU instr's srcs are non-SSA, then we would have to avoid
1198
* moving emission of the ALU instr down past another write of the
1199
* src.
1200
*/
1201
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1202
if (!instr->src[i].src.is_ssa)
1203
return NULL;
1204
}
1205
1206
return instr;
1207
}
1208
1209
/* Turns a NIR bool into a condition code to predicate on. */
1210
static enum v3d_qpu_cond
1211
ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src)
1212
{
1213
struct qreg qsrc = ntq_get_src(c, src, 0);
1214
/* skip if we already have src in the flags */
1215
if (qsrc.file == QFILE_TEMP && c->flags_temp == qsrc.index)
1216
return c->flags_cond;
1217
1218
nir_alu_instr *compare = ntq_get_alu_parent(src);
1219
if (!compare)
1220
goto out;
1221
1222
enum v3d_qpu_cond cond;
1223
if (ntq_emit_comparison(c, compare, &cond))
1224
return cond;
1225
1226
out:
1227
1228
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)),
1229
V3D_QPU_PF_PUSHZ);
1230
return V3D_QPU_COND_IFNA;
1231
}
1232
1233
static struct qreg
1234
ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond)
1235
{
1236
struct qreg result =
1237
vir_MOV(c, vir_SEL(c, cond,
1238
vir_uniform_ui(c, ~0),
1239
vir_uniform_ui(c, 0)));
1240
c->flags_temp = result.index;
1241
c->flags_cond = cond;
1242
return result;
1243
}
1244
1245
static void
1246
ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
1247
{
1248
/* This should always be lowered to ALU operations for V3D. */
1249
assert(!instr->dest.saturate);
1250
1251
/* Vectors are special in that they have non-scalarized writemasks,
1252
* and just take the first swizzle channel for each argument in order
1253
* into each writemask channel.
1254
*/
1255
if (instr->op == nir_op_vec2 ||
1256
instr->op == nir_op_vec3 ||
1257
instr->op == nir_op_vec4) {
1258
struct qreg srcs[4];
1259
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1260
srcs[i] = ntq_get_src(c, instr->src[i].src,
1261
instr->src[i].swizzle[0]);
1262
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1263
ntq_store_dest(c, &instr->dest.dest, i,
1264
vir_MOV(c, srcs[i]));
1265
return;
1266
}
1267
1268
/* General case: We can just grab the one used channel per src. */
1269
struct qreg src[nir_op_infos[instr->op].num_inputs];
1270
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1271
src[i] = ntq_get_alu_src(c, instr, i);
1272
}
1273
1274
struct qreg result;
1275
1276
switch (instr->op) {
1277
case nir_op_mov:
1278
result = vir_MOV(c, src[0]);
1279
break;
1280
1281
case nir_op_fneg:
1282
result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31));
1283
break;
1284
case nir_op_ineg:
1285
result = vir_NEG(c, src[0]);
1286
break;
1287
1288
case nir_op_fmul:
1289
result = vir_FMUL(c, src[0], src[1]);
1290
break;
1291
case nir_op_fadd:
1292
result = vir_FADD(c, src[0], src[1]);
1293
break;
1294
case nir_op_fsub:
1295
result = vir_FSUB(c, src[0], src[1]);
1296
break;
1297
case nir_op_fmin:
1298
result = vir_FMIN(c, src[0], src[1]);
1299
break;
1300
case nir_op_fmax:
1301
result = vir_FMAX(c, src[0], src[1]);
1302
break;
1303
1304
case nir_op_f2i32: {
1305
nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src);
1306
if (src0_alu && src0_alu->op == nir_op_fround_even) {
1307
result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0));
1308
} else {
1309
result = vir_FTOIZ(c, src[0]);
1310
}
1311
break;
1312
}
1313
1314
case nir_op_f2u32:
1315
result = vir_FTOUZ(c, src[0]);
1316
break;
1317
case nir_op_i2f32:
1318
result = vir_ITOF(c, src[0]);
1319
break;
1320
case nir_op_u2f32:
1321
result = vir_UTOF(c, src[0]);
1322
break;
1323
case nir_op_b2f32:
1324
result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
1325
break;
1326
case nir_op_b2i32:
1327
result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
1328
break;
1329
1330
case nir_op_iadd:
1331
result = vir_ADD(c, src[0], src[1]);
1332
break;
1333
case nir_op_ushr:
1334
result = vir_SHR(c, src[0], src[1]);
1335
break;
1336
case nir_op_isub:
1337
result = vir_SUB(c, src[0], src[1]);
1338
break;
1339
case nir_op_ishr:
1340
result = vir_ASR(c, src[0], src[1]);
1341
break;
1342
case nir_op_ishl:
1343
result = vir_SHL(c, src[0], src[1]);
1344
break;
1345
case nir_op_imin:
1346
result = vir_MIN(c, src[0], src[1]);
1347
break;
1348
case nir_op_umin:
1349
result = vir_UMIN(c, src[0], src[1]);
1350
break;
1351
case nir_op_imax:
1352
result = vir_MAX(c, src[0], src[1]);
1353
break;
1354
case nir_op_umax:
1355
result = vir_UMAX(c, src[0], src[1]);
1356
break;
1357
case nir_op_iand:
1358
result = vir_AND(c, src[0], src[1]);
1359
break;
1360
case nir_op_ior:
1361
result = vir_OR(c, src[0], src[1]);
1362
break;
1363
case nir_op_ixor:
1364
result = vir_XOR(c, src[0], src[1]);
1365
break;
1366
case nir_op_inot:
1367
result = vir_NOT(c, src[0]);
1368
break;
1369
1370
case nir_op_ufind_msb:
1371
result = vir_SUB(c, vir_uniform_ui(c, 31), vir_CLZ(c, src[0]));
1372
break;
1373
1374
case nir_op_imul:
1375
result = vir_UMUL(c, src[0], src[1]);
1376
break;
1377
1378
case nir_op_seq:
1379
case nir_op_sne:
1380
case nir_op_sge:
1381
case nir_op_slt: {
1382
enum v3d_qpu_cond cond;
1383
ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1384
assert(ok);
1385
result = vir_MOV(c, vir_SEL(c, cond,
1386
vir_uniform_f(c, 1.0),
1387
vir_uniform_f(c, 0.0)));
1388
c->flags_temp = result.index;
1389
c->flags_cond = cond;
1390
break;
1391
}
1392
1393
case nir_op_i2b32:
1394
case nir_op_f2b32:
1395
case nir_op_feq32:
1396
case nir_op_fneu32:
1397
case nir_op_fge32:
1398
case nir_op_flt32:
1399
case nir_op_ieq32:
1400
case nir_op_ine32:
1401
case nir_op_ige32:
1402
case nir_op_uge32:
1403
case nir_op_ilt32:
1404
case nir_op_ult32: {
1405
enum v3d_qpu_cond cond;
1406
ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1407
assert(ok);
1408
result = ntq_emit_cond_to_bool(c, cond);
1409
break;
1410
}
1411
1412
case nir_op_b32csel:
1413
result = vir_MOV(c,
1414
vir_SEL(c,
1415
ntq_emit_bool_to_cond(c, instr->src[0].src),
1416
src[1], src[2]));
1417
break;
1418
1419
case nir_op_fcsel:
1420
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), src[0]),
1421
V3D_QPU_PF_PUSHZ);
1422
result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
1423
src[1], src[2]));
1424
break;
1425
1426
case nir_op_frcp:
1427
result = vir_RECIP(c, src[0]);
1428
break;
1429
case nir_op_frsq:
1430
result = vir_RSQRT(c, src[0]);
1431
break;
1432
case nir_op_fexp2:
1433
result = vir_EXP(c, src[0]);
1434
break;
1435
case nir_op_flog2:
1436
result = vir_LOG(c, src[0]);
1437
break;
1438
1439
case nir_op_fceil:
1440
result = vir_FCEIL(c, src[0]);
1441
break;
1442
case nir_op_ffloor:
1443
result = vir_FFLOOR(c, src[0]);
1444
break;
1445
case nir_op_fround_even:
1446
result = vir_FROUND(c, src[0]);
1447
break;
1448
case nir_op_ftrunc:
1449
result = vir_FTRUNC(c, src[0]);
1450
break;
1451
1452
case nir_op_fsin:
1453
result = ntq_fsincos(c, src[0], false);
1454
break;
1455
case nir_op_fcos:
1456
result = ntq_fsincos(c, src[0], true);
1457
break;
1458
1459
case nir_op_fsign:
1460
result = ntq_fsign(c, src[0]);
1461
break;
1462
1463
case nir_op_fabs: {
1464
result = vir_FMOV(c, src[0]);
1465
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
1466
break;
1467
}
1468
1469
case nir_op_iabs:
1470
result = vir_MAX(c, src[0], vir_NEG(c, src[0]));
1471
break;
1472
1473
case nir_op_fddx:
1474
case nir_op_fddx_coarse:
1475
case nir_op_fddx_fine:
1476
result = vir_FDX(c, src[0]);
1477
break;
1478
1479
case nir_op_fddy:
1480
case nir_op_fddy_coarse:
1481
case nir_op_fddy_fine:
1482
result = vir_FDY(c, src[0]);
1483
break;
1484
1485
case nir_op_uadd_carry:
1486
vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
1487
V3D_QPU_PF_PUSHC);
1488
result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
1489
break;
1490
1491
case nir_op_pack_half_2x16_split:
1492
result = vir_VFPACK(c, src[0], src[1]);
1493
break;
1494
1495
case nir_op_unpack_half_2x16_split_x:
1496
result = vir_FMOV(c, src[0]);
1497
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1498
break;
1499
1500
case nir_op_unpack_half_2x16_split_y:
1501
result = vir_FMOV(c, src[0]);
1502
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H);
1503
break;
1504
1505
case nir_op_fquantize2f16: {
1506
/* F32 -> F16 -> F32 conversion */
1507
struct qreg tmp = vir_FMOV(c, src[0]);
1508
vir_set_pack(c->defs[tmp.index], V3D_QPU_PACK_L);
1509
tmp = vir_FMOV(c, tmp);
1510
vir_set_unpack(c->defs[tmp.index], 0, V3D_QPU_UNPACK_L);
1511
1512
/* Check for denorm */
1513
struct qreg abs_src = vir_FMOV(c, src[0]);
1514
vir_set_unpack(c->defs[abs_src.index], 0, V3D_QPU_UNPACK_ABS);
1515
struct qreg threshold = vir_uniform_f(c, ldexpf(1.0, -14));
1516
vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), abs_src, threshold),
1517
V3D_QPU_PF_PUSHC);
1518
1519
/* Return +/-0 for denorms */
1520
struct qreg zero =
1521
vir_AND(c, src[0], vir_uniform_ui(c, 0x80000000));
1522
result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero));
1523
break;
1524
}
1525
1526
default:
1527
fprintf(stderr, "unknown NIR ALU inst: ");
1528
nir_print_instr(&instr->instr, stderr);
1529
fprintf(stderr, "\n");
1530
abort();
1531
}
1532
1533
/* We have a scalar result, so the instruction should only have a
1534
* single channel written to.
1535
*/
1536
assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
1537
ntq_store_dest(c, &instr->dest.dest,
1538
ffs(instr->dest.write_mask) - 1, result);
1539
}
1540
1541
/* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
1542
* specifier. They come from a register that's preloaded with 0xffffffff
1543
* (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
1544
* 8 bits are shifted off the bottom and 0xff shifted in from the top.
1545
*/
1546
#define TLB_TYPE_F16_COLOR (3 << 6)
1547
#define TLB_TYPE_I32_COLOR (1 << 6)
1548
#define TLB_TYPE_F32_COLOR (0 << 6)
1549
#define TLB_RENDER_TARGET_SHIFT 3 /* Reversed! 7 = RT 0, 0 = RT 7. */
1550
#define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
1551
#define TLB_SAMPLE_MODE_PER_PIXEL (1 << 2)
1552
#define TLB_F16_SWAP_HI_LO (1 << 1)
1553
#define TLB_VEC_SIZE_4_F16 (1 << 0)
1554
#define TLB_VEC_SIZE_2_F16 (0 << 0)
1555
#define TLB_VEC_SIZE_MINUS_1_SHIFT 0
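/* Added note: these bits compose the 0xff default mentioned above. For a
 * plain vec4 f16 write to RT 0, per pixel:
 *   TLB_TYPE_F16_COLOR (3 << 6)        = 0xc0
 *   (7 - 0) << TLB_RENDER_TARGET_SHIFT = 0x38
 *   TLB_SAMPLE_MODE_PER_PIXEL          = 0x04
 *   TLB_F16_SWAP_HI_LO                 = 0x02
 *   TLB_VEC_SIZE_4_F16                 = 0x01
 * which ORs together to exactly 0xff.
 */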
1556
1557
/* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
1558
* flag is set.
1559
*/
1560
#define TLB_TYPE_DEPTH ((2 << 6) | (0 << 4))
1561
#define TLB_DEPTH_TYPE_INVARIANT (0 << 2) /* Unmodified sideband input used */
1562
#define TLB_DEPTH_TYPE_PER_PIXEL (1 << 2) /* QPU result used */
1563
#define TLB_V42_DEPTH_TYPE_INVARIANT (0 << 3) /* Unmodified sideband input used */
1564
#define TLB_V42_DEPTH_TYPE_PER_PIXEL (1 << 3) /* QPU result used */
1565
1566
/* Stencil is a single 32-bit write. */
1567
#define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4))
1568
1569
static void
1570
vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
1571
{
1572
if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
1573
return;
1574
1575
struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
1576
struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1577
1578
nir_variable *var = c->output_color_var[rt];
1579
int num_components = glsl_get_vector_elements(var->type);
1580
uint32_t conf = 0xffffff00;
1581
struct qinst *inst;
1582
1583
conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE :
1584
TLB_SAMPLE_MODE_PER_PIXEL;
1585
conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
1586
1587
if (c->fs_key->swap_color_rb & (1 << rt))
1588
num_components = MAX2(num_components, 3);
1589
assert(num_components != 0);
1590
1591
enum glsl_base_type type = glsl_get_base_type(var->type);
1592
bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT;
1593
bool is_32b_tlb_format = is_int_format ||
1594
(c->fs_key->f32_color_rb & (1 << rt));
1595
1596
if (is_int_format) {
1597
/* The F32 vs I32 distinction was dropped in 4.2. */
1598
if (c->devinfo->ver < 42)
1599
conf |= TLB_TYPE_I32_COLOR;
1600
else
1601
conf |= TLB_TYPE_F32_COLOR;
1602
conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT);
1603
} else {
1604
if (c->fs_key->f32_color_rb & (1 << rt)) {
1605
conf |= TLB_TYPE_F32_COLOR;
1606
conf |= ((num_components - 1) <<
1607
TLB_VEC_SIZE_MINUS_1_SHIFT);
1608
} else {
1609
conf |= TLB_TYPE_F16_COLOR;
1610
conf |= TLB_F16_SWAP_HI_LO;
1611
if (num_components >= 3)
1612
conf |= TLB_VEC_SIZE_4_F16;
1613
else
1614
conf |= TLB_VEC_SIZE_2_F16;
1615
}
1616
}
1617
1618
int num_samples = c->msaa_per_sample_output ? V3D_MAX_SAMPLES : 1;
1619
for (int i = 0; i < num_samples; i++) {
1620
struct qreg *color = c->msaa_per_sample_output ?
1621
&c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] :
1622
&c->outputs[var->data.driver_location * 4];
1623
1624
struct qreg r = color[0];
1625
struct qreg g = color[1];
1626
struct qreg b = color[2];
1627
struct qreg a = color[3];
1628
1629
if (c->fs_key->swap_color_rb & (1 << rt)) {
1630
r = color[2];
1631
b = color[0];
1632
}
1633
1634
if (c->fs_key->sample_alpha_to_one)
1635
a = vir_uniform_f(c, 1.0);
1636
1637
if (is_32b_tlb_format) {
1638
if (i == 0) {
1639
inst = vir_MOV_dest(c, tlbu_reg, r);
1640
inst->uniform =
1641
vir_get_uniform_index(c,
1642
QUNIFORM_CONSTANT,
1643
conf);
1644
} else {
1645
vir_MOV_dest(c, tlb_reg, r);
1646
}
1647
1648
if (num_components >= 2)
1649
vir_MOV_dest(c, tlb_reg, g);
1650
if (num_components >= 3)
1651
vir_MOV_dest(c, tlb_reg, b);
1652
if (num_components >= 4)
1653
vir_MOV_dest(c, tlb_reg, a);
1654
} else {
1655
inst = vir_VFPACK_dest(c, tlb_reg, r, g);
1656
if (conf != ~0 && i == 0) {
1657
inst->dst = tlbu_reg;
1658
inst->uniform =
1659
vir_get_uniform_index(c,
1660
QUNIFORM_CONSTANT,
1661
conf);
1662
}
1663
1664
if (num_components >= 3)
1665
vir_VFPACK_dest(c, tlb_reg, b, a);
1666
}
1667
}
1668
}
1669
1670
static void
1671
emit_frag_end(struct v3d_compile *c)
1672
{
1673
if (c->output_sample_mask_index != -1) {
1674
vir_SETMSF_dest(c, vir_nop_reg(),
1675
vir_AND(c,
1676
vir_MSF(c),
1677
c->outputs[c->output_sample_mask_index]));
1678
}
1679
1680
bool has_any_tlb_color_write = false;
1681
for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
1682
if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt])
1683
has_any_tlb_color_write = true;
1684
}
1685
1686
if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) {
1687
struct nir_variable *var = c->output_color_var[0];
1688
struct qreg *color = &c->outputs[var->data.driver_location * 4];
1689
1690
vir_SETMSF_dest(c, vir_nop_reg(),
1691
vir_AND(c,
1692
vir_MSF(c),
1693
vir_FTOC(c, color[3])));
1694
}
1695
1696
struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1697
if (c->output_position_index != -1) {
1698
struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
1699
c->outputs[c->output_position_index]);
1700
uint8_t tlb_specifier = TLB_TYPE_DEPTH;
1701
1702
if (c->devinfo->ver >= 42) {
1703
tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
1704
TLB_SAMPLE_MODE_PER_PIXEL);
1705
} else
1706
tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL;
1707
1708
inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
1709
tlb_specifier |
1710
0xffffff00);
1711
c->writes_z = true;
1712
} else if (c->s->info.fs.uses_discard ||
1713
!c->s->info.fs.early_fragment_tests ||
1714
c->fs_key->sample_alpha_to_coverage ||
1715
!has_any_tlb_color_write) {
1716
/* Emit passthrough Z if it needed to be delayed until shader
1717
* end due to potential discards.
1718
*
1719
* Since (single-threaded) fragment shaders always need a TLB
1720
* write, emit passthrough Z if we didn't have any color
1721
* buffers and flag us as potentially discarding, so that we
1722
* can use Z as the TLB write.
1723
*/
1724
c->s->info.fs.uses_discard = true;
1725
1726
struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
1727
vir_nop_reg());
1728
uint8_t tlb_specifier = TLB_TYPE_DEPTH;
1729
1730
if (c->devinfo->ver >= 42) {
1731
/* The spec says the PER_PIXEL flag is ignored for
1732
* invariant writes, but the simulator demands it.
1733
*/
1734
tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT |
1735
TLB_SAMPLE_MODE_PER_PIXEL);
1736
} else {
1737
tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT;
1738
}
1739
1740
inst->uniform = vir_get_uniform_index(c,
1741
QUNIFORM_CONSTANT,
1742
tlb_specifier |
1743
0xffffff00);
1744
c->writes_z = true;
1745
}
1746
1747
/* XXX: Performance improvement: Merge Z write and color writes TLB
1748
* uniform setup
1749
*/
1750
for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++)
1751
vir_emit_tlb_color_write(c, rt);
1752
}
1753
1754
static inline void
1755
vir_VPM_WRITE_indirect(struct v3d_compile *c,
1756
struct qreg val,
1757
struct qreg vpm_index,
1758
bool uniform_vpm_index)
1759
{
1760
assert(c->devinfo->ver >= 40);
1761
if (uniform_vpm_index)
1762
vir_STVPMV(c, vpm_index, val);
1763
else
1764
vir_STVPMD(c, vpm_index, val);
1765
}
1766
1767
static void
1768
vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
1769
{
1770
if (c->devinfo->ver >= 40) {
1771
vir_VPM_WRITE_indirect(c, val,
1772
vir_uniform_ui(c, vpm_index), true);
1773
} else {
1774
/* XXX: v3d33_vir_vpm_write_setup(c); */
1775
vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
1776
}
1777
}
1778
1779
static void
1780
emit_vert_end(struct v3d_compile *c)
1781
{
1782
/* GFXH-1684: VPM writes need to be complete by the end of the shader.
1783
*/
1784
if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
1785
vir_VPMWT(c);
1786
}
1787
1788
static void
1789
emit_geom_end(struct v3d_compile *c)
1790
{
1791
/* GFXH-1684: VPM writes need to be complete by the end of the shader.
1792
*/
1793
if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
1794
vir_VPMWT(c);
1795
}
1796
1797
static bool
1798
mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
1799
unsigned bit_size,
1800
unsigned num_components,
1801
nir_intrinsic_instr *low,
1802
nir_intrinsic_instr *high,
1803
void *data)
1804
{
1805
/* Our backend is 32-bit only at present */
1806
if (bit_size != 32)
1807
return false;
1808
1809
if (align_mul % 4 != 0 || align_offset % 4 != 0)
1810
return false;
1811
1812
/* Vector accesses wrap at 16-byte boundaries so we can't vectorize
1813
* if the resulting vector crosses a 16-byte boundary.
1814
*/
1815
assert(util_is_power_of_two_nonzero(align_mul));
1816
align_mul = MIN2(align_mul, 16);
1817
align_offset &= 0xf;
1818
if (16 - align_mul + align_offset + num_components * 4 > 16)
1819
return false;
1820
1821
return true;
1822
}
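/* Added worked example of the boundary check above: for a 4-byte-aligned
 * vec2 (align_mul = 4, align_offset = 0, num_components = 2) the test is
 * 16 - 4 + 0 + 8 = 20 > 16, so vectorization is rejected because the pair
 * could straddle a 16-byte boundary (e.g. at offset 12). With align_mul = 8
 * the same vec2 gives 16 - 8 + 0 + 8 = 16, which is allowed.
 */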
1823
1824
void
1825
v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
1826
{
1827
bool progress;
1828
unsigned lower_flrp =
1829
(s->options->lower_flrp16 ? 16 : 0) |
1830
(s->options->lower_flrp32 ? 32 : 0) |
1831
(s->options->lower_flrp64 ? 64 : 0);
1832
1833
do {
1834
progress = false;
1835
1836
NIR_PASS_V(s, nir_lower_vars_to_ssa);
1837
NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
1838
NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
1839
NIR_PASS(progress, s, nir_copy_prop);
1840
NIR_PASS(progress, s, nir_opt_remove_phis);
1841
NIR_PASS(progress, s, nir_opt_dce);
1842
NIR_PASS(progress, s, nir_opt_dead_cf);
1843
NIR_PASS(progress, s, nir_opt_cse);
1844
NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
1845
NIR_PASS(progress, s, nir_opt_algebraic);
1846
NIR_PASS(progress, s, nir_opt_constant_folding);
1847
1848
nir_load_store_vectorize_options vectorize_opts = {
1849
.modes = nir_var_mem_ssbo | nir_var_mem_ubo |
1850
nir_var_mem_push_const | nir_var_mem_shared |
1851
nir_var_mem_global,
1852
.callback = mem_vectorize_callback,
1853
.robust_modes = 0,
1854
};
1855
NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
1856
1857
if (lower_flrp != 0) {
1858
bool lower_flrp_progress = false;
1859
1860
NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
1861
lower_flrp,
1862
false /* always_precise */);
1863
if (lower_flrp_progress) {
1864
NIR_PASS(progress, s, nir_opt_constant_folding);
1865
progress = true;
1866
}
1867
1868
/* Nothing should rematerialize any flrps, so we only
1869
* need to do this lowering once.
1870
*/
1871
lower_flrp = 0;
1872
}
1873
1874
NIR_PASS(progress, s, nir_opt_undef);
1875
NIR_PASS(progress, s, nir_lower_undef_to_zero);
1876
1877
if (c && !c->disable_loop_unrolling &&
1878
s->options->max_unroll_iterations > 0) {
1879
bool local_progress = false;
1880
NIR_PASS(local_progress, s, nir_opt_loop_unroll,
1881
nir_var_shader_in |
1882
nir_var_function_temp);
1883
c->unrolled_any_loops |= local_progress;
1884
progress |= local_progress;
1885
}
1886
} while (progress);
1887
1888
nir_move_options sink_opts =
1889
nir_move_const_undef | nir_move_comparisons | nir_move_copies |
1890
nir_move_load_ubo;
1891
NIR_PASS(progress, s, nir_opt_sink, sink_opts);
1892
1893
NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo);
1894
}
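/* qsort-style comparator used to visit shader in/out variables in
 * increasing driver_location order, breaking ties on the component
 * (location_frac).
 */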
1895
1896
static int
1897
driver_location_compare(const nir_variable *a, const nir_variable *b)
1898
{
1899
return a->data.driver_location == b->data.driver_location ?
1900
a->data.location_frac - b->data.location_frac :
1901
a->data.driver_location - b->data.driver_location;
1902
}
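/* Reads one 32-bit component of vertex shader input from the VPM. On V3D
 * 4.x this is a plain LDVPMV at an explicit offset; on V3D 3.x reads have
 * to be set up in batches, so we program up to 32 components at a time and
 * hand them out one MOV per call until the queue is drained.
 */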
1903
1904
static struct qreg
1905
ntq_emit_vpm_read(struct v3d_compile *c,
1906
uint32_t *num_components_queued,
1907
uint32_t *remaining,
1908
uint32_t vpm_index)
1909
{
1910
struct qreg vpm = vir_reg(QFILE_VPM, vpm_index);
1911
1912
if (c->devinfo->ver >= 40) {
1913
return vir_LDVPMV_IN(c,
1914
vir_uniform_ui(c,
1915
(*num_components_queued)++));
1916
}
1917
1918
if (*num_components_queued != 0) {
1919
(*num_components_queued)--;
1920
return vir_MOV(c, vpm);
1921
}
1922
1923
uint32_t num_components = MIN2(*remaining, 32);
1924
1925
v3d33_vir_vpm_read_setup(c, num_components);
1926
1927
*num_components_queued = num_components - 1;
1928
*remaining -= num_components;
1929
1930
return vir_MOV(c, vpm);
1931
}
1932
1933
static void
1934
ntq_setup_vs_inputs(struct v3d_compile *c)
1935
{
1936
/* Figure out how many components of each vertex attribute the shader
1937
* uses. Each variable should have been split to individual
1938
* components and unused ones DCEed. The vertex fetcher will load
1939
* from the start of the attribute to the number of components we
1940
* declare we need in c->vattr_sizes[].
1941
*
1942
* BGRA vertex attributes are a bit special: since we implement these
1943
* as RGBA, swapping the R/B components, we always need at least 3 components
1944
* if component 0 is read.
1945
*/
1946
nir_foreach_shader_in_variable(var, c->s) {
1947
/* No VS attribute array support. */
1948
assert(MAX2(glsl_get_length(var->type), 1) == 1);
1949
1950
unsigned loc = var->data.driver_location;
1951
int start_component = var->data.location_frac;
1952
int num_components = glsl_get_components(var->type);
1953
1954
c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc],
1955
start_component + num_components);
1956
1957
/* Handle BGRA inputs */
1958
if (start_component == 0 &&
1959
c->vs_key->va_swap_rb_mask & (1 << var->data.location)) {
1960
c->vattr_sizes[loc] = MAX2(3, c->vattr_sizes[loc]);
1961
}
1962
}
1963
1964
unsigned num_components = 0;
1965
uint32_t vpm_components_queued = 0;
1966
bool uses_iid = BITSET_TEST(c->s->info.system_values_read,
1967
SYSTEM_VALUE_INSTANCE_ID) ||
1968
BITSET_TEST(c->s->info.system_values_read,
1969
SYSTEM_VALUE_INSTANCE_INDEX);
1970
bool uses_biid = BITSET_TEST(c->s->info.system_values_read,
1971
SYSTEM_VALUE_BASE_INSTANCE);
1972
bool uses_vid = BITSET_TEST(c->s->info.system_values_read,
1973
SYSTEM_VALUE_VERTEX_ID) ||
1974
BITSET_TEST(c->s->info.system_values_read,
1975
SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
1976
1977
num_components += uses_iid;
1978
num_components += uses_biid;
1979
num_components += uses_vid;
1980
1981
for (int i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++)
1982
num_components += c->vattr_sizes[i];
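/* Note: the read order below (instance id, base instance, vertex id, then
 * the per-attribute components) is assumed to mirror the index computation
 * in ntq_emit_load_input() for V3D 4.x.
 */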
1983
1984
if (uses_iid) {
1985
c->iid = ntq_emit_vpm_read(c, &vpm_components_queued,
1986
&num_components, ~0);
1987
}
1988
1989
if (uses_biid) {
1990
c->biid = ntq_emit_vpm_read(c, &vpm_components_queued,
1991
&num_components, ~0);
1992
}
1993
1994
if (uses_vid) {
1995
c->vid = ntq_emit_vpm_read(c, &vpm_components_queued,
1996
&num_components, ~0);
1997
}
1998
1999
/* The actual loads will happen directly in nir_intrinsic_load_input
2000
* on newer versions.
2001
*/
2002
if (c->devinfo->ver >= 40)
2003
return;
2004
2005
for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) {
2006
resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2007
(loc + 1) * 4);
2008
2009
for (int i = 0; i < c->vattr_sizes[loc]; i++) {
2010
c->inputs[loc * 4 + i] =
2011
ntq_emit_vpm_read(c,
2012
&vpm_components_queued,
2013
&num_components,
2014
loc * 4 + i);
2015
2016
}
2017
}
2018
2019
if (c->devinfo->ver >= 40) {
2020
assert(vpm_components_queued == num_components);
2021
} else {
2022
assert(vpm_components_queued == 0);
2023
assert(num_components == 0);
2024
}
2025
}
2026
2027
static bool
2028
program_reads_point_coord(struct v3d_compile *c)
2029
{
2030
nir_foreach_shader_in_variable(var, c->s) {
2031
if (util_varying_is_point_coord(var->data.location,
2032
c->fs_key->point_sprite_mask)) {
2033
return true;
2034
}
2035
}
2036
2037
return false;
2038
}
2039
2040
static void
2041
ntq_setup_gs_inputs(struct v3d_compile *c)
2042
{
2043
nir_sort_variables_with_modes(c->s, driver_location_compare,
2044
nir_var_shader_in);
2045
2046
nir_foreach_shader_in_variable(var, c->s) {
2047
/* All GS inputs are arrays with as many entries as vertices
2048
* in the input primitive, but here we only care about the
2049
* per-vertex input type.
2050
*/
2051
assert(glsl_type_is_array(var->type));
2052
const struct glsl_type *type = glsl_get_array_element(var->type);
2053
unsigned array_len = MAX2(glsl_get_length(type), 1);
2054
unsigned loc = var->data.driver_location;
2055
2056
resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2057
(loc + array_len) * 4);
2058
2059
if (var->data.compact) {
2060
for (unsigned j = 0; j < array_len; j++) {
2061
unsigned input_idx = c->num_inputs++;
2062
unsigned loc_frac = var->data.location_frac + j;
2063
unsigned loc = var->data.location + loc_frac / 4;
2064
unsigned comp = loc_frac % 4;
2065
c->input_slots[input_idx] =
2066
v3d_slot_from_slot_and_component(loc, comp);
2067
}
2068
continue;
2069
}
2070
2071
for (unsigned j = 0; j < array_len; j++) {
2072
unsigned num_elements = glsl_get_vector_elements(type);
2073
for (unsigned k = 0; k < num_elements; k++) {
2074
unsigned chan = var->data.location_frac + k;
2075
unsigned input_idx = c->num_inputs++;
2076
struct v3d_varying_slot slot =
2077
v3d_slot_from_slot_and_component(var->data.location + j, chan);
2078
c->input_slots[input_idx] = slot;
2079
}
2080
}
2081
}
2082
}
2083
2084
2085
static void
2086
ntq_setup_fs_inputs(struct v3d_compile *c)
2087
{
2088
nir_sort_variables_with_modes(c->s, driver_location_compare,
2089
nir_var_shader_in);
2090
2091
nir_foreach_shader_in_variable(var, c->s) {
2092
unsigned var_len = glsl_count_vec4_slots(var->type, false, false);
2093
unsigned loc = var->data.driver_location;
2094
2095
uint32_t inputs_array_size = c->inputs_array_size;
2096
uint32_t inputs_array_required_size = (loc + var_len) * 4;
2097
resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2098
inputs_array_required_size);
2099
resize_interp_array(c, &c->interp, &inputs_array_size,
2100
inputs_array_required_size);
2101
2102
if (var->data.location == VARYING_SLOT_POS) {
2103
emit_fragcoord_input(c, loc);
2104
} else if (var->data.location == VARYING_SLOT_PRIMITIVE_ID &&
2105
!c->fs_key->has_gs) {
2106
/* If the fragment shader reads gl_PrimitiveID and we
2107
* don't have a geometry shader in the pipeline to write
2108
* it, then we program the hardware to inject it as
2109
* an implicit varying. Take it from there.
2110
*/
2111
c->inputs[loc * 4] = c->primitive_id;
2112
} else if (util_varying_is_point_coord(var->data.location,
2113
c->fs_key->point_sprite_mask)) {
2114
c->inputs[loc * 4 + 0] = c->point_x;
2115
c->inputs[loc * 4 + 1] = c->point_y;
2116
} else if (var->data.compact) {
2117
for (int j = 0; j < var_len; j++)
2118
emit_compact_fragment_input(c, loc, var, j);
2119
} else if (glsl_type_is_struct(var->type)) {
2120
for (int j = 0; j < var_len; j++) {
2121
emit_fragment_input(c, loc, var, j, 4);
2122
}
2123
} else {
2124
for (int j = 0; j < var_len; j++) {
2125
emit_fragment_input(c, loc, var, j, glsl_get_vector_elements(var->type));
2126
}
2127
}
2128
}
2129
}
2130
2131
static void
2132
ntq_setup_outputs(struct v3d_compile *c)
2133
{
2134
if (c->s->info.stage != MESA_SHADER_FRAGMENT)
2135
return;
2136
2137
nir_foreach_shader_out_variable(var, c->s) {
2138
unsigned array_len = MAX2(glsl_get_length(var->type), 1);
2139
unsigned loc = var->data.driver_location * 4;
2140
2141
assert(array_len == 1);
2142
(void)array_len;
2143
2144
for (int i = 0; i < 4 - var->data.location_frac; i++) {
2145
add_output(c, loc + var->data.location_frac + i,
2146
var->data.location,
2147
var->data.location_frac + i);
2148
}
2149
2150
switch (var->data.location) {
2151
case FRAG_RESULT_COLOR:
2152
c->output_color_var[0] = var;
2153
c->output_color_var[1] = var;
2154
c->output_color_var[2] = var;
2155
c->output_color_var[3] = var;
2156
break;
2157
case FRAG_RESULT_DATA0:
2158
case FRAG_RESULT_DATA1:
2159
case FRAG_RESULT_DATA2:
2160
case FRAG_RESULT_DATA3:
2161
c->output_color_var[var->data.location -
2162
FRAG_RESULT_DATA0] = var;
2163
break;
2164
case FRAG_RESULT_DEPTH:
2165
c->output_position_index = loc;
2166
break;
2167
case FRAG_RESULT_SAMPLE_MASK:
2168
c->output_sample_mask_index = loc;
2169
break;
2170
}
2171
}
2172
}
2173
2174
/**
2175
* Sets up the mapping from nir_register to struct qreg *.
2176
*
2177
* Each nir_register gets a struct qreg per 32-bit component being stored.
2178
*/
2179
static void
2180
ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
2181
{
2182
foreach_list_typed(nir_register, nir_reg, node, list) {
2183
unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
2184
struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
2185
array_len *
2186
nir_reg->num_components);
2187
2188
_mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
2189
2190
for (int i = 0; i < array_len * nir_reg->num_components; i++)
2191
qregs[i] = vir_get_temp(c);
2192
}
2193
}
2194
2195
static void
2196
ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
2197
{
2198
/* XXX perf: Experiment with using immediate loads to avoid having
2199
* these end up in the uniform stream. Watch out for breaking the
2200
* small immediates optimization in the process!
2201
*/
2202
struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
2203
for (int i = 0; i < instr->def.num_components; i++)
2204
qregs[i] = vir_uniform_ui(c, instr->value[i].u32);
2205
2206
_mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
2207
}
2208
2209
static void
2210
ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
2211
{
2212
unsigned image_index = nir_src_as_uint(instr->src[0]);
2213
bool is_array = nir_intrinsic_image_array(instr);
2214
2215
assert(nir_src_as_uint(instr->src[1]) == 0);
2216
2217
ntq_store_dest(c, &instr->dest, 0,
2218
vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));
2219
if (instr->num_components > 1) {
2220
ntq_store_dest(c, &instr->dest, 1,
2221
vir_uniform(c,
2222
instr->num_components == 2 && is_array ?
2223
QUNIFORM_IMAGE_ARRAY_SIZE :
2224
QUNIFORM_IMAGE_HEIGHT,
2225
image_index));
2226
}
2227
if (instr->num_components > 2) {
2228
ntq_store_dest(c, &instr->dest, 2,
2229
vir_uniform(c,
2230
is_array ?
2231
QUNIFORM_IMAGE_ARRAY_SIZE :
2232
QUNIFORM_IMAGE_DEPTH,
2233
image_index));
2234
}
2235
}
2236
2237
static void
2238
vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
2239
{
2240
assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
2241
2242
int rt = nir_src_as_uint(instr->src[0]);
2243
assert(rt < V3D_MAX_DRAW_BUFFERS);
2244
2245
int sample_index = nir_intrinsic_base(instr);
2246
assert(sample_index < V3D_MAX_SAMPLES);
2247
2248
int component = nir_intrinsic_component(instr);
2249
assert(component < 4);
2250
2251
/* We need to emit our TLB reads after we have acquired the scoreboard
2252
* lock, or the GPU will hang. Usually, we do our scoreboard locking on
2253
* the last thread switch to improve parallelism; however, that is only
2254
* guaranteed to happen before the tlb color writes.
2255
*
2256
* To fix that, we make sure we always emit a thread switch before the
2257
* first tlb color read. If that happens to be the last thread switch
2258
* we emit, then everything is fine, but otherwise, if any code after
2259
* this point needs to emit additional thread switches, then we will
2260
* switch the strategy to locking the scoreboard on the first thread
2261
* switch instead -- see vir_emit_thrsw().
2262
*/
2263
if (!c->emitted_tlb_load) {
2264
if (!c->last_thrsw_at_top_level) {
2265
assert(c->devinfo->ver >= 41);
2266
vir_emit_thrsw(c);
2267
}
2268
2269
c->emitted_tlb_load = true;
2270
}
2271
2272
struct qreg *color_reads_for_sample =
2273
&c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
2274
2275
if (color_reads_for_sample[component].file == QFILE_NULL) {
2276
enum pipe_format rt_format = c->fs_key->color_fmt[rt].format;
2277
int num_components =
2278
util_format_get_nr_components(rt_format);
2279
2280
const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt);
2281
if (swap_rb)
2282
num_components = MAX2(num_components, 3);
2283
2284
nir_variable *var = c->output_color_var[rt];
2285
enum glsl_base_type type = glsl_get_base_type(var->type);
2286
2287
bool is_int_format = type == GLSL_TYPE_INT ||
2288
type == GLSL_TYPE_UINT;
2289
2290
bool is_32b_tlb_format = is_int_format ||
2291
(c->fs_key->f32_color_rb & (1 << rt));
2292
2293
int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1;
2294
2295
uint32_t conf = 0xffffff00;
2296
conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE :
2297
TLB_SAMPLE_MODE_PER_PIXEL;
2298
conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
2299
2300
if (is_32b_tlb_format) {
2301
/* The F32 vs I32 distinction was dropped in 4.2. */
2302
conf |= (c->devinfo->ver < 42 && is_int_format) ?
2303
TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR;
2304
2305
conf |= ((num_components - 1) <<
2306
TLB_VEC_SIZE_MINUS_1_SHIFT);
2307
} else {
2308
conf |= TLB_TYPE_F16_COLOR;
2309
conf |= TLB_F16_SWAP_HI_LO;
2310
2311
if (num_components >= 3)
2312
conf |= TLB_VEC_SIZE_4_F16;
2313
else
2314
conf |= TLB_VEC_SIZE_2_F16;
2315
}
2316
2317
2318
for (int i = 0; i < num_samples; i++) {
2319
struct qreg r, g, b, a;
2320
if (is_32b_tlb_format) {
2321
r = conf != 0xffffffff && i == 0 ?
2322
vir_TLBU_COLOR_READ(c, conf) :
2323
vir_TLB_COLOR_READ(c);
2324
if (num_components >= 2)
2325
g = vir_TLB_COLOR_READ(c);
2326
if (num_components >= 3)
2327
b = vir_TLB_COLOR_READ(c);
2328
if (num_components >= 4)
2329
a = vir_TLB_COLOR_READ(c);
2330
} else {
2331
struct qreg rg = conf != 0xffffffff && i == 0 ?
2332
vir_TLBU_COLOR_READ(c, conf) :
2333
vir_TLB_COLOR_READ(c);
2334
r = vir_FMOV(c, rg);
2335
vir_set_unpack(c->defs[r.index], 0,
2336
V3D_QPU_UNPACK_L);
2337
g = vir_FMOV(c, rg);
2338
vir_set_unpack(c->defs[g.index], 0,
2339
V3D_QPU_UNPACK_H);
2340
2341
if (num_components > 2) {
2342
struct qreg ba = vir_TLB_COLOR_READ(c);
2343
b = vir_FMOV(c, ba);
2344
vir_set_unpack(c->defs[b.index], 0,
2345
V3D_QPU_UNPACK_L);
2346
a = vir_FMOV(c, ba);
2347
vir_set_unpack(c->defs[a.index], 0,
2348
V3D_QPU_UNPACK_H);
2349
}
2350
}
2351
2352
struct qreg *color_reads =
2353
&c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4];
2354
2355
color_reads[0] = swap_rb ? b : r;
2356
if (num_components >= 2)
2357
color_reads[1] = g;
2358
if (num_components >= 3)
2359
color_reads[2] = swap_rb ? r : b;
2360
if (num_components >= 4)
2361
color_reads[3] = a;
2362
}
2363
}
2364
2365
assert(color_reads_for_sample[component].file != QFILE_NULL);
2366
ntq_store_dest(c, &instr->dest, 0,
2367
vir_MOV(c, color_reads_for_sample[component]));
2368
}
2369
2370
static void
2371
ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
2372
{
2373
if (nir_src_is_const(instr->src[0])) {
2374
int offset = (nir_intrinsic_base(instr) +
2375
nir_src_as_uint(instr->src[0]));
2376
assert(offset % 4 == 0);
2377
/* We need dwords */
2378
offset = offset / 4;
2379
for (int i = 0; i < instr->num_components; i++) {
2380
ntq_store_dest(c, &instr->dest, i,
2381
vir_uniform(c, QUNIFORM_UNIFORM,
2382
offset + i));
2383
}
2384
} else {
2385
ntq_emit_tmu_general(c, instr, false);
2386
}
2387
}
2388
2389
static void
2390
ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
2391
{
2392
/* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset).
2393
*
2394
* Right now the driver sets PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR even
2395
* if we don't support non-uniform offsets because we also set the
2396
* lower_all_io_to_temps option in the NIR compiler. This ensures that
2397
* any indirect indexing on in/out variables is turned into indirect
2398
* indexing on temporary variables instead, that we handle by lowering
2399
* to scratch. If we implement non-uniform offset here we might be able
2400
* to avoid the temp and scratch lowering, which involves copying from
2401
* the input to the temp variable, possibly making code more optimal.
2402
*/
2403
unsigned offset =
2404
nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);
2405
2406
if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) {
2407
/* Emit the LDVPM directly now, rather than at the top
2408
* of the shader like we did for V3D 3.x (which needs
2409
* vpmsetup when not just taking the next offset).
2410
*
2411
* Note that delaying like this may introduce stalls,
2412
* as LDVPMV takes a minimum of 1 instruction but may
2413
* be slower if the VPM unit is busy with another QPU.
2414
*/
2415
int index = 0;
2416
if (BITSET_TEST(c->s->info.system_values_read,
2417
SYSTEM_VALUE_INSTANCE_ID)) {
2418
index++;
2419
}
2420
if (BITSET_TEST(c->s->info.system_values_read,
2421
SYSTEM_VALUE_BASE_INSTANCE)) {
2422
index++;
2423
}
2424
if (BITSET_TEST(c->s->info.system_values_read,
2425
SYSTEM_VALUE_VERTEX_ID)) {
2426
index++;
2427
}
2428
for (int i = 0; i < offset; i++)
2429
index += c->vattr_sizes[i];
2430
index += nir_intrinsic_component(instr);
2431
for (int i = 0; i < instr->num_components; i++) {
2432
struct qreg vpm_offset = vir_uniform_ui(c, index++);
2433
ntq_store_dest(c, &instr->dest, i,
2434
vir_LDVPMV_IN(c, vpm_offset));
2435
}
2436
} else {
2437
for (int i = 0; i < instr->num_components; i++) {
2438
int comp = nir_intrinsic_component(instr) + i;
2439
ntq_store_dest(c, &instr->dest, i,
2440
vir_MOV(c, c->inputs[offset * 4 + comp]));
2441
}
2442
}
2443
}
2444
2445
static void
2446
ntq_emit_per_sample_color_write(struct v3d_compile *c,
2447
nir_intrinsic_instr *instr)
2448
{
2449
assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d);
2450
2451
unsigned rt = nir_src_as_uint(instr->src[1]);
2452
assert(rt < V3D_MAX_DRAW_BUFFERS);
2453
2454
unsigned sample_idx = nir_intrinsic_base(instr);
2455
assert(sample_idx < V3D_MAX_SAMPLES);
2456
2457
unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4;
2458
for (int i = 0; i < instr->num_components; i++) {
2459
c->sample_colors[offset + i] =
2460
vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2461
}
2462
}
2463
2464
static void
2465
ntq_emit_color_write(struct v3d_compile *c,
2466
nir_intrinsic_instr *instr)
2467
{
2468
unsigned offset = (nir_intrinsic_base(instr) +
2469
nir_src_as_uint(instr->src[1])) * 4 +
2470
nir_intrinsic_component(instr);
2471
for (int i = 0; i < instr->num_components; i++) {
2472
c->outputs[offset + i] =
2473
vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2474
}
2475
}
2476
2477
static void
2478
emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2479
{
2480
assert(instr->num_components == 1);
2481
2482
struct qreg offset = ntq_get_src(c, instr->src[1], 0);
2483
2484
uint32_t base_offset = nir_intrinsic_base(instr);
2485
2486
if (base_offset)
2487
offset = vir_ADD(c, vir_uniform_ui(c, base_offset), offset);
2488
2489
/* Usually, for VS or FS, we only emit outputs once at program end so
2490
* our VPM writes are never in non-uniform control flow, but this
2491
* is not true for GS, where we are emitting multiple vertices.
2492
*/
2493
if (vir_in_nonuniform_control_flow(c)) {
2494
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
2495
V3D_QPU_PF_PUSHZ);
2496
}
2497
2498
struct qreg val = ntq_get_src(c, instr->src[0], 0);
2499
2500
/* The offset isn’t necessarily dynamically uniform for a geometry
2501
* shader. This can happen if the shader sometimes doesn’t emit one of
2502
* the vertices. In that case subsequent vertices will be written to
2503
* different offsets in the VPM and we need to use the scatter write
2504
* instruction to have a different offset for each lane.
2505
*/
2506
bool is_uniform_offset =
2507
!vir_in_nonuniform_control_flow(c) &&
2508
!nir_src_is_divergent(instr->src[1]);
2509
vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2510
2511
if (vir_in_nonuniform_control_flow(c)) {
2512
struct qinst *last_inst =
2513
(struct qinst *)c->cur_block->instructions.prev;
2514
vir_set_cond(last_inst, V3D_QPU_COND_IFA);
2515
}
2516
}
2517
2518
static void
2519
emit_store_output_vs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2520
{
2521
assert(c->s->info.stage == MESA_SHADER_VERTEX);
2522
assert(instr->num_components == 1);
2523
2524
uint32_t base = nir_intrinsic_base(instr);
2525
struct qreg val = ntq_get_src(c, instr->src[0], 0);
2526
2527
if (nir_src_is_const(instr->src[1])) {
2528
vir_VPM_WRITE(c, val,
2529
base + nir_src_as_uint(instr->src[1]));
2530
} else {
2531
struct qreg offset = vir_ADD(c,
2532
ntq_get_src(c, instr->src[1], 1),
2533
vir_uniform_ui(c, base));
2534
bool is_uniform_offset =
2535
!vir_in_nonuniform_control_flow(c) &&
2536
!nir_src_is_divergent(instr->src[1]);
2537
vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2538
}
2539
}
2540
2541
static void
2542
ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
2543
{
2544
if (c->s->info.stage == MESA_SHADER_FRAGMENT)
2545
ntq_emit_color_write(c, instr);
2546
else if (c->s->info.stage == MESA_SHADER_GEOMETRY)
2547
emit_store_output_gs(c, instr);
2548
else
2549
emit_store_output_vs(c, instr);
2550
}
2551
2552
/**
2553
* This implementation is based on v3d_sample_{x,y}_offset() from
2554
* v3d_sample_offset.h.
2555
*/
2556
static void
2557
ntq_get_sample_offset(struct v3d_compile *c, struct qreg sample_idx,
2558
struct qreg *sx, struct qreg *sy)
2559
{
2560
sample_idx = vir_ITOF(c, sample_idx);
2561
2562
struct qreg offset_x =
2563
vir_FADD(c, vir_uniform_f(c, -0.125f),
2564
vir_FMUL(c, sample_idx,
2565
vir_uniform_f(c, 0.5f)));
2566
vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(),
2567
vir_uniform_f(c, 2.0f), sample_idx),
2568
V3D_QPU_PF_PUSHC);
2569
offset_x = vir_SEL(c, V3D_QPU_COND_IFA,
2570
vir_FSUB(c, offset_x, vir_uniform_f(c, 1.25f)),
2571
offset_x);
2572
2573
struct qreg offset_y =
2574
vir_FADD(c, vir_uniform_f(c, -0.375f),
2575
vir_FMUL(c, sample_idx,
2576
vir_uniform_f(c, 0.25f)));
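/* Illustrative note: the formulas above are offset_x = -0.125 + 0.5 * i
 * (wrapped back by 1.25 once the index reaches 2) and offset_y = -0.375 +
 * 0.25 * i, which for samples 0..3 works out to offsets of roughly
 * (-0.125, -0.375), (0.375, -0.125), (-0.375, 0.125) and (0.125, 0.375)
 * from the pixel center.
 */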
2577
*sx = offset_x;
2578
*sy = offset_y;
2579
}
2580
2581
/**
2582
* This implementation is based on get_centroid_offset() from fep.c.
2583
*/
2584
static void
2585
ntq_get_barycentric_centroid(struct v3d_compile *c,
2586
struct qreg *out_x,
2587
struct qreg *out_y)
2588
{
2589
struct qreg sample_mask;
2590
if (c->output_sample_mask_index != -1)
2591
sample_mask = c->outputs[c->output_sample_mask_index];
2592
else
2593
sample_mask = vir_MSF(c);
2594
2595
struct qreg i0 = vir_uniform_ui(c, 0);
2596
struct qreg i1 = vir_uniform_ui(c, 1);
2597
struct qreg i2 = vir_uniform_ui(c, 2);
2598
struct qreg i3 = vir_uniform_ui(c, 3);
2599
struct qreg i4 = vir_uniform_ui(c, 4);
2600
struct qreg i8 = vir_uniform_ui(c, 8);
2601
2602
/* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */
2603
struct qreg F = vir_uniform_ui(c, 0);
2604
struct qreg T = vir_uniform_ui(c, ~0);
2605
struct qreg s0 = vir_XOR(c, vir_AND(c, sample_mask, i1), i1);
2606
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
2607
s0 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2608
struct qreg s1 = vir_XOR(c, vir_AND(c, sample_mask, i2), i2);
2609
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
2610
s1 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2611
struct qreg s2 = vir_XOR(c, vir_AND(c, sample_mask, i4), i4);
2612
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
2613
s2 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2614
struct qreg s3 = vir_XOR(c, vir_AND(c, sample_mask, i8), i8);
2615
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ);
2616
s3 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2617
2618
/* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */
2619
struct qreg sample_idx = i3;
2620
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
2621
sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i1, sample_idx);
2622
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
2623
sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i2, sample_idx);
2624
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
2625
sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i0, sample_idx);
2626
2627
/* Get offset at selected sample index */
2628
struct qreg offset_x, offset_y;
2629
ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
2630
2631
/* Select pixel center [offset=(0,0)] if two opposing samples (or none)
2632
* are selected.
2633
*/
2634
struct qreg s0_and_s3 = vir_AND(c, s0, s3);
2635
struct qreg s1_and_s2 = vir_AND(c, s1, s2);
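/* Illustrative example: with a sample_mask of 0x6 (samples 1 and 2
 * covered), s1 and s2 are TRUE, so sample 2 is selected above; but because
 * s1/s2 form an opposing pair, the code below falls back to the pixel
 * center.
 */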
2636
2637
struct qreg use_center = vir_XOR(c, sample_mask, vir_uniform_ui(c, 0));
2638
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
2639
use_center = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2640
use_center = vir_OR(c, use_center, s0_and_s3);
2641
use_center = vir_OR(c, use_center, s1_and_s2);
2642
2643
struct qreg zero = vir_uniform_f(c, 0.0f);
2644
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
2645
offset_x = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_x);
2646
offset_y = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_y);
2647
2648
*out_x = offset_x;
2649
*out_y = offset_y;
2650
}
2651
2652
static struct qreg
2653
ntq_emit_load_interpolated_input(struct v3d_compile *c,
2654
struct qreg p,
2655
struct qreg C,
2656
struct qreg offset_x,
2657
struct qreg offset_y,
2658
unsigned mode)
2659
{
2660
if (mode == INTERP_MODE_FLAT)
2661
return C;
2662
2663
struct qreg sample_offset_x =
2664
vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
2665
struct qreg sample_offset_y =
2666
vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
2667
2668
struct qreg scaleX =
2669
vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_x),
2670
offset_x);
2671
struct qreg scaleY =
2672
vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_y),
2673
offset_y);
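/* scaleX/scaleY are the displacements from the position where the input
 * was originally evaluated (the current sample position within the pixel)
 * to the requested position (pixel center plus the caller-supplied
 * offset); the derivatives below re-evaluate the varying at that point.
 */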
2674
2675
struct qreg pInterp =
2676
vir_FADD(c, p, vir_FADD(c, vir_FMUL(c, vir_FDX(c, p), scaleX),
2677
vir_FMUL(c, vir_FDY(c, p), scaleY)));
2678
2679
if (mode == INTERP_MODE_NOPERSPECTIVE)
2680
return vir_FADD(c, pInterp, C);
2681
2682
struct qreg w = c->payload_w;
2683
struct qreg wInterp =
2684
vir_FADD(c, w, vir_FADD(c, vir_FMUL(c, vir_FDX(c, w), scaleX),
2685
vir_FMUL(c, vir_FDY(c, w), scaleY)));
2686
2687
return vir_FADD(c, vir_FMUL(c, pInterp, wInterp), C);
2688
}
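/* Emits a single ldunifa, which reads 32 bits from the address currently
 * held in the unifa register and then auto-increments that address by 4
 * bytes. Passing a NULL result turns this into a dummy read used only to
 * advance the address.
 */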
2689
2690
static void
2691
emit_ldunifa(struct v3d_compile *c, struct qreg *result)
2692
{
2693
struct qinst *ldunifa =
2694
vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef);
2695
ldunifa->qpu.sig.ldunifa = true;
2696
if (result)
2697
*result = vir_emit_def(c, ldunifa);
2698
else
2699
vir_emit_nondef(c, ldunifa);
2700
c->current_unifa_offset += 4;
2701
}
2702
2703
static void
2704
ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
2705
{
2706
/* Every ldunifa auto-increments the unifa address by 4 bytes, so our
2707
* current unifa offset is 4 bytes ahead of the offset of the last load.
2708
*/
2709
static const int32_t max_unifa_skip_dist =
2710
MAX_UNIFA_SKIP_DISTANCE - 4;
2711
2712
bool dynamic_src = !nir_src_is_const(instr->src[1]);
2713
uint32_t const_offset =
2714
dynamic_src ? 0 : nir_src_as_uint(instr->src[1]);
2715
2716
/* On OpenGL, QUNIFORM_UBO_ADDR takes a UBO index
2717
* shifted up by 1 (0 is gallium's constant buffer 0).
2718
*/
2719
uint32_t index = nir_src_as_uint(instr->src[0]);
2720
if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
2721
index++;
2722
2723
/* We can only keep track of the last unifa address we used with
2724
* constant offset loads. If the new load targets the same UBO and
2725
* is close enough to the previous load, we can skip the unifa register
2726
* write by emitting dummy ldunifa instructions to update the unifa
2727
* address.
2728
*/
2729
bool skip_unifa = false;
2730
uint32_t ldunifa_skips = 0;
2731
if (dynamic_src) {
2732
c->current_unifa_block = NULL;
2733
} else if (c->cur_block == c->current_unifa_block &&
2734
c->current_unifa_index == index &&
2735
c->current_unifa_offset <= const_offset &&
2736
c->current_unifa_offset + max_unifa_skip_dist >= const_offset) {
2737
skip_unifa = true;
2738
ldunifa_skips = (const_offset - c->current_unifa_offset) / 4;
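/* Illustrative example: if the previous constant-offset load from this
 * UBO left the unifa address at offset 16 and the new load starts at
 * offset 24, ldunifa_skips is 2 and two dummy ldunifa reads advance the
 * address without writing the unifa register again.
 */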
2739
} else {
2740
c->current_unifa_block = c->cur_block;
2741
c->current_unifa_index = index;
2742
c->current_unifa_offset = const_offset;
2743
}
2744
2745
if (!skip_unifa) {
2746
struct qreg base_offset =
2747
vir_uniform(c, QUNIFORM_UBO_ADDR,
2748
v3d_unit_data_create(index, const_offset));
2749
2750
struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);
2751
if (!dynamic_src) {
2752
vir_MOV_dest(c, unifa, base_offset);
2753
} else {
2754
vir_ADD_dest(c, unifa, base_offset,
2755
ntq_get_src(c, instr->src[1], 0));
2756
}
2757
} else {
2758
for (int i = 0; i < ldunifa_skips; i++)
2759
emit_ldunifa(c, NULL);
2760
}
2761
2762
for (uint32_t i = 0; i < nir_intrinsic_dest_components(instr); i++) {
2763
struct qreg data;
2764
emit_ldunifa(c, &data);
2765
ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data));
2766
}
2767
}
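/* The local invocation index is packed into the top bits of cs_payload[1];
 * c->local_invocation_index_bits says how many bits were reserved for it,
 * so shifting right by (32 - bits) recovers the value. For example, with 8
 * reserved bits an index of 5 arrives as 0x05000000 and shifts down to 5.
 */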
2768
2769
static inline struct qreg
2770
emit_load_local_invocation_index(struct v3d_compile *c)
2771
{
2772
return vir_SHR(c, c->cs_payload[1],
2773
vir_uniform_ui(c, 32 - c->local_invocation_index_bits));
2774
}
2775
2776
/* Various subgroup operations rely on the A flags, so this helper ensures that
2777
* the A flags represent the currently active lanes in the subgroup.
2778
*/
2779
static void
2780
set_a_flags_for_subgroup(struct v3d_compile *c)
2781
{
2782
/* MSF returns 0 for disabled lanes in compute shaders so
2783
* PUSHZ will set A=1 for disabled lanes. We want the inverse
2784
* of this, but since we don't have any means to negate the A flags
2785
* directly, we do it by repeating the same operation
2786
* with NORZ (A = ~A & ~Z).
2787
*/
2788
assert(c->s->info.stage == MESA_SHADER_COMPUTE);
2789
vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
2790
vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_UF_NORZ);
2791
2792
/* If we are under non-uniform control flow we also need to
2793
* AND the A flags with the current execute mask.
2794
*/
2795
if (vir_in_nonuniform_control_flow(c)) {
2796
const uint32_t bidx = c->cur_block->index;
2797
vir_set_uf(c, vir_XOR_dest(c, vir_nop_reg(),
2798
c->execute,
2799
vir_uniform_ui(c, bidx)),
2800
V3D_QPU_UF_ANDZ);
2801
}
2802
}
2803
2804
static void
2805
ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
2806
{
2807
switch (instr->intrinsic) {
2808
case nir_intrinsic_load_uniform:
2809
ntq_emit_load_uniform(c, instr);
2810
break;
2811
2812
case nir_intrinsic_load_ubo:
2813
if (!nir_src_is_divergent(instr->src[1]))
2814
ntq_emit_load_ubo_unifa(c, instr);
2815
else
2816
ntq_emit_tmu_general(c, instr, false);
2817
break;
2818
2819
case nir_intrinsic_ssbo_atomic_add:
2820
case nir_intrinsic_ssbo_atomic_imin:
2821
case nir_intrinsic_ssbo_atomic_umin:
2822
case nir_intrinsic_ssbo_atomic_imax:
2823
case nir_intrinsic_ssbo_atomic_umax:
2824
case nir_intrinsic_ssbo_atomic_and:
2825
case nir_intrinsic_ssbo_atomic_or:
2826
case nir_intrinsic_ssbo_atomic_xor:
2827
case nir_intrinsic_ssbo_atomic_exchange:
2828
case nir_intrinsic_ssbo_atomic_comp_swap:
2829
case nir_intrinsic_load_ssbo:
2830
case nir_intrinsic_store_ssbo:
2831
ntq_emit_tmu_general(c, instr, false);
2832
break;
2833
2834
case nir_intrinsic_shared_atomic_add:
2835
case nir_intrinsic_shared_atomic_imin:
2836
case nir_intrinsic_shared_atomic_umin:
2837
case nir_intrinsic_shared_atomic_imax:
2838
case nir_intrinsic_shared_atomic_umax:
2839
case nir_intrinsic_shared_atomic_and:
2840
case nir_intrinsic_shared_atomic_or:
2841
case nir_intrinsic_shared_atomic_xor:
2842
case nir_intrinsic_shared_atomic_exchange:
2843
case nir_intrinsic_shared_atomic_comp_swap:
2844
case nir_intrinsic_load_shared:
2845
case nir_intrinsic_store_shared:
2846
case nir_intrinsic_load_scratch:
2847
case nir_intrinsic_store_scratch:
2848
ntq_emit_tmu_general(c, instr, true);
2849
break;
2850
2851
case nir_intrinsic_image_load:
2852
case nir_intrinsic_image_store:
2853
case nir_intrinsic_image_atomic_add:
2854
case nir_intrinsic_image_atomic_imin:
2855
case nir_intrinsic_image_atomic_umin:
2856
case nir_intrinsic_image_atomic_imax:
2857
case nir_intrinsic_image_atomic_umax:
2858
case nir_intrinsic_image_atomic_and:
2859
case nir_intrinsic_image_atomic_or:
2860
case nir_intrinsic_image_atomic_xor:
2861
case nir_intrinsic_image_atomic_exchange:
2862
case nir_intrinsic_image_atomic_comp_swap:
2863
v3d40_vir_emit_image_load_store(c, instr);
2864
break;
2865
2866
case nir_intrinsic_get_ssbo_size:
2867
ntq_store_dest(c, &instr->dest, 0,
2868
vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,
2869
nir_src_comp_as_uint(instr->src[0], 0)));
2870
break;
2871
2872
case nir_intrinsic_get_ubo_size:
2873
ntq_store_dest(c, &instr->dest, 0,
2874
vir_uniform(c, QUNIFORM_GET_UBO_SIZE,
2875
nir_src_comp_as_uint(instr->src[0], 0)));
2876
break;
2877
2878
case nir_intrinsic_load_user_clip_plane:
2879
for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
2880
ntq_store_dest(c, &instr->dest, i,
2881
vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
2882
nir_intrinsic_ucp_id(instr) *
2883
4 + i));
2884
}
2885
break;
2886
2887
case nir_intrinsic_load_viewport_x_scale:
2888
ntq_store_dest(c, &instr->dest, 0,
2889
vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0));
2890
break;
2891
2892
case nir_intrinsic_load_viewport_y_scale:
2893
ntq_store_dest(c, &instr->dest, 0,
2894
vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));
2895
break;
2896
2897
case nir_intrinsic_load_viewport_z_scale:
2898
ntq_store_dest(c, &instr->dest, 0,
2899
vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));
2900
break;
2901
2902
case nir_intrinsic_load_viewport_z_offset:
2903
ntq_store_dest(c, &instr->dest, 0,
2904
vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));
2905
break;
2906
2907
case nir_intrinsic_load_line_coord:
2908
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x));
2909
break;
2910
2911
case nir_intrinsic_load_line_width:
2912
ntq_store_dest(c, &instr->dest, 0,
2913
vir_uniform(c, QUNIFORM_LINE_WIDTH, 0));
2914
break;
2915
2916
case nir_intrinsic_load_aa_line_width:
2917
ntq_store_dest(c, &instr->dest, 0,
2918
vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0));
2919
break;
2920
2921
case nir_intrinsic_load_sample_mask_in:
2922
ntq_store_dest(c, &instr->dest, 0, vir_MSF(c));
2923
break;
2924
2925
case nir_intrinsic_load_helper_invocation:
2926
vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
2927
struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
2928
ntq_store_dest(c, &instr->dest, 0, qdest);
2929
break;
2930
2931
case nir_intrinsic_load_front_face:
2932
/* The register contains 0 (front) or 1 (back), and we need to
2933
* turn it into a NIR bool where true means front.
2934
*/
2935
ntq_store_dest(c, &instr->dest, 0,
2936
vir_ADD(c,
2937
vir_uniform_ui(c, -1),
2938
vir_REVF(c)));
2939
break;
2940
2941
case nir_intrinsic_load_base_instance:
2942
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->biid));
2943
break;
2944
2945
case nir_intrinsic_load_instance_id:
2946
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid));
2947
break;
2948
2949
case nir_intrinsic_load_vertex_id:
2950
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
2951
break;
2952
2953
case nir_intrinsic_load_tlb_color_v3d:
2954
vir_emit_tlb_color_read(c, instr);
2955
break;
2956
2957
case nir_intrinsic_load_input:
2958
ntq_emit_load_input(c, instr);
2959
break;
2960
2961
case nir_intrinsic_store_tlb_sample_color_v3d:
2962
ntq_emit_per_sample_color_write(c, instr);
2963
break;
2964
2965
case nir_intrinsic_store_output:
2966
ntq_emit_store_output(c, instr);
2967
break;
2968
2969
case nir_intrinsic_image_size:
2970
ntq_emit_image_size(c, instr);
2971
break;
2972
2973
case nir_intrinsic_discard:
2974
ntq_flush_tmu(c);
2975
2976
if (vir_in_nonuniform_control_flow(c)) {
2977
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
2978
V3D_QPU_PF_PUSHZ);
2979
vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
2980
vir_uniform_ui(c, 0)),
2981
V3D_QPU_COND_IFA);
2982
} else {
2983
vir_SETMSF_dest(c, vir_nop_reg(),
2984
vir_uniform_ui(c, 0));
2985
}
2986
break;
2987
2988
case nir_intrinsic_discard_if: {
2989
ntq_flush_tmu(c);
2990
2991
enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);
2992
2993
if (vir_in_nonuniform_control_flow(c)) {
2994
struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(),
2995
c->execute);
2996
if (cond == V3D_QPU_COND_IFA) {
2997
vir_set_uf(c, exec_flag, V3D_QPU_UF_ANDZ);
2998
} else {
2999
vir_set_uf(c, exec_flag, V3D_QPU_UF_NORNZ);
3000
cond = V3D_QPU_COND_IFA;
3001
}
3002
}
3003
3004
vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
3005
vir_uniform_ui(c, 0)), cond);
3006
3007
break;
3008
}
3009
3010
case nir_intrinsic_memory_barrier:
3011
case nir_intrinsic_memory_barrier_buffer:
3012
case nir_intrinsic_memory_barrier_image:
3013
case nir_intrinsic_memory_barrier_shared:
3014
case nir_intrinsic_memory_barrier_tcs_patch:
3015
case nir_intrinsic_group_memory_barrier:
3016
/* We don't do any instruction scheduling of these NIR
3017
* instructions between each other, so we just need to make
3018
* sure that the TMU operations before the barrier are flushed
3019
* before the ones after the barrier.
3020
*/
3021
ntq_flush_tmu(c);
3022
break;
3023
3024
case nir_intrinsic_control_barrier:
3025
/* Emit a TSY op to get all invocations in the workgroup
3026
* (actually supergroup) to block until the last invocation
3027
* reaches the TSY op.
3028
*/
3029
ntq_flush_tmu(c);
3030
3031
if (c->devinfo->ver >= 42) {
3032
vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
3033
V3D_QPU_WADDR_SYNCB));
3034
} else {
3035
struct qinst *sync =
3036
vir_BARRIERID_dest(c,
3037
vir_reg(QFILE_MAGIC,
3038
V3D_QPU_WADDR_SYNCU));
3039
sync->uniform =
3040
vir_get_uniform_index(c, QUNIFORM_CONSTANT,
3041
0xffffff00 |
3042
V3D_TSY_WAIT_INC_CHECK);
3043
3044
}
3045
3046
/* The blocking of a TSY op only happens at the next thread
3047
* switch. No texturing may be outstanding at the time of a
3048
* TSY blocking operation.
3049
*/
3050
vir_emit_thrsw(c);
3051
break;
3052
3053
case nir_intrinsic_load_num_workgroups:
3054
for (int i = 0; i < 3; i++) {
3055
ntq_store_dest(c, &instr->dest, i,
3056
vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,
3057
i));
3058
}
3059
break;
3060
3061
case nir_intrinsic_load_workgroup_id: {
3062
struct qreg x = vir_AND(c, c->cs_payload[0],
3063
vir_uniform_ui(c, 0xffff));
3064
3065
struct qreg y = vir_SHR(c, c->cs_payload[0],
3066
vir_uniform_ui(c, 16));
3067
3068
struct qreg z = vir_AND(c, c->cs_payload[1],
3069
vir_uniform_ui(c, 0xffff));
3070
3071
/* We only support dispatch base in Vulkan */
3072
if (c->key->environment == V3D_ENVIRONMENT_VULKAN) {
3073
x = vir_ADD(c, x,
3074
vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0));
3075
y = vir_ADD(c, y,
3076
vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1));
3077
z = vir_ADD(c, z,
3078
vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2));
3079
}
3080
3081
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, x));
3082
ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, y));
3083
ntq_store_dest(c, &instr->dest, 2, vir_MOV(c, z));
3084
break;
3085
}
3086
3087
case nir_intrinsic_load_local_invocation_index:
3088
ntq_store_dest(c, &instr->dest, 0,
3089
emit_load_local_invocation_index(c));
3090
break;
3091
3092
case nir_intrinsic_load_subgroup_id: {
3093
/* This is basically the batch index, which is the Local
3094
* Invocation Index divided by the SIMD width.
3095
*/
3096
STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS));
3097
const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1;
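/* E.g. with V3D_CHANNELS == 16 lanes, divide_shift is 4 and the subgroup
 * id is the local invocation index >> 4.
 */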
3098
struct qreg lii = emit_load_local_invocation_index(c);
3099
ntq_store_dest(c, &instr->dest, 0,
3100
vir_SHR(c, lii,
3101
vir_uniform_ui(c, divide_shift)));
3102
break;
3103
}
3104
3105
case nir_intrinsic_load_per_vertex_input: {
3106
/* The vertex shader writes all its used outputs into
3107
* consecutive VPM offsets, so if any output component is
3108
* unused, its VPM offset is used by the next used
3109
* component. This means that we can't assume that each
3110
* location will use 4 consecutive scalar offsets in the VPM
3111
* and we need to compute the VPM offset for each input by
3112
* going through the inputs and finding the one that matches
3113
* our location and component.
3114
*
3115
* col = vertex index, row = varying index
3116
*/
3117
assert(nir_src_is_const(instr->src[1]));
3118
uint32_t location =
3119
nir_intrinsic_io_semantics(instr).location +
3120
nir_src_as_uint(instr->src[1]);
3121
uint32_t component = nir_intrinsic_component(instr);
3122
3123
int32_t row_idx = -1;
3124
for (int i = 0; i < c->num_inputs; i++) {
3125
struct v3d_varying_slot slot = c->input_slots[i];
3126
if (v3d_slot_get_slot(slot) == location &&
3127
v3d_slot_get_component(slot) == component) {
3128
row_idx = i;
3129
break;
3130
}
3131
}
3132
3133
assert(row_idx != -1);
3134
3135
struct qreg col = ntq_get_src(c, instr->src[0], 0);
3136
for (int i = 0; i < instr->num_components; i++) {
3137
struct qreg row = vir_uniform_ui(c, row_idx++);
3138
ntq_store_dest(c, &instr->dest, i,
3139
vir_LDVPMG_IN(c, row, col));
3140
}
3141
break;
3142
}
3143
3144
case nir_intrinsic_emit_vertex:
3145
case nir_intrinsic_end_primitive:
3146
unreachable("Should have been lowered in v3d_nir_lower_io");
3147
break;
3148
3149
case nir_intrinsic_load_primitive_id: {
3150
/* gl_PrimitiveIdIn is written by the GBG in the first word of
3151
* the VPM output header. According to the docs, we should read this
3152
* using ldvpm(v,d)_in (See Table 71).
3153
*/
3154
assert(c->s->info.stage == MESA_SHADER_GEOMETRY);
3155
ntq_store_dest(c, &instr->dest, 0,
3156
vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
3157
break;
3158
}
3159
3160
case nir_intrinsic_load_invocation_id:
3161
ntq_store_dest(c, &instr->dest, 0, vir_IID(c));
3162
break;
3163
3164
case nir_intrinsic_load_fb_layers_v3d:
3165
ntq_store_dest(c, &instr->dest, 0,
3166
vir_uniform(c, QUNIFORM_FB_LAYERS, 0));
3167
break;
3168
3169
case nir_intrinsic_load_sample_id:
3170
ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c));
3171
break;
3172
3173
case nir_intrinsic_load_sample_pos:
3174
ntq_store_dest(c, &instr->dest, 0,
3175
vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))));
3176
ntq_store_dest(c, &instr->dest, 1,
3177
vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))));
3178
break;
3179
3180
case nir_intrinsic_load_barycentric_at_offset:
3181
ntq_store_dest(c, &instr->dest, 0,
3182
vir_MOV(c, ntq_get_src(c, instr->src[0], 0)));
3183
ntq_store_dest(c, &instr->dest, 1,
3184
vir_MOV(c, ntq_get_src(c, instr->src[0], 1)));
3185
break;
3186
3187
case nir_intrinsic_load_barycentric_pixel:
3188
ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f));
3189
ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f));
3190
break;
3191
3192
case nir_intrinsic_load_barycentric_at_sample: {
3193
if (!c->fs_key->msaa) {
3194
ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f));
3195
ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f));
3196
return;
3197
}
3198
3199
struct qreg offset_x, offset_y;
3200
struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0);
3201
ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
3202
3203
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x));
3204
ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y));
3205
break;
3206
}
3207
3208
case nir_intrinsic_load_barycentric_sample: {
3209
struct qreg offset_x =
3210
vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
3211
struct qreg offset_y =
3212
vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
3213
3214
ntq_store_dest(c, &instr->dest, 0,
3215
vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f)));
3216
ntq_store_dest(c, &instr->dest, 1,
3217
vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f)));
3218
break;
3219
}
3220
3221
case nir_intrinsic_load_barycentric_centroid: {
3222
struct qreg offset_x, offset_y;
3223
ntq_get_barycentric_centroid(c, &offset_x, &offset_y);
3224
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x));
3225
ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y));
3226
break;
3227
}
3228
3229
case nir_intrinsic_load_interpolated_input: {
3230
assert(nir_src_is_const(instr->src[1]));
3231
const uint32_t offset = nir_src_as_uint(instr->src[1]);
3232
3233
for (int i = 0; i < instr->num_components; i++) {
3234
const uint32_t input_idx =
3235
(nir_intrinsic_base(instr) + offset) * 4 +
3236
nir_intrinsic_component(instr) + i;
3237
3238
/* If we are not in MSAA or if we are not interpolating
3239
* a user varying, just return the pre-computed
3240
* interpolated input.
3241
*/
3242
if (!c->fs_key->msaa ||
3243
c->interp[input_idx].vp.file == QFILE_NULL) {
3244
ntq_store_dest(c, &instr->dest, i,
3245
vir_MOV(c, c->inputs[input_idx]));
3246
continue;
3247
}
3248
3249
/* Otherwise compute interpolation at the specified
3250
* offset.
3251
*/
3252
struct qreg p = c->interp[input_idx].vp;
3253
struct qreg C = c->interp[input_idx].C;
3254
unsigned interp_mode = c->interp[input_idx].mode;
3255
3256
struct qreg offset_x = ntq_get_src(c, instr->src[0], 0);
3257
struct qreg offset_y = ntq_get_src(c, instr->src[0], 1);
3258
3259
struct qreg result =
3260
ntq_emit_load_interpolated_input(c, p, C,
3261
offset_x, offset_y,
3262
interp_mode);
3263
ntq_store_dest(c, &instr->dest, i, result);
3264
}
3265
break;
3266
}
3267
3268
case nir_intrinsic_load_subgroup_size:
3269
ntq_store_dest(c, &instr->dest, 0,
3270
vir_uniform_ui(c, V3D_CHANNELS));
3271
break;
3272
3273
case nir_intrinsic_load_subgroup_invocation:
3274
ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
3275
break;
3276
3277
case nir_intrinsic_elect: {
3278
set_a_flags_for_subgroup(c);
3279
struct qreg first = vir_FLAFIRST(c);
3280
3281
/* Produce a boolean result from Flafirst */
3282
vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
3283
first, vir_uniform_ui(c, 1)),
3284
V3D_QPU_PF_PUSHZ);
3285
struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
3286
ntq_store_dest(c, &instr->dest, 0, result);
3287
break;
3288
}
3289
3290
case nir_intrinsic_load_num_subgroups:
3291
unreachable("Should have been lowered");
3292
break;
3293
3294
default:
3295
fprintf(stderr, "Unknown intrinsic: ");
3296
nir_print_instr(&instr->instr, stderr);
3297
fprintf(stderr, "\n");
3298
break;
3299
}
3300
}
3301
3302
/* Clears (activates) the execute flags for any channels whose jump target
3303
* matches this block.
3304
*
3305
* XXX perf: Could we be using flpush/flpop somehow for our execution channel
3306
* enabling?
3307
*
3308
*/
3309
static void
3310
ntq_activate_execute_for_block(struct v3d_compile *c)
3311
{
3312
vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
3313
c->execute, vir_uniform_ui(c, c->cur_block->index)),
3314
V3D_QPU_PF_PUSHZ);
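/* c->execute holds 0 for channels that are currently running and the index
 * of the block a channel is waiting to resume at otherwise, so the XOR
 * above sets A for exactly the channels whose target is this block and the
 * conditional MOV below re-activates them.
 */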
3315
3316
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
3317
}
3318
3319
static void
3320
ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
3321
{
3322
nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
3323
bool empty_else_block =
3324
(nir_else_block == nir_if_last_else_block(if_stmt) &&
3325
exec_list_is_empty(&nir_else_block->instr_list));
3326
3327
struct qblock *then_block = vir_new_block(c);
3328
struct qblock *after_block = vir_new_block(c);
3329
struct qblock *else_block;
3330
if (empty_else_block)
3331
else_block = after_block;
3332
else
3333
else_block = vir_new_block(c);
3334
3335
/* Check if this if statement is really just a conditional jump with
3336
* the form:
3337
*
3338
* if (cond) {
3339
* break/continue;
3340
* } else {
3341
* }
3342
*
3343
* In which case we can skip the jump to ELSE we emit before the THEN
3344
* block and instead just emit the break/continue directly.
3345
*/
3346
nir_jump_instr *conditional_jump = NULL;
3347
if (empty_else_block) {
3348
nir_block *nir_then_block = nir_if_first_then_block(if_stmt);
3349
struct nir_instr *inst = nir_block_first_instr(nir_then_block);
3350
if (inst && inst->type == nir_instr_type_jump)
3351
conditional_jump = nir_instr_as_jump(inst);
3352
}
3353
3354
/* Set up the flags for the IF condition (taking the THEN branch). */
3355
enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
3356
3357
if (!conditional_jump) {
3358
/* Jump to ELSE. */
3359
struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
3360
V3D_QPU_BRANCH_COND_ANYNA :
3361
V3D_QPU_BRANCH_COND_ANYA);
3362
/* Pixels that were not dispatched or have been discarded
3363
* should not contribute to the ANYA/ANYNA condition.
3364
*/
3365
branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
3366
3367
vir_link_blocks(c->cur_block, else_block);
3368
vir_link_blocks(c->cur_block, then_block);
3369
3370
/* Process the THEN block. */
3371
vir_set_emit_block(c, then_block);
3372
ntq_emit_cf_list(c, &if_stmt->then_list);
3373
3374
if (!empty_else_block) {
3375
/* At the end of the THEN block, jump to ENDIF, unless
3376
* the block ended in a break or continue.
3377
*/
3378
if (!c->cur_block->branch_emitted) {
3379
vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
3380
vir_link_blocks(c->cur_block, after_block);
3381
}
3382
3383
/* Emit the else block. */
3384
vir_set_emit_block(c, else_block);
3385
ntq_emit_cf_list(c, &if_stmt->else_list);
3386
}
3387
} else {
3388
/* Emit the conditional jump directly.
3389
*
3390
* Use ALL with breaks and ANY with continues to ensure that
3391
* we always break and never continue when all lanes have been
3392
* disabled (for example because of discards) to prevent
3393
* infinite loops.
3394
*/
3395
assert(conditional_jump &&
3396
(conditional_jump->type == nir_jump_continue ||
3397
conditional_jump->type == nir_jump_break));
3398
3399
struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
3400
(conditional_jump->type == nir_jump_break ?
3401
V3D_QPU_BRANCH_COND_ALLA :
3402
V3D_QPU_BRANCH_COND_ANYA) :
3403
(conditional_jump->type == nir_jump_break ?
3404
V3D_QPU_BRANCH_COND_ALLNA :
3405
V3D_QPU_BRANCH_COND_ANYNA));
3406
branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
3407
3408
vir_link_blocks(c->cur_block,
3409
conditional_jump->type == nir_jump_break ?
3410
c->loop_break_block :
3411
c->loop_cont_block);
3412
}
3413
3414
vir_link_blocks(c->cur_block, after_block);
3415
3416
vir_set_emit_block(c, after_block);
3417
}
3418
3419
static void
3420
ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
3421
{
3422
nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
3423
bool empty_else_block =
3424
(nir_else_block == nir_if_last_else_block(if_stmt) &&
3425
exec_list_is_empty(&nir_else_block->instr_list));
3426
3427
struct qblock *then_block = vir_new_block(c);
3428
struct qblock *after_block = vir_new_block(c);
3429
struct qblock *else_block;
3430
if (empty_else_block)
3431
else_block = after_block;
3432
else
3433
else_block = vir_new_block(c);
3434
3435
bool was_uniform_control_flow = false;
3436
if (!vir_in_nonuniform_control_flow(c)) {
3437
c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
3438
was_uniform_control_flow = true;
3439
}
3440
3441
/* Set up the flags for the IF condition (taking the THEN branch). */
3442
enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
3443
3444
/* Update the flags+cond to mean "Taking the ELSE branch (!cond) and
3445
* was previously active (execute Z)" for updating the exec flags.
3446
*/
3447
if (was_uniform_control_flow) {
3448
cond = v3d_qpu_cond_invert(cond);
3449
} else {
3450
struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute);
3451
if (cond == V3D_QPU_COND_IFA) {
3452
vir_set_uf(c, inst, V3D_QPU_UF_NORNZ);
3453
} else {
3454
vir_set_uf(c, inst, V3D_QPU_UF_ANDZ);
3455
cond = V3D_QPU_COND_IFA;
3456
}
3457
}
3458
3459
vir_MOV_cond(c, cond,
3460
c->execute,
3461
vir_uniform_ui(c, else_block->index));
3462
3463
/* Jump to ELSE if nothing is active for THEN, otherwise fall
3464
* through.
3465
*/
3466
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
3467
vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
3468
vir_link_blocks(c->cur_block, else_block);
3469
vir_link_blocks(c->cur_block, then_block);
3470
3471
/* Process the THEN block. */
3472
vir_set_emit_block(c, then_block);
3473
ntq_emit_cf_list(c, &if_stmt->then_list);
3474
3475
if (!empty_else_block) {
3476
/* Handle the end of the THEN block. First, all currently
3477
* active channels update their execute flags to point to
3478
* ENDIF
3479
*/
3480
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3481
V3D_QPU_PF_PUSHZ);
3482
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
3483
vir_uniform_ui(c, after_block->index));
3484
3485
/* If everything points at ENDIF, then jump there immediately. */
3486
vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
3487
c->execute,
3488
vir_uniform_ui(c, after_block->index)),
3489
V3D_QPU_PF_PUSHZ);
3490
vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
3491
vir_link_blocks(c->cur_block, after_block);
3492
vir_link_blocks(c->cur_block, else_block);
3493
3494
vir_set_emit_block(c, else_block);
3495
ntq_activate_execute_for_block(c);
3496
ntq_emit_cf_list(c, &if_stmt->else_list);
3497
}
3498
3499
vir_link_blocks(c->cur_block, after_block);
3500
3501
vir_set_emit_block(c, after_block);
3502
if (was_uniform_control_flow)
3503
c->execute = c->undef;
3504
else
3505
ntq_activate_execute_for_block(c);
3506
}
3507
3508
static void
3509
ntq_emit_if(struct v3d_compile *c, nir_if *nif)
3510
{
3511
bool was_in_control_flow = c->in_control_flow;
3512
c->in_control_flow = true;
3513
if (!vir_in_nonuniform_control_flow(c) &&
3514
!nir_src_is_divergent(nif->condition)) {
3515
ntq_emit_uniform_if(c, nif);
3516
} else {
3517
ntq_emit_nonuniform_if(c, nif);
3518
}
3519
c->in_control_flow = was_in_control_flow;
3520
}
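/* Handles break/continue under non-uniform control flow: rather than
 * branching, each still-active channel records its jump target block in
 * c->execute and gets re-activated when execution reaches that block (see
 * ntq_activate_execute_for_block()).
 */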
3521
3522
static void
3523
ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
3524
{
3525
switch (jump->type) {
3526
case nir_jump_break:
3527
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3528
V3D_QPU_PF_PUSHZ);
3529
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
3530
vir_uniform_ui(c, c->loop_break_block->index));
3531
break;
3532
3533
case nir_jump_continue:
3534
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3535
V3D_QPU_PF_PUSHZ);
3536
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
3537
vir_uniform_ui(c, c->loop_cont_block->index));
3538
break;
3539
3540
case nir_jump_return:
3541
unreachable("All returns should be lowered\n");
3542
break;
3543
3544
case nir_jump_halt:
3545
case nir_jump_goto:
3546
case nir_jump_goto_if:
3547
unreachable("not supported\n");
3548
break;
3549
}
3550
}
3551
3552
static void
ntq_emit_uniform_jump(struct v3d_compile *c, nir_jump_instr *jump)
{
        switch (jump->type) {
        case nir_jump_break:
                vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
                vir_link_blocks(c->cur_block, c->loop_break_block);
                c->cur_block->branch_emitted = true;
                break;
        case nir_jump_continue:
                vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
                vir_link_blocks(c->cur_block, c->loop_cont_block);
                c->cur_block->branch_emitted = true;
                break;

        case nir_jump_return:
                unreachable("All returns should be lowered\n");
                break;

        case nir_jump_halt:
        case nir_jump_goto:
        case nir_jump_goto_if:
                unreachable("not supported\n");
                break;
        }
}

static void
ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
{
        switch (instr->type) {
        case nir_instr_type_alu:
                ntq_emit_alu(c, nir_instr_as_alu(instr));
                break;

        case nir_instr_type_intrinsic:
                ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
                break;

        case nir_instr_type_load_const:
                ntq_emit_load_const(c, nir_instr_as_load_const(instr));
                break;

        case nir_instr_type_ssa_undef:
                unreachable("Should've been lowered by nir_lower_undef_to_zero");
                break;

        case nir_instr_type_tex:
                ntq_emit_tex(c, nir_instr_as_tex(instr));
                break;

        case nir_instr_type_jump:
                /* Always flush TMU before jumping to another block, for the
                 * same reasons as in ntq_emit_block.
                 */
                ntq_flush_tmu(c);
                if (vir_in_nonuniform_control_flow(c))
                        ntq_emit_jump(c, nir_instr_as_jump(instr));
                else
                        ntq_emit_uniform_jump(c, nir_instr_as_jump(instr));
                break;

        default:
                fprintf(stderr, "Unknown NIR instr type: ");
                nir_print_instr(instr, stderr);
                fprintf(stderr, "\n");
                abort();
        }
}

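/* Emits every instruction in a NIR block, then flushes any outstanding TMU
 * operations so they complete within the block that issued them.
 */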
static void
ntq_emit_block(struct v3d_compile *c, nir_block *block)
{
        nir_foreach_instr(instr, block) {
                ntq_emit_instr(c, instr);
        }

        /* Always process pending TMU operations in the same block they were
         * emitted: we can't emit TMU operations in a block and then emit a
         * thread switch and LDTMU/TMUWT for them in another block, possibly
         * under control flow.
         */
        ntq_flush_tmu(c);
}

static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);

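/* Emits a loop whose iteration count may diverge between channels: each
 * channel tracks its target block in c->execute, and the back-edge branch is
 * taken while any channel still points at the continue block.
 */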
static void
ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop)
{
        bool was_uniform_control_flow = false;
        if (!vir_in_nonuniform_control_flow(c)) {
                c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
                was_uniform_control_flow = true;
        }

        c->loop_cont_block = vir_new_block(c);
        c->loop_break_block = vir_new_block(c);

        vir_link_blocks(c->cur_block, c->loop_cont_block);
        vir_set_emit_block(c, c->loop_cont_block);
        ntq_activate_execute_for_block(c);

        ntq_emit_cf_list(c, &loop->body);

        /* Re-enable any previous continues now, so our ANYA check below
         * works.
         *
         * XXX: Use the .ORZ flags update, instead.
         */
        vir_set_pf(c, vir_XOR_dest(c,
                                   vir_nop_reg(),
                                   c->execute,
                                   vir_uniform_ui(c, c->loop_cont_block->index)),
                   V3D_QPU_PF_PUSHZ);
        vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));

        vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);

        struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
        /* Pixels that were not dispatched or have been discarded should not
         * contribute to looping again.
         */
        branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
        vir_link_blocks(c->cur_block, c->loop_cont_block);
        vir_link_blocks(c->cur_block, c->loop_break_block);

        vir_set_emit_block(c, c->loop_break_block);
        if (was_uniform_control_flow)
                c->execute = c->undef;
        else
                ntq_activate_execute_for_block(c);
}

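/* Emits a loop with a non-divergent body: no execute masking is needed, and
 * the back-edge is just an unconditional branch to the continue block.
 */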
static void
ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop)
{
        c->loop_cont_block = vir_new_block(c);
        c->loop_break_block = vir_new_block(c);

        vir_link_blocks(c->cur_block, c->loop_cont_block);
        vir_set_emit_block(c, c->loop_cont_block);

        ntq_emit_cf_list(c, &loop->body);

        if (!c->cur_block->branch_emitted) {
                vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
                vir_link_blocks(c->cur_block, c->loop_cont_block);
        }

        vir_set_emit_block(c, c->loop_break_block);
}

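/* Emits a NIR loop, saving and restoring the enclosing loop's continue/break
 * blocks and picking the uniform or non-uniform lowering depending on
 * divergence.
 */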
static void
ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
{
        bool was_in_control_flow = c->in_control_flow;
        c->in_control_flow = true;

        struct qblock *save_loop_cont_block = c->loop_cont_block;
        struct qblock *save_loop_break_block = c->loop_break_block;

        if (vir_in_nonuniform_control_flow(c) || loop->divergent) {
                ntq_emit_nonuniform_loop(c, loop);
        } else {
                ntq_emit_uniform_loop(c, loop);
        }

        c->loop_break_block = save_loop_break_block;
        c->loop_cont_block = save_loop_cont_block;

        c->loops++;

        c->in_control_flow = was_in_control_flow;
}

static void
ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)
{
        fprintf(stderr, "FUNCTIONS not handled.\n");
        abort();
}

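/* Walks a NIR control flow list and emits each node (block, if, loop or
 * function) in order.
 */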
static void
ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
{
        foreach_list_typed(nir_cf_node, node, node, list) {
                switch (node->type) {
                case nir_cf_node_block:
                        ntq_emit_block(c, nir_cf_node_as_block(node));
                        break;

                case nir_cf_node_if:
                        ntq_emit_if(c, nir_cf_node_as_if(node));
                        break;

                case nir_cf_node_loop:
                        ntq_emit_loop(c, nir_cf_node_as_loop(node));
                        break;

                case nir_cf_node_function:
                        ntq_emit_function(c, nir_cf_node_as_function(node));
                        break;

                default:
                        fprintf(stderr, "Unknown NIR node type\n");
                        abort();
                }
        }
}

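/* Emits a NIR function implementation: sets up its registers, then emits its
 * body.
 */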
static void
ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
{
        ntq_setup_registers(c, &impl->registers);
        ntq_emit_cf_list(c, &impl->body);
}

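/* Per-stage setup (fragment payload registers and implicit varyings, compute
 * payload decoding, scratch/spill base), followed by input/output setup and
 * emission of the main function's body.
 */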
static void
nir_to_vir(struct v3d_compile *c)
{
        switch (c->s->info.stage) {
        case MESA_SHADER_FRAGMENT:
                c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
                c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
                c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));

                /* V3D 4.x can disable implicit varyings if they are not used */
                c->fs_uses_primitive_id =
                        nir_find_variable_with_location(c->s, nir_var_shader_in,
                                                        VARYING_SLOT_PRIMITIVE_ID);
                if (c->fs_uses_primitive_id && !c->fs_key->has_gs) {
                        c->primitive_id =
                                emit_fragment_varying(c, NULL, -1, 0, 0);
                }

                if (c->fs_key->is_points &&
                    (c->devinfo->ver < 40 || program_reads_point_coord(c))) {
                        c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0);
                        c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0);
                        c->uses_implicit_point_line_varyings = true;
                } else if (c->fs_key->is_lines &&
                           (c->devinfo->ver < 40 ||
                            BITSET_TEST(c->s->info.system_values_read,
                                        SYSTEM_VALUE_LINE_COORD))) {
                        c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0);
                        c->uses_implicit_point_line_varyings = true;
                }

                c->force_per_sample_msaa =
                        c->s->info.fs.uses_sample_qualifier ||
                        BITSET_TEST(c->s->info.system_values_read,
                                    SYSTEM_VALUE_SAMPLE_ID) ||
                        BITSET_TEST(c->s->info.system_values_read,
                                    SYSTEM_VALUE_SAMPLE_POS);
                break;
        case MESA_SHADER_COMPUTE:
                /* Set up the TSO for barriers, assuming we do some. */
                if (c->devinfo->ver < 42) {
                        vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
                                                      V3D_QPU_WADDR_SYNC));
                }

                c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
                c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));

                /* Set up the division between gl_LocalInvocationIndex and
                 * wg_in_mem in the payload reg.
                 */
                int wg_size = (c->s->info.workgroup_size[0] *
                               c->s->info.workgroup_size[1] *
                               c->s->info.workgroup_size[2]);
                c->local_invocation_index_bits =
                        ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1;
                assert(c->local_invocation_index_bits <= 8);

                if (c->s->info.shared_size) {
                        struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1],
                                                        vir_uniform_ui(c, 16));
                        if (c->s->info.workgroup_size[0] != 1 ||
                            c->s->info.workgroup_size[1] != 1 ||
                            c->s->info.workgroup_size[2] != 1) {
                                int wg_bits = (16 -
                                               c->local_invocation_index_bits);
                                int wg_mask = (1 << wg_bits) - 1;
                                wg_in_mem = vir_AND(c, wg_in_mem,
                                                    vir_uniform_ui(c, wg_mask));
                        }
                        struct qreg shared_per_wg =
                                vir_uniform_ui(c, c->s->info.shared_size);

                        c->cs_shared_offset =
                                vir_ADD(c,
                                        vir_uniform(c, QUNIFORM_SHARED_OFFSET, 0),
                                        vir_UMUL(c, wg_in_mem, shared_per_wg));
                }
                break;
        default:
                break;
        }

        if (c->s->scratch_size) {
                v3d_setup_spill_base(c);
                c->spill_size += V3D_CHANNELS * c->s->scratch_size;
        }

        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                ntq_setup_vs_inputs(c);
                break;
        case MESA_SHADER_GEOMETRY:
                ntq_setup_gs_inputs(c);
                break;
        case MESA_SHADER_FRAGMENT:
                ntq_setup_fs_inputs(c);
                break;
        case MESA_SHADER_COMPUTE:
                break;
        default:
                unreachable("unsupported shader stage");
        }

        ntq_setup_outputs(c);

        /* Find the main function and emit the body. */
        nir_foreach_function(function, c->s) {
                assert(strcmp(function->name, "main") == 0);
                assert(function->impl);
                ntq_emit_impl(c, function->impl);
        }
}

/**
 * When demoting a shader down to single-threaded, removes the THRSW
 * instructions (one will still be inserted at v3d_vir_to_qpu() for the
 * program end).
 */
static void
vir_remove_thrsw(struct v3d_compile *c)
{
        vir_for_each_block(block, c) {
                vir_for_each_inst_safe(inst, block) {
                        if (inst->qpu.sig.thrsw)
                                vir_remove_instruction(c, inst);
                }
        }

        c->last_thrsw = NULL;
}

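/* Ensures the program ends with a usable "last THRSW": on pre-4.1 hardware it
 * may instead demote the shader to single-threaded, and otherwise it emits or
 * marks the THRSW that will be paired up at QPU emit time.
 */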
void
vir_emit_last_thrsw(struct v3d_compile *c)
{
        /* On V3D before 4.1, we need a TMU op to be outstanding when thread
         * switching, so disable threads if we didn't do any TMU ops (each of
         * which would have emitted a THRSW).
         */
        if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {
                c->threads = 1;
                if (c->last_thrsw)
                        vir_remove_thrsw(c);
                return;
        }

        /* If we're threaded and the last THRSW was in conditional code, then
         * we need to emit another one so that we can flag it as the last
         * thrsw.
         */
        if (c->last_thrsw && !c->last_thrsw_at_top_level) {
                assert(c->devinfo->ver >= 41);
                vir_emit_thrsw(c);
        }

        /* If we're threaded, then we need to mark the last THRSW instruction
         * so we can emit a pair of them at QPU emit time.
         *
         * For V3D 4.x, we can spawn the non-fragment shaders already in the
         * post-last-THRSW state, so we can skip this.
         */
        if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {
                assert(c->devinfo->ver >= 41);
                vir_emit_thrsw(c);
        }

        if (c->last_thrsw)
                c->last_thrsw->is_last_thrsw = true;
}

/* There's a flag in the shader for "center W is needed for reasons other than
 * non-centroid varyings", so we just walk the program after VIR optimization
 * to see if it's used.  It should be harmless to set even if we only use
 * center W for varyings.
 */
static void
vir_check_payload_w(struct v3d_compile *c)
{
        if (c->s->info.stage != MESA_SHADER_FRAGMENT)
                return;

        vir_for_each_inst_inorder(inst, c) {
                for (int i = 0; i < vir_get_nsrc(inst); i++) {
                        if (inst->src[i].file == QFILE_REG &&
                            inst->src[i].index == 0) {
                                c->uses_center_w = true;
                                return;
                        }
                }
        }
}

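/* Top-level NIR-to-VIR entry point: emits VIR for the shader, runs VIR
 * optimization, then register-allocates, halving the thread count on failure
 * until allocation succeeds or the minimum thread count is reached, and
 * finally hands the result to v3d_vir_to_qpu().
 */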
void
v3d_nir_to_vir(struct v3d_compile *c)
{
        if (V3D_DEBUG & (V3D_DEBUG_NIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d NIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                nir_print_shader(c->s, stderr);
        }

        nir_to_vir(c);

        /* Emit the last THRSW before STVPM and TLB writes. */
        vir_emit_last_thrsw(c);

        switch (c->s->info.stage) {
        case MESA_SHADER_FRAGMENT:
                emit_frag_end(c);
                break;
        case MESA_SHADER_GEOMETRY:
                emit_geom_end(c);
                break;
        case MESA_SHADER_VERTEX:
                emit_vert_end(c);
                break;
        case MESA_SHADER_COMPUTE:
                break;
        default:
                unreachable("bad stage");
        }

        if (V3D_DEBUG & (V3D_DEBUG_VIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        vir_optimize(c);

        vir_check_payload_w(c);

        /* XXX perf: On VC4, we do a VIR-level instruction scheduling here.
         * We used that on that platform to pipeline TMU writes and reduce the
         * number of thread switches, as well as try (mostly successfully) to
         * reduce maximum register pressure to allow more threads.  We should
         * do something of that sort for V3D -- either instruction scheduling
         * here, or delay the THRSW and LDTMUs from our texture instructions
         * until the results are needed.
         */

        if (V3D_DEBUG & (V3D_DEBUG_VIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        /* Attempt to allocate registers for the temporaries.  If we fail,
         * reduce thread count and try again.
         */
        int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
        struct qpu_reg *temp_registers;
        while (true) {
                bool spilled;
                temp_registers = v3d_register_allocate(c, &spilled);
                if (spilled)
                        continue;

                if (temp_registers)
                        break;

                if (c->threads == min_threads &&
                    (V3D_DEBUG & V3D_DEBUG_RA)) {
                        fprintf(stderr,
                                "Failed to register allocate using %s\n",
                                c->fallback_scheduler ?
                                "the fallback scheduler" :
                                "the normal scheduler");

                        vir_dump(c);

                        char *shaderdb;
                        int ret = v3d_shaderdb_dump(c, &shaderdb);
                        if (ret > 0) {
                                fprintf(stderr, "%s\n", shaderdb);
                                free(shaderdb);
                        }
                }

                if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) {
                        if (V3D_DEBUG & V3D_DEBUG_PERF) {
                                fprintf(stderr,
                                        "Failed to register allocate %s at "
                                        "%d threads.\n", vir_get_stage_name(c),
                                        c->threads);
                        }
                        c->compilation_result =
                                V3D_COMPILATION_FAILED_REGISTER_ALLOCATION;
                        return;
                }

                c->spill_count = 0;
                c->threads /= 2;

                if (c->threads == 1)
                        vir_remove_thrsw(c);
        }

        if (c->spills &&
            (V3D_DEBUG & (V3D_DEBUG_VIR |
                          v3d_debug_flag_for_shader_stage(c->s->info.stage)))) {
                fprintf(stderr, "%s prog %d/%d spilled VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        v3d_vir_to_qpu(c, temp_registers);
}