GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/broadcom/compiler/nir_to_vir.c
1
/*
2
* Copyright © 2016 Broadcom
3
*
4
* Permission is hereby granted, free of charge, to any person obtaining a
5
* copy of this software and associated documentation files (the "Software"),
6
* to deal in the Software without restriction, including without limitation
7
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
* and/or sell copies of the Software, and to permit persons to whom the
9
* Software is furnished to do so, subject to the following conditions:
10
*
11
* The above copyright notice and this permission notice (including the next
12
* paragraph) shall be included in all copies or substantial portions of the
13
* Software.
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
* IN THE SOFTWARE.
22
*/
23
24
#include <inttypes.h>
25
#include "util/format/u_format.h"
26
#include "util/u_helpers.h"
27
#include "util/u_math.h"
28
#include "util/u_memory.h"
29
#include "util/ralloc.h"
30
#include "util/hash_table.h"
31
#include "compiler/nir/nir.h"
32
#include "compiler/nir/nir_builder.h"
33
#include "common/v3d_device_info.h"
34
#include "v3d_compiler.h"
35
36
/* We don't do any address packing. */
37
#define __gen_user_data void
38
#define __gen_address_type uint32_t
39
#define __gen_address_offset(reloc) (*reloc)
40
#define __gen_emit_reloc(cl, reloc)
41
#include "cle/v3d_packet_v41_pack.h"
42
43
#define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7)
44
#define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7)
45
#define GENERAL_TMU_LOOKUP_TYPE_8BIT_I (0 << 0)
46
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_I (1 << 0)
47
#define GENERAL_TMU_LOOKUP_TYPE_VEC2 (2 << 0)
48
#define GENERAL_TMU_LOOKUP_TYPE_VEC3 (3 << 0)
49
#define GENERAL_TMU_LOOKUP_TYPE_VEC4 (4 << 0)
50
#define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI (5 << 0)
51
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI (6 << 0)
52
#define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI (7 << 0)
53
54
#define V3D_TSY_SET_QUORUM 0
55
#define V3D_TSY_INC_WAITERS 1
56
#define V3D_TSY_DEC_WAITERS 2
57
#define V3D_TSY_INC_QUORUM 3
58
#define V3D_TSY_DEC_QUORUM 4
59
#define V3D_TSY_FREE_ALL 5
60
#define V3D_TSY_RELEASE 6
61
#define V3D_TSY_ACQUIRE 7
62
#define V3D_TSY_WAIT 8
63
#define V3D_TSY_WAIT_INC 9
64
#define V3D_TSY_WAIT_CHECK 10
65
#define V3D_TSY_WAIT_INC_CHECK 11
66
#define V3D_TSY_WAIT_CV 12
67
#define V3D_TSY_INC_SEMAPHORE 13
68
#define V3D_TSY_DEC_SEMAPHORE 14
69
#define V3D_TSY_SET_QUORUM_FREE_ALL 15
70
71
enum v3d_tmu_op_type
72
{
73
V3D_TMU_OP_TYPE_REGULAR,
74
V3D_TMU_OP_TYPE_ATOMIC,
75
V3D_TMU_OP_TYPE_CACHE
76
};
77
78
static enum v3d_tmu_op_type
79
v3d_tmu_get_type_from_op(uint32_t tmu_op, bool is_write)
80
{
81
switch(tmu_op) {
82
case V3D_TMU_OP_WRITE_ADD_READ_PREFETCH:
83
case V3D_TMU_OP_WRITE_SUB_READ_CLEAR:
84
case V3D_TMU_OP_WRITE_XCHG_READ_FLUSH:
85
case V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH:
86
case V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR:
87
return is_write ? V3D_TMU_OP_TYPE_ATOMIC : V3D_TMU_OP_TYPE_CACHE;
88
case V3D_TMU_OP_WRITE_UMAX:
89
case V3D_TMU_OP_WRITE_SMIN:
90
case V3D_TMU_OP_WRITE_SMAX:
91
assert(is_write);
92
FALLTHROUGH;
93
case V3D_TMU_OP_WRITE_AND_READ_INC:
94
case V3D_TMU_OP_WRITE_OR_READ_DEC:
95
case V3D_TMU_OP_WRITE_XOR_READ_NOT:
96
return V3D_TMU_OP_TYPE_ATOMIC;
97
case V3D_TMU_OP_REGULAR:
98
return V3D_TMU_OP_TYPE_REGULAR;
99
100
default:
101
unreachable("Unknown tmu_op\n");
102
}
103
}
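/* Reader note (added, not part of the original source): the is_write flag
 * matters because several TMU ops are overloaded. For example,
 * V3D_TMU_OP_WRITE_ADD_READ_PREFETCH acts as an atomic add on the write
 * path but as a cache prefetch on the read path, which is why the first
 * group of cases above returns ATOMIC or CACHE depending on is_write.
 */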
104
static void
105
ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
106
107
static void
108
resize_qreg_array(struct v3d_compile *c,
109
struct qreg **regs,
110
uint32_t *size,
111
uint32_t decl_size)
112
{
113
if (*size >= decl_size)
114
return;
115
116
uint32_t old_size = *size;
117
*size = MAX2(*size * 2, decl_size);
118
*regs = reralloc(c, *regs, struct qreg, *size);
119
if (!*regs) {
120
fprintf(stderr, "Malloc failure\n");
121
abort();
122
}
123
124
for (uint32_t i = old_size; i < *size; i++)
125
(*regs)[i] = c->undef;
126
}
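/* Illustrative example (added note): with *size == 4 and decl_size == 5 the
 * array grows to MAX2(8, 5) == 8 entries, and the new entries 4..7 are
 * initialized to c->undef.
 */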
127
128
static void
129
resize_interp_array(struct v3d_compile *c,
130
struct v3d_interp_input **regs,
131
uint32_t *size,
132
uint32_t decl_size)
133
{
134
if (*size >= decl_size)
135
return;
136
137
uint32_t old_size = *size;
138
*size = MAX2(*size * 2, decl_size);
139
*regs = reralloc(c, *regs, struct v3d_interp_input, *size);
140
if (!*regs) {
141
fprintf(stderr, "Malloc failure\n");
142
abort();
143
}
144
145
for (uint32_t i = old_size; i < *size; i++) {
146
(*regs)[i].vp = c->undef;
147
(*regs)[i].C = c->undef;
148
}
149
}
150
151
void
152
vir_emit_thrsw(struct v3d_compile *c)
153
{
154
if (c->threads == 1)
155
return;
156
157
/* Always thread switch after each texture operation for now.
158
*
159
* We could do better by batching a bunch of texture fetches up and
160
* then doing one thread switch and collecting all their results
161
* afterward.
162
*/
163
c->last_thrsw = vir_NOP(c);
164
c->last_thrsw->qpu.sig.thrsw = true;
165
c->last_thrsw_at_top_level = !c->in_control_flow;
166
167
/* We need to lock the scoreboard before any TLB access happens. If this
168
* thread switch comes after we have emitted a tlb load, then it means
169
* that we can't lock on the last thread switch any more.
170
*/
171
if (c->emitted_tlb_load)
172
c->lock_scoreboard_on_first_thrsw = true;
173
}
174
175
uint32_t
176
v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
177
{
178
if (nir_src_is_const(instr->src[src])) {
179
int64_t add_val = nir_src_as_int(instr->src[src]);
180
if (add_val == 1)
181
return V3D_TMU_OP_WRITE_AND_READ_INC;
182
else if (add_val == -1)
183
return V3D_TMU_OP_WRITE_OR_READ_DEC;
184
}
185
186
return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
187
}
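/* Worked example (added note): for nir_intrinsic_ssbo_atomic_add with a
 * constant operand of 1 or -1 this returns the dedicated increment or
 * decrement op, and the caller can then skip sending the addend through
 * TMUD (see atomic_add_replaced in ntq_emit_tmu_general below); any other
 * addend falls back to the generic WRITE_ADD op.
 */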
188
189
static uint32_t
190
v3d_general_tmu_op(nir_intrinsic_instr *instr)
191
{
192
switch (instr->intrinsic) {
193
case nir_intrinsic_load_ssbo:
194
case nir_intrinsic_load_ubo:
195
case nir_intrinsic_load_uniform:
196
case nir_intrinsic_load_shared:
197
case nir_intrinsic_load_scratch:
198
case nir_intrinsic_store_ssbo:
199
case nir_intrinsic_store_shared:
200
case nir_intrinsic_store_scratch:
201
return V3D_TMU_OP_REGULAR;
202
case nir_intrinsic_ssbo_atomic_add:
203
return v3d_get_op_for_atomic_add(instr, 2);
204
case nir_intrinsic_shared_atomic_add:
205
return v3d_get_op_for_atomic_add(instr, 1);
206
case nir_intrinsic_ssbo_atomic_imin:
207
case nir_intrinsic_shared_atomic_imin:
208
return V3D_TMU_OP_WRITE_SMIN;
209
case nir_intrinsic_ssbo_atomic_umin:
210
case nir_intrinsic_shared_atomic_umin:
211
return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
212
case nir_intrinsic_ssbo_atomic_imax:
213
case nir_intrinsic_shared_atomic_imax:
214
return V3D_TMU_OP_WRITE_SMAX;
215
case nir_intrinsic_ssbo_atomic_umax:
216
case nir_intrinsic_shared_atomic_umax:
217
return V3D_TMU_OP_WRITE_UMAX;
218
case nir_intrinsic_ssbo_atomic_and:
219
case nir_intrinsic_shared_atomic_and:
220
return V3D_TMU_OP_WRITE_AND_READ_INC;
221
case nir_intrinsic_ssbo_atomic_or:
222
case nir_intrinsic_shared_atomic_or:
223
return V3D_TMU_OP_WRITE_OR_READ_DEC;
224
case nir_intrinsic_ssbo_atomic_xor:
225
case nir_intrinsic_shared_atomic_xor:
226
return V3D_TMU_OP_WRITE_XOR_READ_NOT;
227
case nir_intrinsic_ssbo_atomic_exchange:
228
case nir_intrinsic_shared_atomic_exchange:
229
return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
230
case nir_intrinsic_ssbo_atomic_comp_swap:
231
case nir_intrinsic_shared_atomic_comp_swap:
232
return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
233
default:
234
unreachable("unknown intrinsic op");
235
}
236
}
237
238
/**
239
* Checks if pipelining a new TMU operation requiring 'components' LDTMUs
240
* would overflow the Output TMU fifo.
241
*
242
* It is not allowed to overflow the Output fifo; however, we can overflow
243
* Input and Config fifos. Doing that makes the shader stall, but only for as
244
* long as it needs to be able to continue so it is better for pipelining to
245
* let the QPU stall on these if needed than trying to emit TMU flushes in the
246
* driver.
247
*/
248
bool
249
ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components)
250
{
251
if (c->tmu.flush_count >= MAX_TMU_QUEUE_SIZE)
252
return true;
253
254
return components > 0 &&
255
c->tmu.output_fifo_size + components > 16 / c->threads;
256
}
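/* Illustrative arithmetic (added note): the output fifo budget is
 * 16 / c->threads entries, so with c->threads == 4 a pending 4-component
 * LDTMU sequence only fits while c->tmu.output_fifo_size is still 0.
 */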
257
258
/**
259
* Emits the thread switch and LDTMU/TMUWT for all outstanding TMU operations,
260
* popping all TMU fifo entries.
261
*/
262
void
263
ntq_flush_tmu(struct v3d_compile *c)
264
{
265
if (c->tmu.flush_count == 0)
266
return;
267
268
vir_emit_thrsw(c);
269
270
bool emitted_tmuwt = false;
271
for (int i = 0; i < c->tmu.flush_count; i++) {
272
if (c->tmu.flush[i].component_mask > 0) {
273
nir_dest *dest = c->tmu.flush[i].dest;
274
assert(dest);
275
276
for (int j = 0; j < 4; j++) {
277
if (c->tmu.flush[i].component_mask & (1 << j)) {
278
ntq_store_dest(c, dest, j,
279
vir_MOV(c, vir_LDTMU(c)));
280
}
281
}
282
} else if (!emitted_tmuwt) {
283
vir_TMUWT(c);
284
emitted_tmuwt = true;
285
}
286
}
287
288
c->tmu.output_fifo_size = 0;
289
c->tmu.flush_count = 0;
290
_mesa_set_clear(c->tmu.outstanding_regs, NULL);
291
}
292
293
/**
294
* Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller
295
* is responsible for ensuring that doing this doesn't overflow the TMU fifos,
296
* and more specifically, the output fifo, since that can't stall.
297
*/
298
void
299
ntq_add_pending_tmu_flush(struct v3d_compile *c,
300
nir_dest *dest,
301
uint32_t component_mask)
302
{
303
const uint32_t num_components = util_bitcount(component_mask);
304
assert(!ntq_tmu_fifo_overflow(c, num_components));
305
306
if (num_components > 0) {
307
c->tmu.output_fifo_size += num_components;
308
if (!dest->is_ssa)
309
_mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg);
310
}
311
312
c->tmu.flush[c->tmu.flush_count].dest = dest;
313
c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
314
c->tmu.flush_count++;
315
316
if (c->disable_tmu_pipelining)
317
ntq_flush_tmu(c);
318
else if (c->tmu.flush_count > 1)
319
c->pipelined_any_tmu = true;
320
}
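/* Minimal caller sketch (added illustration, mirroring the real sequence in
 * ntq_emit_tmu_general below; not compiled):
 */
#if 0
if (ntq_tmu_fifo_overflow(c, dest_components))
        ntq_flush_tmu(c);

/* ...emit the TMUD/TMUA register writes for the operation... */

ntq_add_pending_tmu_flush(c, &instr->dest,
                          (1 << dest_components) - 1);
#endif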
321
322
enum emit_mode {
323
MODE_COUNT = 0,
324
MODE_EMIT,
325
MODE_LAST,
326
};
327
328
/**
329
* For a TMU general store instruction:
330
*
331
* In MODE_COUNT mode, records the number of TMU writes required and flushes
332
* any outstanding TMU operations the instruction depends on, but it doesn't
333
* emit any actual register writes.
334
*
335
* In MODE_EMIT mode, emits the data register writes required by the
336
* instruction.
337
*/
338
static void
339
emit_tmu_general_store_writes(struct v3d_compile *c,
340
enum emit_mode mode,
341
nir_intrinsic_instr *instr,
342
uint32_t base_const_offset,
343
uint32_t *writemask,
344
uint32_t *const_offset,
345
uint32_t *tmu_writes)
346
{
347
struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
348
349
/* Find the first set of consecutive components that
350
* are enabled in the writemask and emit the TMUD
351
* instructions for them.
352
*/
353
assert(*writemask != 0);
354
uint32_t first_component = ffs(*writemask) - 1;
355
uint32_t last_component = first_component;
356
while (*writemask & BITFIELD_BIT(last_component + 1))
357
last_component++;
358
359
assert(first_component <= last_component &&
360
last_component < instr->num_components);
361
362
for (int i = first_component; i <= last_component; i++) {
363
struct qreg data = ntq_get_src(c, instr->src[0], i);
364
if (mode == MODE_COUNT)
365
(*tmu_writes)++;
366
else
367
vir_MOV_dest(c, tmud, data);
368
}
369
370
if (mode == MODE_EMIT) {
371
/* Update the offset for the TMU write based on the
372
* first component we are writing.
373
*/
374
*const_offset = base_const_offset + first_component * 4;
375
376
/* Clear these components from the writemask */
377
uint32_t written_mask =
378
BITFIELD_RANGE(first_component, *tmu_writes);
379
(*writemask) &= ~written_mask;
380
}
381
}
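/* Worked example (added note): with writemask 0b0110 the consecutive run is
 * components 1..2, so MODE_COUNT counts two TMUD writes, and MODE_EMIT emits
 * the two data moves and sets *const_offset to
 * base_const_offset + first_component * 4.
 */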
382
383
/**
384
* For a TMU general atomic instruction:
385
*
386
* In MODE_COUNT mode, records the number of TMU writes required and flushes
387
* any outstanding TMU operations the instruction depends on, but it doesn't
388
* emit any actual register writes.
389
*
390
* In MODE_EMIT mode, emits the data register writes required by the
391
* instruction.
392
*/
393
static void
394
emit_tmu_general_atomic_writes(struct v3d_compile *c,
395
enum emit_mode mode,
396
nir_intrinsic_instr *instr,
397
uint32_t tmu_op,
398
bool has_index,
399
uint32_t *tmu_writes)
400
{
401
struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
402
403
struct qreg data = ntq_get_src(c, instr->src[1 + has_index], 0);
404
if (mode == MODE_COUNT)
405
(*tmu_writes)++;
406
else
407
vir_MOV_dest(c, tmud, data);
408
409
if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
410
data = ntq_get_src(c, instr->src[2 + has_index], 0);
411
if (mode == MODE_COUNT)
412
(*tmu_writes)++;
413
else
414
vir_MOV_dest(c, tmud, data);
415
}
416
}
417
418
/**
419
* For any TMU general instruction:
420
*
421
* In MODE_COUNT mode, records the number of TMU writes required to emit the
422
* address parameter and flushes any outstanding TMU operations the instruction
423
* depends on, but it doesn't emit any actual register writes.
424
*
425
* In MODE_EMIT mode, emits register writes required to emit the address.
426
*/
427
static void
428
emit_tmu_general_address_write(struct v3d_compile *c,
429
enum emit_mode mode,
430
nir_intrinsic_instr *instr,
431
uint32_t config,
432
bool dynamic_src,
433
int offset_src,
434
struct qreg base_offset,
435
uint32_t const_offset,
436
uint32_t *tmu_writes)
437
{
438
if (mode == MODE_COUNT) {
439
(*tmu_writes)++;
440
if (dynamic_src)
441
ntq_get_src(c, instr->src[offset_src], 0);
442
return;
443
}
444
445
if (vir_in_nonuniform_control_flow(c)) {
446
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
447
V3D_QPU_PF_PUSHZ);
448
}
449
450
struct qreg tmua;
451
if (config == ~0)
452
tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
453
else
454
tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
455
456
struct qinst *tmu;
457
if (dynamic_src) {
458
struct qreg offset = base_offset;
459
if (const_offset != 0) {
460
offset = vir_ADD(c, offset,
461
vir_uniform_ui(c, const_offset));
462
}
463
struct qreg data = ntq_get_src(c, instr->src[offset_src], 0);
464
tmu = vir_ADD_dest(c, tmua, offset, data);
465
} else {
466
if (const_offset != 0) {
467
tmu = vir_ADD_dest(c, tmua, base_offset,
468
vir_uniform_ui(c, const_offset));
469
} else {
470
tmu = vir_MOV_dest(c, tmua, base_offset);
471
}
472
}
473
474
if (config != ~0) {
475
tmu->uniform =
476
vir_get_uniform_index(c, QUNIFORM_CONSTANT, config);
477
}
478
479
if (vir_in_nonuniform_control_flow(c))
480
vir_set_cond(tmu, V3D_QPU_COND_IFA);
481
}
482
483
/**
484
* Implements indirect uniform loads and SSBO accesses through the TMU general
485
* memory access interface.
486
*/
487
static void
488
ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
489
bool is_shared_or_scratch)
490
{
491
uint32_t tmu_op = v3d_general_tmu_op(instr);
492
493
/* If we were able to replace atomic_add with an inc/dec, then we
494
* need (and can) do things slightly differently, like not loading the
495
* amount to add/sub, as that is implicit.
496
*/
497
bool atomic_add_replaced =
498
((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
499
instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
500
(tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
501
tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));
502
503
bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
504
instr->intrinsic == nir_intrinsic_store_scratch ||
505
instr->intrinsic == nir_intrinsic_store_shared);
506
507
bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
508
instr->intrinsic == nir_intrinsic_load_ubo ||
509
instr->intrinsic == nir_intrinsic_load_ssbo ||
510
instr->intrinsic == nir_intrinsic_load_scratch ||
511
instr->intrinsic == nir_intrinsic_load_shared);
512
513
if (!is_load)
514
c->tmu_dirty_rcl = true;
515
516
bool has_index = !is_shared_or_scratch;
517
518
int offset_src;
519
if (instr->intrinsic == nir_intrinsic_load_uniform) {
520
offset_src = 0;
521
} else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
522
instr->intrinsic == nir_intrinsic_load_ubo ||
523
instr->intrinsic == nir_intrinsic_load_scratch ||
524
instr->intrinsic == nir_intrinsic_load_shared ||
525
atomic_add_replaced) {
526
offset_src = 0 + has_index;
527
} else if (is_store) {
528
offset_src = 1 + has_index;
529
} else {
530
offset_src = 0 + has_index;
531
}
532
533
bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
534
uint32_t const_offset = 0;
535
if (!dynamic_src)
536
const_offset = nir_src_as_uint(instr->src[offset_src]);
537
538
struct qreg base_offset;
539
if (instr->intrinsic == nir_intrinsic_load_uniform) {
540
const_offset += nir_intrinsic_base(instr);
541
base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
542
v3d_unit_data_create(0, const_offset));
543
const_offset = 0;
544
} else if (instr->intrinsic == nir_intrinsic_load_ubo) {
545
uint32_t index = nir_src_as_uint(instr->src[0]);
546
/* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index
547
* shifted up by 1 (0 is gallium's constant buffer 0).
548
*/
549
if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
550
index++;
551
552
base_offset =
553
vir_uniform(c, QUNIFORM_UBO_ADDR,
554
v3d_unit_data_create(index, const_offset));
555
const_offset = 0;
556
} else if (is_shared_or_scratch) {
557
/* Shared and scratch variables have no buffer index, and all
558
* start from a common base that we set up at the start of
559
* dispatch.
560
*/
561
if (instr->intrinsic == nir_intrinsic_load_scratch ||
562
instr->intrinsic == nir_intrinsic_store_scratch) {
563
base_offset = c->spill_base;
564
} else {
565
base_offset = c->cs_shared_offset;
566
const_offset += nir_intrinsic_base(instr);
567
}
568
} else {
569
base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
570
nir_src_as_uint(instr->src[is_store ?
571
1 : 0]));
572
}
573
574
/* We are ready to emit TMU register writes now, but before we actually
575
* emit them we need to flush outstanding TMU operations if any of our
576
* writes reads from the result of an outstanding TMU operation before
577
* we start the TMU sequence for this operation, since otherwise the
578
* flush could happen in the middle of the TMU sequence we are about to
579
* emit, which is illegal. To do this we run this logic twice, the
580
* first time it will count required register writes and flush pending
581
* TMU requests if necessary due to a dependency, and the second one
582
* will emit the actual TMU writes.
583
*/
584
const uint32_t dest_components = nir_intrinsic_dest_components(instr);
585
uint32_t base_const_offset = const_offset;
586
uint32_t writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
587
uint32_t tmu_writes = 0;
588
for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
589
assert(mode == MODE_COUNT || tmu_writes > 0);
590
591
if (is_store) {
592
emit_tmu_general_store_writes(c, mode, instr,
593
base_const_offset,
594
&writemask,
595
&const_offset,
596
&tmu_writes);
597
} else if (!is_load && !atomic_add_replaced) {
598
emit_tmu_general_atomic_writes(c, mode, instr,
599
tmu_op, has_index,
600
&tmu_writes);
601
}
602
603
/* For atomics we use 32bit, except for CMPXCHG, which needs
604
* to use VEC2. For the rest of the cases we use the number of
605
* tmud writes we did to decide the type. For cache operations
606
* the type is ignored.
607
*/
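/* Added worked example: a two-component load outside non-uniform control
 * flow ends up with perquad == GENERAL_TMU_LOOKUP_PER_QUAD and the type
 * bits set to GENERAL_TMU_LOOKUP_TYPE_VEC2, i.e.
 * config = 0xffffff00 | (tmu_op << 3) | perquad | GENERAL_TMU_LOOKUP_TYPE_VEC2.
 */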
608
uint32_t config = 0;
609
if (mode == MODE_EMIT) {
610
uint32_t num_components;
611
if (is_load || atomic_add_replaced) {
612
num_components = instr->num_components;
613
} else {
614
assert(tmu_writes > 0);
615
num_components = tmu_writes - 1;
616
}
617
bool is_atomic =
618
v3d_tmu_get_type_from_op(tmu_op, !is_load) ==
619
V3D_TMU_OP_TYPE_ATOMIC;
620
621
uint32_t perquad =
622
is_load && !vir_in_nonuniform_control_flow(c)
623
? GENERAL_TMU_LOOKUP_PER_QUAD
624
: GENERAL_TMU_LOOKUP_PER_PIXEL;
625
config = 0xffffff00 | tmu_op << 3 | perquad;
626
627
if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
628
config |= GENERAL_TMU_LOOKUP_TYPE_VEC2;
629
} else if (is_atomic || num_components == 1) {
630
config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
631
} else {
632
config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
633
num_components - 2;
634
}
635
}
636
637
emit_tmu_general_address_write(c, mode, instr, config,
638
dynamic_src, offset_src,
639
base_offset, const_offset,
640
&tmu_writes);
641
642
assert(tmu_writes > 0);
643
if (mode == MODE_COUNT) {
644
/* Make sure we won't exceed the 16-entry TMU
645
* fifo if each thread is storing at the same
646
* time.
647
*/
648
while (tmu_writes > 16 / c->threads)
649
c->threads /= 2;
650
651
/* If pipelining this TMU operation would
652
* overflow TMU fifos, we need to flush.
653
*/
654
if (ntq_tmu_fifo_overflow(c, dest_components))
655
ntq_flush_tmu(c);
656
} else {
657
/* Delay emission of the thread switch and
658
* LDTMU/TMUWT until we really need to do it to
659
* improve pipelining.
660
*/
661
const uint32_t component_mask =
662
(1 << dest_components) - 1;
663
ntq_add_pending_tmu_flush(c, &instr->dest,
664
component_mask);
665
}
666
}
667
668
/* nir_lower_wrmasks should've ensured that any writemask on a store
669
* operation only has consecutive bits set, in which case we should've
670
* processed the full writemask above.
671
*/
672
assert(writemask == 0);
673
}
674
675
static struct qreg *
676
ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
677
{
678
struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
679
def->num_components);
680
_mesa_hash_table_insert(c->def_ht, def, qregs);
681
return qregs;
682
}
683
684
static bool
685
is_ld_signal(const struct v3d_qpu_sig *sig)
686
{
687
return (sig->ldunif ||
688
sig->ldunifa ||
689
sig->ldunifrf ||
690
sig->ldunifarf ||
691
sig->ldtmu ||
692
sig->ldvary ||
693
sig->ldvpm ||
694
sig->ldtlb ||
695
sig->ldtlbu);
696
}
697
698
static inline bool
699
is_ldunif_signal(const struct v3d_qpu_sig *sig)
700
{
701
return sig->ldunif || sig->ldunifrf;
702
}
703
704
/**
705
* This function is responsible for getting VIR results into the associated
706
* storage for a NIR instruction.
707
*
708
* If it's a NIR SSA def, then we just set the associated hash table entry to
709
* the new result.
710
*
711
* If it's a NIR reg, then we need to update the existing qreg assigned to the
712
* NIR destination with the incoming value. To do that without introducing
713
* new MOVs, we require that the incoming qreg either be a uniform, or be
714
* SSA-defined by the previous VIR instruction in the block and rewritable by
715
* this function. That lets us sneak ahead and insert the SF flag beforehand
716
* (knowing that the previous instruction doesn't depend on flags) and rewrite
717
* its destination to be the NIR reg's destination
718
*/
719
void
720
ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
721
struct qreg result)
722
{
723
struct qinst *last_inst = NULL;
724
if (!list_is_empty(&c->cur_block->instructions))
725
last_inst = (struct qinst *)c->cur_block->instructions.prev;
726
727
bool is_reused_uniform =
728
is_ldunif_signal(&c->defs[result.index]->qpu.sig) &&
729
last_inst != c->defs[result.index];
730
731
assert(result.file == QFILE_TEMP && last_inst &&
732
(last_inst == c->defs[result.index] || is_reused_uniform));
733
734
if (dest->is_ssa) {
735
assert(chan < dest->ssa.num_components);
736
737
struct qreg *qregs;
738
struct hash_entry *entry =
739
_mesa_hash_table_search(c->def_ht, &dest->ssa);
740
741
if (entry)
742
qregs = entry->data;
743
else
744
qregs = ntq_init_ssa_def(c, &dest->ssa);
745
746
qregs[chan] = result;
747
} else {
748
nir_register *reg = dest->reg.reg;
749
assert(dest->reg.base_offset == 0);
750
assert(reg->num_array_elems == 0);
751
struct hash_entry *entry =
752
_mesa_hash_table_search(c->def_ht, reg);
753
struct qreg *qregs = entry->data;
754
755
/* If the previous instruction can't be predicated for
756
* the store into the nir_register, then emit a MOV
757
* that can be.
758
*/
759
if (is_reused_uniform ||
760
(vir_in_nonuniform_control_flow(c) &&
761
is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig))) {
762
result = vir_MOV(c, result);
763
last_inst = c->defs[result.index];
764
}
765
766
/* We know they're both temps, so just rewrite index. */
767
c->defs[last_inst->dst.index] = NULL;
768
last_inst->dst.index = qregs[chan].index;
769
770
/* If we're in control flow, then make this update of the reg
771
* conditional on the execution mask.
772
*/
773
if (vir_in_nonuniform_control_flow(c)) {
774
last_inst->dst.index = qregs[chan].index;
775
776
/* Set the flags to the current exec mask.
777
*/
778
c->cursor = vir_before_inst(last_inst);
779
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
780
V3D_QPU_PF_PUSHZ);
781
c->cursor = vir_after_inst(last_inst);
782
783
vir_set_cond(last_inst, V3D_QPU_COND_IFA);
784
}
785
}
786
}
787
788
/**
789
* This looks up the qreg associated with a particular ssa/reg used as a source
790
* in any instruction.
791
*
792
* It is expected that the definition for any NIR value read as a source has
793
* been emitted by a previous instruction, however, in the case of TMU
794
* operations we may have postponed emission of the thread switch and LDTMUs
795
* required to read the TMU results until the results are actually used to
796
* improve pipelining, which then would lead to us not finding them here
797
* (for SSA defs) or finding them in the list of registers awaiting a TMU flush
798
* (for registers), meaning that we need to flush outstanding TMU operations
799
* to read the correct value.
800
*/
801
struct qreg
802
ntq_get_src(struct v3d_compile *c, nir_src src, int i)
803
{
804
struct hash_entry *entry;
805
if (src.is_ssa) {
806
assert(i < src.ssa->num_components);
807
808
entry = _mesa_hash_table_search(c->def_ht, src.ssa);
809
if (!entry) {
810
ntq_flush_tmu(c);
811
entry = _mesa_hash_table_search(c->def_ht, src.ssa);
812
}
813
} else {
814
nir_register *reg = src.reg.reg;
815
assert(reg->num_array_elems == 0);
816
assert(src.reg.base_offset == 0);
817
assert(i < reg->num_components);
818
819
if (_mesa_set_search(c->tmu.outstanding_regs, reg))
820
ntq_flush_tmu(c);
821
entry = _mesa_hash_table_search(c->def_ht, reg);
822
}
823
assert(entry);
824
825
struct qreg *qregs = entry->data;
826
return qregs[i];
827
}
828
829
static struct qreg
830
ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
831
unsigned src)
832
{
833
assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
834
unsigned chan = ffs(instr->dest.write_mask) - 1;
835
struct qreg r = ntq_get_src(c, instr->src[src].src,
836
instr->src[src].swizzle[chan]);
837
838
assert(!instr->src[src].abs);
839
assert(!instr->src[src].negate);
840
841
return r;
842
};
843
844
static struct qreg
845
ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
846
{
847
return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
848
}
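/* Added note: e.g. a 16-texel dimension queried at LOD 5 yields
 * MAX(16 >> 5, 1) == 1, matching the usual mip-chain clamp to one texel.
 */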
849
850
static void
851
ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
852
{
853
unsigned unit = instr->texture_index;
854
int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
855
int dest_size = nir_tex_instr_dest_size(instr);
856
857
struct qreg lod = c->undef;
858
if (lod_index != -1)
859
lod = ntq_get_src(c, instr->src[lod_index].src, 0);
860
861
for (int i = 0; i < dest_size; i++) {
862
assert(i < 3);
863
enum quniform_contents contents;
864
865
if (instr->is_array && i == dest_size - 1)
866
contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
867
else
868
contents = QUNIFORM_TEXTURE_WIDTH + i;
869
870
struct qreg size = vir_uniform(c, contents, unit);
871
872
switch (instr->sampler_dim) {
873
case GLSL_SAMPLER_DIM_1D:
874
case GLSL_SAMPLER_DIM_2D:
875
case GLSL_SAMPLER_DIM_MS:
876
case GLSL_SAMPLER_DIM_3D:
877
case GLSL_SAMPLER_DIM_CUBE:
878
case GLSL_SAMPLER_DIM_BUF:
879
/* Don't minify the array size. */
880
if (!(instr->is_array && i == dest_size - 1)) {
881
size = ntq_minify(c, size, lod);
882
}
883
break;
884
885
case GLSL_SAMPLER_DIM_RECT:
886
/* There's no LOD field for rects */
887
break;
888
889
default:
890
unreachable("Bad sampler type");
891
}
892
893
ntq_store_dest(c, &instr->dest, i, size);
894
}
895
}
896
897
static void
898
ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
899
{
900
unsigned unit = instr->texture_index;
901
902
/* Since each texture sampling op requires uploading uniforms to
903
* reference the texture, there's no HW support for texture size and
904
* you just upload uniforms containing the size.
905
*/
906
switch (instr->op) {
907
case nir_texop_query_levels:
908
ntq_store_dest(c, &instr->dest, 0,
909
vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
910
return;
911
case nir_texop_texture_samples:
912
ntq_store_dest(c, &instr->dest, 0,
913
vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit));
914
return;
915
case nir_texop_txs:
916
ntq_emit_txs(c, instr);
917
return;
918
default:
919
break;
920
}
921
922
if (c->devinfo->ver >= 40)
923
v3d40_vir_emit_tex(c, instr);
924
else
925
v3d33_vir_emit_tex(c, instr);
926
}
927
928
static struct qreg
929
ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
930
{
931
struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
932
if (is_cos)
933
input = vir_FADD(c, input, vir_uniform_f(c, 0.5));
934
935
struct qreg periods = vir_FROUND(c, input);
936
struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods));
937
return vir_XOR(c, sin_output, vir_SHL(c,
938
vir_FTOIN(c, periods),
939
vir_uniform_ui(c, -1)));
940
}
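/* Added worked example of the range reduction above: sin(x) is evaluated as
 * sin(pi * t) with t = x / pi. For x = 3*pi/4, t = 0.75, periods = 1, and
 * SIN(t - periods) gives sin(-pi/4) = -sqrt(2)/2. XORing in the shifted
 * (odd) period count flips the sign back to +sqrt(2)/2, which is sin(3*pi/4).
 * The SHL by -1 is presumably equivalent to a shift by 31 once the hardware
 * masks the shift amount, moving bit 0 of the integer period count into the
 * float sign bit.
 */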
941
942
static struct qreg
943
ntq_fsign(struct v3d_compile *c, struct qreg src)
944
{
945
struct qreg t = vir_get_temp(c);
946
947
vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
948
vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ);
949
vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
950
vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN);
951
vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
952
return vir_MOV(c, t);
953
}
954
955
static void
956
emit_fragcoord_input(struct v3d_compile *c, int attr)
957
{
958
c->inputs[attr * 4 + 0] = vir_FXCD(c);
959
c->inputs[attr * 4 + 1] = vir_FYCD(c);
960
c->inputs[attr * 4 + 2] = c->payload_z;
961
c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
962
}
963
964
static struct qreg
965
emit_smooth_varying(struct v3d_compile *c,
966
struct qreg vary, struct qreg w, struct qreg r5)
967
{
968
return vir_FADD(c, vir_FMUL(c, vary, w), r5);
969
}
970
971
static struct qreg
972
emit_noperspective_varying(struct v3d_compile *c,
973
struct qreg vary, struct qreg r5)
974
{
975
return vir_FADD(c, vir_MOV(c, vary), r5);
976
}
977
978
static struct qreg
979
emit_flat_varying(struct v3d_compile *c,
980
struct qreg vary, struct qreg r5)
981
{
982
vir_MOV_dest(c, c->undef, vary);
983
return vir_MOV(c, r5);
984
}
985
986
static struct qreg
987
emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
988
int8_t input_idx, uint8_t swizzle, int array_index)
989
{
990
struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
991
struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
992
993
struct qinst *ldvary = NULL;
994
struct qreg vary;
995
if (c->devinfo->ver >= 41) {
996
ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
997
c->undef, c->undef);
998
ldvary->qpu.sig.ldvary = true;
999
vary = vir_emit_def(c, ldvary);
1000
} else {
1001
vir_NOP(c)->qpu.sig.ldvary = true;
1002
vary = r3;
1003
}
1004
1005
/* Store the input value before interpolation so we can implement
1006
* GLSL's interpolateAt functions if the shader uses them.
1007
*/
1008
if (input_idx >= 0) {
1009
assert(var);
1010
c->interp[input_idx].vp = vary;
1011
c->interp[input_idx].C = vir_MOV(c, r5);
1012
c->interp[input_idx].mode = var->data.interpolation;
1013
}
1014
1015
/* For gl_PointCoord input or distance along a line, we'll be called
1016
* with no nir_variable, and we don't count toward VPM size so we
1017
* don't track an input slot.
1018
*/
1019
if (!var) {
1020
assert(input_idx < 0);
1021
return emit_smooth_varying(c, vary, c->payload_w, r5);
1022
}
1023
1024
int i = c->num_inputs++;
1025
c->input_slots[i] =
1026
v3d_slot_from_slot_and_component(var->data.location +
1027
array_index, swizzle);
1028
1029
struct qreg result;
1030
switch (var->data.interpolation) {
1031
case INTERP_MODE_NONE:
1032
case INTERP_MODE_SMOOTH:
1033
if (var->data.centroid) {
1034
BITSET_SET(c->centroid_flags, i);
1035
result = emit_smooth_varying(c, vary,
1036
c->payload_w_centroid, r5);
1037
} else {
1038
result = emit_smooth_varying(c, vary, c->payload_w, r5);
1039
}
1040
break;
1041
1042
case INTERP_MODE_NOPERSPECTIVE:
1043
BITSET_SET(c->noperspective_flags, i);
1044
result = emit_noperspective_varying(c, vary, r5);
1045
break;
1046
1047
case INTERP_MODE_FLAT:
1048
BITSET_SET(c->flat_shade_flags, i);
1049
result = emit_flat_varying(c, vary, r5);
1050
break;
1051
1052
default:
1053
unreachable("Bad interp mode");
1054
}
1055
1056
if (input_idx >= 0)
1057
c->inputs[input_idx] = result;
1058
return result;
1059
}
1060
1061
static void
1062
emit_fragment_input(struct v3d_compile *c, int base_attr, nir_variable *var,
1063
int array_index, unsigned nelem)
1064
{
1065
for (int i = 0; i < nelem ; i++) {
1066
int chan = var->data.location_frac + i;
1067
int input_idx = (base_attr + array_index) * 4 + chan;
1068
emit_fragment_varying(c, var, input_idx, chan, array_index);
1069
}
1070
}
1071
1072
static void
1073
emit_compact_fragment_input(struct v3d_compile *c, int attr, nir_variable *var,
1074
int array_index)
1075
{
1076
/* Compact variables are scalar arrays where each set of 4 elements
1077
* consumes a single location.
1078
*/
1079
int loc_offset = array_index / 4;
1080
int chan = var->data.location_frac + array_index % 4;
1081
int input_idx = (attr + loc_offset) * 4 + chan;
1082
emit_fragment_varying(c, var, input_idx, chan, loc_offset);
1083
}
1084
1085
static void
1086
add_output(struct v3d_compile *c,
1087
uint32_t decl_offset,
1088
uint8_t slot,
1089
uint8_t swizzle)
1090
{
1091
uint32_t old_array_size = c->outputs_array_size;
1092
resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
1093
decl_offset + 1);
1094
1095
if (old_array_size != c->outputs_array_size) {
1096
c->output_slots = reralloc(c,
1097
c->output_slots,
1098
struct v3d_varying_slot,
1099
c->outputs_array_size);
1100
}
1101
1102
c->output_slots[decl_offset] =
1103
v3d_slot_from_slot_and_component(slot, swizzle);
1104
}
1105
1106
/**
1107
* If compare_instr is a valid comparison instruction, emits the
1108
* compare_instr's comparison and returns the sel_instr's return value based
1109
* on the compare_instr's result.
1110
*/
1111
static bool
1112
ntq_emit_comparison(struct v3d_compile *c,
1113
nir_alu_instr *compare_instr,
1114
enum v3d_qpu_cond *out_cond)
1115
{
1116
struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
1117
struct qreg src1;
1118
if (nir_op_infos[compare_instr->op].num_inputs > 1)
1119
src1 = ntq_get_alu_src(c, compare_instr, 1);
1120
bool cond_invert = false;
1121
struct qreg nop = vir_nop_reg();
1122
1123
switch (compare_instr->op) {
1124
case nir_op_feq32:
1125
case nir_op_seq:
1126
vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1127
break;
1128
case nir_op_ieq32:
1129
vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1130
break;
1131
1132
case nir_op_fneu32:
1133
case nir_op_sne:
1134
vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1135
cond_invert = true;
1136
break;
1137
case nir_op_ine32:
1138
vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1139
cond_invert = true;
1140
break;
1141
1142
case nir_op_fge32:
1143
case nir_op_sge:
1144
vir_set_pf(c, vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1145
break;
1146
case nir_op_ige32:
1147
vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1148
cond_invert = true;
1149
break;
1150
case nir_op_uge32:
1151
vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
1152
cond_invert = true;
1153
break;
1154
1155
case nir_op_slt:
1156
case nir_op_flt32:
1157
vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN);
1158
break;
1159
case nir_op_ilt32:
1160
vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1161
break;
1162
case nir_op_ult32:
1163
vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
1164
break;
1165
1166
case nir_op_i2b32:
1167
vir_set_pf(c, vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
1168
cond_invert = true;
1169
break;
1170
1171
case nir_op_f2b32:
1172
vir_set_pf(c, vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
1173
cond_invert = true;
1174
break;
1175
1176
default:
1177
return false;
1178
}
1179
1180
*out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA;
1181
1182
return true;
1183
}
1184
1185
/* Finds an ALU instruction that generates our src value that could
1186
* (potentially) be greedily emitted in the consuming instruction.
1187
*/
1188
static struct nir_alu_instr *
1189
ntq_get_alu_parent(nir_src src)
1190
{
1191
if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu)
1192
return NULL;
1193
nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
1194
if (!instr)
1195
return NULL;
1196
1197
/* If the ALU instr's srcs are non-SSA, then we would have to avoid
1198
* moving emission of the ALU instr down past another write of the
1199
* src.
1200
*/
1201
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1202
if (!instr->src[i].src.is_ssa)
1203
return NULL;
1204
}
1205
1206
return instr;
1207
}
1208
1209
/* Turns a NIR bool into a condition code to predicate on. */
1210
static enum v3d_qpu_cond
1211
ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src)
1212
{
1213
struct qreg qsrc = ntq_get_src(c, src, 0);
1214
/* skip if we already have src in the flags */
1215
if (qsrc.file == QFILE_TEMP && c->flags_temp == qsrc.index)
1216
return c->flags_cond;
1217
1218
nir_alu_instr *compare = ntq_get_alu_parent(src);
1219
if (!compare)
1220
goto out;
1221
1222
enum v3d_qpu_cond cond;
1223
if (ntq_emit_comparison(c, compare, &cond))
1224
return cond;
1225
1226
out:
1227
1228
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)),
1229
V3D_QPU_PF_PUSHZ);
1230
return V3D_QPU_COND_IFNA;
1231
}
1232
1233
static struct qreg
1234
ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond)
1235
{
1236
struct qreg result =
1237
vir_MOV(c, vir_SEL(c, cond,
1238
vir_uniform_ui(c, ~0),
1239
vir_uniform_ui(c, 0)));
1240
c->flags_temp = result.index;
1241
c->flags_cond = cond;
1242
return result;
1243
}
1244
1245
static void
1246
ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
1247
{
1248
/* This should always be lowered to ALU operations for V3D. */
1249
assert(!instr->dest.saturate);
1250
1251
/* Vectors are special in that they have non-scalarized writemasks,
1252
* and just take the first swizzle channel for each argument in order
1253
* into each writemask channel.
1254
*/
1255
if (instr->op == nir_op_vec2 ||
1256
instr->op == nir_op_vec3 ||
1257
instr->op == nir_op_vec4) {
1258
struct qreg srcs[4];
1259
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1260
srcs[i] = ntq_get_src(c, instr->src[i].src,
1261
instr->src[i].swizzle[0]);
1262
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1263
ntq_store_dest(c, &instr->dest.dest, i,
1264
vir_MOV(c, srcs[i]));
1265
return;
1266
}
1267
1268
/* General case: We can just grab the one used channel per src. */
1269
struct qreg src[nir_op_infos[instr->op].num_inputs];
1270
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1271
src[i] = ntq_get_alu_src(c, instr, i);
1272
}
1273
1274
struct qreg result;
1275
1276
switch (instr->op) {
1277
case nir_op_mov:
1278
result = vir_MOV(c, src[0]);
1279
break;
1280
1281
case nir_op_fneg:
1282
result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31));
1283
break;
1284
case nir_op_ineg:
1285
result = vir_NEG(c, src[0]);
1286
break;
1287
1288
case nir_op_fmul:
1289
result = vir_FMUL(c, src[0], src[1]);
1290
break;
1291
case nir_op_fadd:
1292
result = vir_FADD(c, src[0], src[1]);
1293
break;
1294
case nir_op_fsub:
1295
result = vir_FSUB(c, src[0], src[1]);
1296
break;
1297
case nir_op_fmin:
1298
result = vir_FMIN(c, src[0], src[1]);
1299
break;
1300
case nir_op_fmax:
1301
result = vir_FMAX(c, src[0], src[1]);
1302
break;
1303
1304
case nir_op_f2i32: {
1305
nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src);
1306
if (src0_alu && src0_alu->op == nir_op_fround_even) {
1307
result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0));
1308
} else {
1309
result = vir_FTOIZ(c, src[0]);
1310
}
1311
break;
1312
}
1313
1314
case nir_op_f2u32:
1315
result = vir_FTOUZ(c, src[0]);
1316
break;
1317
case nir_op_i2f32:
1318
result = vir_ITOF(c, src[0]);
1319
break;
1320
case nir_op_u2f32:
1321
result = vir_UTOF(c, src[0]);
1322
break;
1323
case nir_op_b2f32:
1324
result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
1325
break;
1326
case nir_op_b2i32:
1327
result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
1328
break;
1329
1330
case nir_op_iadd:
1331
result = vir_ADD(c, src[0], src[1]);
1332
break;
1333
case nir_op_ushr:
1334
result = vir_SHR(c, src[0], src[1]);
1335
break;
1336
case nir_op_isub:
1337
result = vir_SUB(c, src[0], src[1]);
1338
break;
1339
case nir_op_ishr:
1340
result = vir_ASR(c, src[0], src[1]);
1341
break;
1342
case nir_op_ishl:
1343
result = vir_SHL(c, src[0], src[1]);
1344
break;
1345
case nir_op_imin:
1346
result = vir_MIN(c, src[0], src[1]);
1347
break;
1348
case nir_op_umin:
1349
result = vir_UMIN(c, src[0], src[1]);
1350
break;
1351
case nir_op_imax:
1352
result = vir_MAX(c, src[0], src[1]);
1353
break;
1354
case nir_op_umax:
1355
result = vir_UMAX(c, src[0], src[1]);
1356
break;
1357
case nir_op_iand:
1358
result = vir_AND(c, src[0], src[1]);
1359
break;
1360
case nir_op_ior:
1361
result = vir_OR(c, src[0], src[1]);
1362
break;
1363
case nir_op_ixor:
1364
result = vir_XOR(c, src[0], src[1]);
1365
break;
1366
case nir_op_inot:
1367
result = vir_NOT(c, src[0]);
1368
break;
1369
1370
case nir_op_ufind_msb:
1371
result = vir_SUB(c, vir_uniform_ui(c, 31), vir_CLZ(c, src[0]));
1372
break;
1373
1374
case nir_op_imul:
1375
result = vir_UMUL(c, src[0], src[1]);
1376
break;
1377
1378
case nir_op_seq:
1379
case nir_op_sne:
1380
case nir_op_sge:
1381
case nir_op_slt: {
1382
enum v3d_qpu_cond cond;
1383
ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1384
assert(ok);
1385
result = vir_MOV(c, vir_SEL(c, cond,
1386
vir_uniform_f(c, 1.0),
1387
vir_uniform_f(c, 0.0)));
1388
c->flags_temp = result.index;
1389
c->flags_cond = cond;
1390
break;
1391
}
1392
1393
case nir_op_i2b32:
1394
case nir_op_f2b32:
1395
case nir_op_feq32:
1396
case nir_op_fneu32:
1397
case nir_op_fge32:
1398
case nir_op_flt32:
1399
case nir_op_ieq32:
1400
case nir_op_ine32:
1401
case nir_op_ige32:
1402
case nir_op_uge32:
1403
case nir_op_ilt32:
1404
case nir_op_ult32: {
1405
enum v3d_qpu_cond cond;
1406
ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1407
assert(ok);
1408
result = ntq_emit_cond_to_bool(c, cond);
1409
break;
1410
}
1411
1412
case nir_op_b32csel:
1413
result = vir_MOV(c,
1414
vir_SEL(c,
1415
ntq_emit_bool_to_cond(c, instr->src[0].src),
1416
src[1], src[2]));
1417
break;
1418
1419
case nir_op_fcsel:
1420
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), src[0]),
1421
V3D_QPU_PF_PUSHZ);
1422
result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
1423
src[1], src[2]));
1424
break;
1425
1426
case nir_op_frcp:
1427
result = vir_RECIP(c, src[0]);
1428
break;
1429
case nir_op_frsq:
1430
result = vir_RSQRT(c, src[0]);
1431
break;
1432
case nir_op_fexp2:
1433
result = vir_EXP(c, src[0]);
1434
break;
1435
case nir_op_flog2:
1436
result = vir_LOG(c, src[0]);
1437
break;
1438
1439
case nir_op_fceil:
1440
result = vir_FCEIL(c, src[0]);
1441
break;
1442
case nir_op_ffloor:
1443
result = vir_FFLOOR(c, src[0]);
1444
break;
1445
case nir_op_fround_even:
1446
result = vir_FROUND(c, src[0]);
1447
break;
1448
case nir_op_ftrunc:
1449
result = vir_FTRUNC(c, src[0]);
1450
break;
1451
1452
case nir_op_fsin:
1453
result = ntq_fsincos(c, src[0], false);
1454
break;
1455
case nir_op_fcos:
1456
result = ntq_fsincos(c, src[0], true);
1457
break;
1458
1459
case nir_op_fsign:
1460
result = ntq_fsign(c, src[0]);
1461
break;
1462
1463
case nir_op_fabs: {
1464
result = vir_FMOV(c, src[0]);
1465
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
1466
break;
1467
}
1468
1469
case nir_op_iabs:
1470
result = vir_MAX(c, src[0], vir_NEG(c, src[0]));
1471
break;
1472
1473
case nir_op_fddx:
1474
case nir_op_fddx_coarse:
1475
case nir_op_fddx_fine:
1476
result = vir_FDX(c, src[0]);
1477
break;
1478
1479
case nir_op_fddy:
1480
case nir_op_fddy_coarse:
1481
case nir_op_fddy_fine:
1482
result = vir_FDY(c, src[0]);
1483
break;
1484
1485
case nir_op_uadd_carry:
1486
vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
1487
V3D_QPU_PF_PUSHC);
1488
result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
1489
break;
1490
1491
case nir_op_pack_half_2x16_split:
1492
result = vir_VFPACK(c, src[0], src[1]);
1493
break;
1494
1495
case nir_op_unpack_half_2x16_split_x:
1496
result = vir_FMOV(c, src[0]);
1497
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1498
break;
1499
1500
case nir_op_unpack_half_2x16_split_y:
1501
result = vir_FMOV(c, src[0]);
1502
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H);
1503
break;
1504
1505
case nir_op_fquantize2f16: {
1506
/* F32 -> F16 -> F32 conversion */
1507
struct qreg tmp = vir_FMOV(c, src[0]);
1508
vir_set_pack(c->defs[tmp.index], V3D_QPU_PACK_L);
1509
tmp = vir_FMOV(c, tmp);
1510
vir_set_unpack(c->defs[tmp.index], 0, V3D_QPU_UNPACK_L);
1511
1512
/* Check for denorm */
1513
struct qreg abs_src = vir_FMOV(c, src[0]);
1514
vir_set_unpack(c->defs[abs_src.index], 0, V3D_QPU_UNPACK_ABS);
1515
struct qreg threshold = vir_uniform_f(c, ldexpf(1.0, -14));
1516
vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), abs_src, threshold),
1517
V3D_QPU_PF_PUSHC);
1518
1519
/* Return +/-0 for denorms */
1520
struct qreg zero =
1521
vir_AND(c, src[0], vir_uniform_ui(c, 0x80000000));
1522
result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero));
1523
break;
1524
}
1525
1526
default:
1527
fprintf(stderr, "unknown NIR ALU inst: ");
1528
nir_print_instr(&instr->instr, stderr);
1529
fprintf(stderr, "\n");
1530
abort();
1531
}
1532
1533
/* We have a scalar result, so the instruction should only have a
1534
* single channel written to.
1535
*/
1536
assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
1537
ntq_store_dest(c, &instr->dest.dest,
1538
ffs(instr->dest.write_mask) - 1, result);
1539
}
1540
1541
/* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
1542
* specifier. They come from a register that's preloaded with 0xffffffff
1543
* (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
1544
* 8 bits are shifted off the bottom and 0xff shifted in from the top.
1545
*/
1546
#define TLB_TYPE_F16_COLOR (3 << 6)
1547
#define TLB_TYPE_I32_COLOR (1 << 6)
1548
#define TLB_TYPE_F32_COLOR (0 << 6)
1549
#define TLB_RENDER_TARGET_SHIFT 3 /* Reversed! 7 = RT 0, 0 = RT 7. */
1550
#define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
1551
#define TLB_SAMPLE_MODE_PER_PIXEL (1 << 2)
1552
#define TLB_F16_SWAP_HI_LO (1 << 1)
1553
#define TLB_VEC_SIZE_4_F16 (1 << 0)
1554
#define TLB_VEC_SIZE_2_F16 (0 << 0)
1555
#define TLB_VEC_SIZE_MINUS_1_SHIFT 0
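/* Added note: these bits compose the 0xff default mentioned above. For a
 * plain vec4 f16 write to RT 0, per pixel:
 *   TLB_TYPE_F16_COLOR (3 << 6)        = 0xc0
 *   (7 - 0) << TLB_RENDER_TARGET_SHIFT = 0x38
 *   TLB_SAMPLE_MODE_PER_PIXEL          = 0x04
 *   TLB_F16_SWAP_HI_LO                 = 0x02
 *   TLB_VEC_SIZE_4_F16                 = 0x01
 * which ORs together to exactly 0xff.
 */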
1556
1557
/* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
1558
* flag is set.
1559
*/
1560
#define TLB_TYPE_DEPTH ((2 << 6) | (0 << 4))
1561
#define TLB_DEPTH_TYPE_INVARIANT (0 << 2) /* Unmodified sideband input used */
1562
#define TLB_DEPTH_TYPE_PER_PIXEL (1 << 2) /* QPU result used */
1563
#define TLB_V42_DEPTH_TYPE_INVARIANT (0 << 3) /* Unmodified sideband input used */
1564
#define TLB_V42_DEPTH_TYPE_PER_PIXEL (1 << 3) /* QPU result used */
1565
1566
/* Stencil is a single 32-bit write. */
1567
#define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4))
1568
1569
static void
1570
vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
1571
{
1572
if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
1573
return;
1574
1575
struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
1576
struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1577
1578
nir_variable *var = c->output_color_var[rt];
1579
int num_components = glsl_get_vector_elements(var->type);
1580
uint32_t conf = 0xffffff00;
1581
struct qinst *inst;
1582
1583
conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE :
1584
TLB_SAMPLE_MODE_PER_PIXEL;
1585
conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
1586
1587
if (c->fs_key->swap_color_rb & (1 << rt))
1588
num_components = MAX2(num_components, 3);
1589
assert(num_components != 0);
1590
1591
enum glsl_base_type type = glsl_get_base_type(var->type);
1592
bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT;
1593
bool is_32b_tlb_format = is_int_format ||
1594
(c->fs_key->f32_color_rb & (1 << rt));
1595
1596
if (is_int_format) {
1597
/* The F32 vs I32 distinction was dropped in 4.2. */
1598
if (c->devinfo->ver < 42)
1599
conf |= TLB_TYPE_I32_COLOR;
1600
else
1601
conf |= TLB_TYPE_F32_COLOR;
1602
conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT);
1603
} else {
1604
if (c->fs_key->f32_color_rb & (1 << rt)) {
1605
conf |= TLB_TYPE_F32_COLOR;
1606
conf |= ((num_components - 1) <<
1607
TLB_VEC_SIZE_MINUS_1_SHIFT);
1608
} else {
1609
conf |= TLB_TYPE_F16_COLOR;
1610
conf |= TLB_F16_SWAP_HI_LO;
1611
if (num_components >= 3)
1612
conf |= TLB_VEC_SIZE_4_F16;
1613
else
1614
conf |= TLB_VEC_SIZE_2_F16;
1615
}
1616
}
1617
1618
int num_samples = c->msaa_per_sample_output ? V3D_MAX_SAMPLES : 1;
1619
for (int i = 0; i < num_samples; i++) {
1620
struct qreg *color = c->msaa_per_sample_output ?
1621
&c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] :
1622
&c->outputs[var->data.driver_location * 4];
1623
1624
struct qreg r = color[0];
1625
struct qreg g = color[1];
1626
struct qreg b = color[2];
1627
struct qreg a = color[3];
1628
1629
if (c->fs_key->swap_color_rb & (1 << rt)) {
1630
r = color[2];
1631
b = color[0];
1632
}
1633
1634
if (c->fs_key->sample_alpha_to_one)
1635
a = vir_uniform_f(c, 1.0);
1636
1637
if (is_32b_tlb_format) {
1638
if (i == 0) {
1639
inst = vir_MOV_dest(c, tlbu_reg, r);
1640
inst->uniform =
1641
vir_get_uniform_index(c,
1642
QUNIFORM_CONSTANT,
1643
conf);
1644
} else {
1645
vir_MOV_dest(c, tlb_reg, r);
1646
}
1647
1648
if (num_components >= 2)
1649
vir_MOV_dest(c, tlb_reg, g);
1650
if (num_components >= 3)
1651
vir_MOV_dest(c, tlb_reg, b);
1652
if (num_components >= 4)
1653
vir_MOV_dest(c, tlb_reg, a);
1654
} else {
1655
inst = vir_VFPACK_dest(c, tlb_reg, r, g);
1656
if (conf != ~0 && i == 0) {
1657
inst->dst = tlbu_reg;
1658
inst->uniform =
1659
vir_get_uniform_index(c,
1660
QUNIFORM_CONSTANT,
1661
conf);
1662
}
1663
1664
if (num_components >= 3)
1665
vir_VFPACK_dest(c, tlb_reg, b, a);
1666
}
1667
}
1668
}
1669
1670
static void
1671
emit_frag_end(struct v3d_compile *c)
1672
{
1673
if (c->output_sample_mask_index != -1) {
1674
vir_SETMSF_dest(c, vir_nop_reg(),
1675
vir_AND(c,
1676
vir_MSF(c),
1677
c->outputs[c->output_sample_mask_index]));
1678
}
1679
1680
bool has_any_tlb_color_write = false;
1681
for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
1682
if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt])
1683
has_any_tlb_color_write = true;
1684
}
1685
1686
if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) {
1687
struct nir_variable *var = c->output_color_var[0];
1688
struct qreg *color = &c->outputs[var->data.driver_location * 4];
1689
1690
vir_SETMSF_dest(c, vir_nop_reg(),
1691
vir_AND(c,
1692
vir_MSF(c),
1693
vir_FTOC(c, color[3])));
1694
}
1695
1696
struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1697
if (c->output_position_index != -1) {
1698
struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
1699
c->outputs[c->output_position_index]);
1700
uint8_t tlb_specifier = TLB_TYPE_DEPTH;
1701
1702
if (c->devinfo->ver >= 42) {
1703
tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
1704
TLB_SAMPLE_MODE_PER_PIXEL);
1705
} else
1706
tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL;
1707
1708
inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
1709
tlb_specifier |
1710
0xffffff00);
1711
c->writes_z = true;
1712
} else if (c->s->info.fs.uses_discard ||
1713
!c->s->info.fs.early_fragment_tests ||
1714
c->fs_key->sample_alpha_to_coverage ||
1715
!has_any_tlb_color_write) {
1716
/* Emit passthrough Z if it needed to be delayed until shader
1717
* end due to potential discards.
1718
*
1719
* Since (single-threaded) fragment shaders always need a TLB
1720
* write, emit passthrough Z if we didn't have any color
1721
* buffers and flag us as potentially discarding, so that we
1722
* can use Z as the TLB write.
1723
*/
1724
c->s->info.fs.uses_discard = true;
1725
1726
struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
1727
vir_nop_reg());
1728
uint8_t tlb_specifier = TLB_TYPE_DEPTH;
1729
1730
if (c->devinfo->ver >= 42) {
1731
/* The spec says the PER_PIXEL flag is ignored for
1732
* invariant writes, but the simulator demands it.
1733
*/
1734
tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT |
1735
TLB_SAMPLE_MODE_PER_PIXEL);
1736
} else {
1737
tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT;
1738
}
1739
1740
inst->uniform = vir_get_uniform_index(c,
1741
QUNIFORM_CONSTANT,
1742
tlb_specifier |
1743
0xffffff00);
1744
c->writes_z = true;
1745
}
1746
1747
/* XXX: Performance improvement: Merge Z write and color writes TLB
1748
* uniform setup
1749
*/
1750
for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++)
1751
vir_emit_tlb_color_write(c, rt);
1752
}
1753
1754
static inline void
1755
vir_VPM_WRITE_indirect(struct v3d_compile *c,
1756
struct qreg val,
1757
struct qreg vpm_index,
1758
bool uniform_vpm_index)
1759
{
1760
assert(c->devinfo->ver >= 40);
1761
if (uniform_vpm_index)
1762
vir_STVPMV(c, vpm_index, val);
1763
else
1764
vir_STVPMD(c, vpm_index, val);
1765
}
1766
1767
static void
1768
vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
1769
{
1770
if (c->devinfo->ver >= 40) {
1771
vir_VPM_WRITE_indirect(c, val,
1772
vir_uniform_ui(c, vpm_index), true);
1773
} else {
1774
/* XXX: v3d33_vir_vpm_write_setup(c); */
1775
vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
1776
}
1777
}
1778
1779
static void
1780
emit_vert_end(struct v3d_compile *c)
1781
{
1782
/* GFXH-1684: VPM writes need to be complete by the end of the shader.
1783
*/
1784
if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
1785
vir_VPMWT(c);
1786
}
1787
1788
static void
1789
emit_geom_end(struct v3d_compile *c)
1790
{
1791
/* GFXH-1684: VPM writes need to be complete by the end of the shader.
1792
*/
1793
if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
1794
vir_VPMWT(c);
1795
}
1796
1797
static bool
1798
mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
1799
unsigned bit_size,
1800
unsigned num_components,
1801
nir_intrinsic_instr *low,
1802
nir_intrinsic_instr *high,
1803
void *data)
1804
{
1805
/* Our backend is 32-bit only at present */
1806
if (bit_size != 32)
1807
return false;
1808
1809
if (align_mul % 4 != 0 || align_offset % 4 != 0)
1810
return false;
1811
1812
/* Vector accesses wrap at 16-byte boundaries so we can't vectorize
1813
* if the resulting vector crosses a 16-byte boundary.
1814
*/
1815
assert(util_is_power_of_two_nonzero(align_mul));
1816
align_mul = MIN2(align_mul, 16);
1817
align_offset &= 0xf;
1818
if (16 - align_mul + align_offset + num_components * 4 > 16)
1819
return false;
1820
1821
return true;
1822
}
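/* Added worked example of the boundary check above: for a 4-byte-aligned
 * vec2 (align_mul = 4, align_offset = 0, num_components = 2) the test is
 * 16 - 4 + 0 + 8 = 20 > 16, so vectorization is rejected because the pair
 * could straddle a 16-byte boundary (e.g. at offset 12). With align_mul = 8
 * the same vec2 gives 16 - 8 + 0 + 8 = 16, which is allowed.
 */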
1823
1824
void
1825
v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
1826
{
1827
bool progress;
1828
unsigned lower_flrp =
1829
(s->options->lower_flrp16 ? 16 : 0) |
1830
(s->options->lower_flrp32 ? 32 : 0) |
1831
(s->options->lower_flrp64 ? 64 : 0);
1832
1833
do {
1834
progress = false;
1835
1836
NIR_PASS_V(s, nir_lower_vars_to_ssa);
1837
NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
1838
NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
1839
NIR_PASS(progress, s, nir_copy_prop);
1840
NIR_PASS(progress, s, nir_opt_remove_phis);
1841
NIR_PASS(progress, s, nir_opt_dce);
1842
NIR_PASS(progress, s, nir_opt_dead_cf);
1843
NIR_PASS(progress, s, nir_opt_cse);
1844
NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
1845
NIR_PASS(progress, s, nir_opt_algebraic);
1846
NIR_PASS(progress, s, nir_opt_constant_folding);
1847
1848
nir_load_store_vectorize_options vectorize_opts = {
1849
.modes = nir_var_mem_ssbo | nir_var_mem_ubo |
1850
nir_var_mem_push_const | nir_var_mem_shared |
1851
nir_var_mem_global,
1852
.callback = mem_vectorize_callback,
1853
.robust_modes = 0,
1854
};
1855
NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
1856
1857
if (lower_flrp != 0) {
1858
bool lower_flrp_progress = false;
1859
1860
NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
1861
lower_flrp,
1862
false /* always_precise */);
1863
if (lower_flrp_progress) {
1864
NIR_PASS(progress, s, nir_opt_constant_folding);
1865
progress = true;
1866
}
1867
1868
/* Nothing should rematerialize any flrps, so we only
1869
* need to do this lowering once.
1870
*/
1871
lower_flrp = 0;
1872
}
1873
1874
NIR_PASS(progress, s, nir_opt_undef);
1875
NIR_PASS(progress, s, nir_lower_undef_to_zero);
1876
1877
if (c && !c->disable_loop_unrolling &&
1878
s->options->max_unroll_iterations > 0) {
1879
bool local_progress = false;
1880
NIR_PASS(local_progress, s, nir_opt_loop_unroll,
1881
nir_var_shader_in |
1882
nir_var_function_temp);
1883
c->unrolled_any_loops |= local_progress;
1884
progress |= local_progress;
1885
}
1886
} while (progress);
1887
1888
nir_move_options sink_opts =
1889
nir_move_const_undef | nir_move_comparisons | nir_move_copies |
1890
nir_move_load_ubo;
1891
NIR_PASS(progress, s, nir_opt_sink, sink_opts);
1892
1893
NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo);
1894
}
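/* qsort-style comparator used to visit shader in/out variables in
 * increasing driver_location order, breaking ties on the component
 * (location_frac).
 */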
1895
1896
static int
1897
driver_location_compare(const nir_variable *a, const nir_variable *b)
1898
{
1899
return a->data.driver_location == b->data.driver_location ?
1900
a->data.location_frac - b->data.location_frac :
1901
a->data.driver_location - b->data.driver_location;
1902
}
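/* Reads one 32-bit component of vertex shader input from the VPM. On V3D
 * 4.x this is a plain LDVPMV at an explicit offset; on V3D 3.x reads have
 * to be set up in batches, so we program up to 32 components at a time and
 * hand them out one MOV per call until the queue is drained.
 */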
1903
1904
static struct qreg
1905
ntq_emit_vpm_read(struct v3d_compile *c,
1906
uint32_t *num_components_queued,
1907
uint32_t *remaining,
1908
uint32_t vpm_index)
1909
{
1910
struct qreg vpm = vir_reg(QFILE_VPM, vpm_index);
1911
1912
if (c->devinfo->ver >= 40) {
1913
return vir_LDVPMV_IN(c,
1914
vir_uniform_ui(c,
1915
(*num_components_queued)++));
1916
}
1917
1918
if (*num_components_queued != 0) {
1919
(*num_components_queued)--;
1920
return vir_MOV(c, vpm);
1921
}
1922
1923
uint32_t num_components = MIN2(*remaining, 32);
1924
1925
v3d33_vir_vpm_read_setup(c, num_components);
1926
1927
*num_components_queued = num_components - 1;
1928
*remaining -= num_components;
1929
1930
return vir_MOV(c, vpm);
1931
}
1932
1933
static void
1934
ntq_setup_vs_inputs(struct v3d_compile *c)
1935
{
1936
/* Figure out how many components of each vertex attribute the shader
1937
* uses. Each variable should have been split to individual
1938
* components and unused ones DCEed. The vertex fetcher will load
1939
* from the start of the attribute to the number of components we
1940
* declare we need in c->vattr_sizes[].
1941
*
1942
* BGRA vertex attributes are a bit special: since we implement these
1943
* as RGBA, swapping the R/B components, we always need at least 3 components
1944
* if component 0 is read.
1945
*/
1946
nir_foreach_shader_in_variable(var, c->s) {
1947
/* No VS attribute array support. */
1948
assert(MAX2(glsl_get_length(var->type), 1) == 1);
1949
1950
unsigned loc = var->data.driver_location;
1951
int start_component = var->data.location_frac;
1952
int num_components = glsl_get_components(var->type);
1953
1954
c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc],
1955
start_component + num_components);
1956
1957
/* Handle BGRA inputs */
1958
if (start_component == 0 &&
1959
c->vs_key->va_swap_rb_mask & (1 << var->data.location)) {
1960
c->vattr_sizes[loc] = MAX2(3, c->vattr_sizes[loc]);
1961
}
1962
}
1963
1964
unsigned num_components = 0;
1965
uint32_t vpm_components_queued = 0;
1966
bool uses_iid = BITSET_TEST(c->s->info.system_values_read,
1967
SYSTEM_VALUE_INSTANCE_ID) ||
1968
BITSET_TEST(c->s->info.system_values_read,
1969
SYSTEM_VALUE_INSTANCE_INDEX);
1970
bool uses_biid = BITSET_TEST(c->s->info.system_values_read,
1971
SYSTEM_VALUE_BASE_INSTANCE);
1972
bool uses_vid = BITSET_TEST(c->s->info.system_values_read,
1973
SYSTEM_VALUE_VERTEX_ID) ||
1974
BITSET_TEST(c->s->info.system_values_read,
1975
SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
1976
1977
num_components += uses_iid;
1978
num_components += uses_biid;
1979
num_components += uses_vid;
1980
1981
for (int i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++)
1982
num_components += c->vattr_sizes[i];
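/* Note: the read order below (instance id, base instance, vertex id, then
 * the per-attribute components) is assumed to mirror the index computation
 * in ntq_emit_load_input() for V3D 4.x.
 */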
1983
1984
if (uses_iid) {
1985
c->iid = ntq_emit_vpm_read(c, &vpm_components_queued,
1986
&num_components, ~0);
1987
}
1988
1989
if (uses_biid) {
1990
c->biid = ntq_emit_vpm_read(c, &vpm_components_queued,
1991
&num_components, ~0);
1992
}
1993
1994
if (uses_vid) {
1995
c->vid = ntq_emit_vpm_read(c, &vpm_components_queued,
1996
&num_components, ~0);
1997
}
1998
1999
/* The actual loads will happen directly in nir_intrinsic_load_input
2000
* on newer versions.
2001
*/
2002
if (c->devinfo->ver >= 40)
2003
return;
2004
2005
for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) {
2006
resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2007
(loc + 1) * 4);
2008
2009
for (int i = 0; i < c->vattr_sizes[loc]; i++) {
2010
c->inputs[loc * 4 + i] =
2011
ntq_emit_vpm_read(c,
2012
&vpm_components_queued,
2013
&num_components,
2014
loc * 4 + i);
2015
2016
}
2017
}
2018
2019
if (c->devinfo->ver >= 40) {
2020
assert(vpm_components_queued == num_components);
2021
} else {
2022
assert(vpm_components_queued == 0);
2023
assert(num_components == 0);
2024
}
2025
}
2026
2027
static bool
2028
program_reads_point_coord(struct v3d_compile *c)
2029
{
2030
nir_foreach_shader_in_variable(var, c->s) {
2031
if (util_varying_is_point_coord(var->data.location,
2032
c->fs_key->point_sprite_mask)) {
2033
return true;
2034
}
2035
}
2036
2037
return false;
2038
}
2039
2040
static void
2041
ntq_setup_gs_inputs(struct v3d_compile *c)
2042
{
2043
nir_sort_variables_with_modes(c->s, driver_location_compare,
2044
nir_var_shader_in);
2045
2046
nir_foreach_shader_in_variable(var, c->s) {
2047
/* All GS inputs are arrays with as many entries as vertices
2048
* in the input primitive, but here we only care about the
2049
* per-vertex input type.
2050
*/
2051
assert(glsl_type_is_array(var->type));
2052
const struct glsl_type *type = glsl_get_array_element(var->type);
2053
unsigned array_len = MAX2(glsl_get_length(type), 1);
2054
unsigned loc = var->data.driver_location;
2055
2056
resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2057
(loc + array_len) * 4);
2058
2059
if (var->data.compact) {
2060
for (unsigned j = 0; j < array_len; j++) {
2061
unsigned input_idx = c->num_inputs++;
2062
unsigned loc_frac = var->data.location_frac + j;
2063
unsigned loc = var->data.location + loc_frac / 4;
2064
unsigned comp = loc_frac % 4;
2065
c->input_slots[input_idx] =
2066
v3d_slot_from_slot_and_component(loc, comp);
2067
}
2068
continue;
2069
}
2070
2071
for (unsigned j = 0; j < array_len; j++) {
2072
unsigned num_elements = glsl_get_vector_elements(type);
2073
for (unsigned k = 0; k < num_elements; k++) {
2074
unsigned chan = var->data.location_frac + k;
2075
unsigned input_idx = c->num_inputs++;
2076
struct v3d_varying_slot slot =
2077
v3d_slot_from_slot_and_component(var->data.location + j, chan);
2078
c->input_slots[input_idx] = slot;
2079
}
2080
}
2081
}
2082
}
2083
2084
2085
static void
2086
ntq_setup_fs_inputs(struct v3d_compile *c)
2087
{
2088
nir_sort_variables_with_modes(c->s, driver_location_compare,
2089
nir_var_shader_in);
2090
2091
nir_foreach_shader_in_variable(var, c->s) {
2092
unsigned var_len = glsl_count_vec4_slots(var->type, false, false);
2093
unsigned loc = var->data.driver_location;
2094
2095
uint32_t inputs_array_size = c->inputs_array_size;
2096
uint32_t inputs_array_required_size = (loc + var_len) * 4;
2097
resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2098
inputs_array_required_size);
2099
resize_interp_array(c, &c->interp, &inputs_array_size,
2100
inputs_array_required_size);
2101
2102
if (var->data.location == VARYING_SLOT_POS) {
2103
emit_fragcoord_input(c, loc);
2104
} else if (var->data.location == VARYING_SLOT_PRIMITIVE_ID &&
2105
!c->fs_key->has_gs) {
2106
/* If the fragment shader reads gl_PrimitiveID and we
2107
* don't have a geometry shader in the pipeline to write
2108
* it, then we program the hardware to inject it as
2109
* an implicit varying. Take it from there.
2110
*/
2111
c->inputs[loc * 4] = c->primitive_id;
2112
} else if (util_varying_is_point_coord(var->data.location,
2113
c->fs_key->point_sprite_mask)) {
2114
c->inputs[loc * 4 + 0] = c->point_x;
2115
c->inputs[loc * 4 + 1] = c->point_y;
2116
} else if (var->data.compact) {
2117
for (int j = 0; j < var_len; j++)
2118
emit_compact_fragment_input(c, loc, var, j);
2119
} else if (glsl_type_is_struct(var->type)) {
2120
for (int j = 0; j < var_len; j++) {
2121
emit_fragment_input(c, loc, var, j, 4);
2122
}
2123
} else {
2124
for (int j = 0; j < var_len; j++) {
2125
emit_fragment_input(c, loc, var, j, glsl_get_vector_elements(var->type));
2126
}
2127
}
2128
}
2129
}
2130
2131
static void
2132
ntq_setup_outputs(struct v3d_compile *c)
2133
{
2134
if (c->s->info.stage != MESA_SHADER_FRAGMENT)
2135
return;
2136
2137
nir_foreach_shader_out_variable(var, c->s) {
2138
unsigned array_len = MAX2(glsl_get_length(var->type), 1);
2139
unsigned loc = var->data.driver_location * 4;
2140
2141
assert(array_len == 1);
2142
(void)array_len;
2143
2144
for (int i = 0; i < 4 - var->data.location_frac; i++) {
2145
add_output(c, loc + var->data.location_frac + i,
2146
var->data.location,
2147
var->data.location_frac + i);
2148
}
2149
2150
switch (var->data.location) {
2151
case FRAG_RESULT_COLOR:
2152
c->output_color_var[0] = var;
2153
c->output_color_var[1] = var;
2154
c->output_color_var[2] = var;
2155
c->output_color_var[3] = var;
2156
break;
2157
case FRAG_RESULT_DATA0:
2158
case FRAG_RESULT_DATA1:
2159
case FRAG_RESULT_DATA2:
2160
case FRAG_RESULT_DATA3:
2161
c->output_color_var[var->data.location -
2162
FRAG_RESULT_DATA0] = var;
2163
break;
2164
case FRAG_RESULT_DEPTH:
2165
c->output_position_index = loc;
2166
break;
2167
case FRAG_RESULT_SAMPLE_MASK:
2168
c->output_sample_mask_index = loc;
2169
break;
2170
}
2171
}
2172
}
2173
2174
/**
2175
* Sets up the mapping from nir_register to struct qreg *.
2176
*
2177
* Each nir_register gets a struct qreg per 32-bit component being stored.
2178
*/
2179
static void
2180
ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
2181
{
2182
foreach_list_typed(nir_register, nir_reg, node, list) {
2183
unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
2184
struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
2185
array_len *
2186
nir_reg->num_components);
2187
2188
_mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
2189
2190
for (int i = 0; i < array_len * nir_reg->num_components; i++)
2191
qregs[i] = vir_get_temp(c);
2192
}
2193
}
2194
2195
static void
2196
ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
2197
{
2198
/* XXX perf: Experiment with using immediate loads to avoid having
2199
* these end up in the uniform stream. Watch out for breaking the
2200
* small immediates optimization in the process!
2201
*/
2202
struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
2203
for (int i = 0; i < instr->def.num_components; i++)
2204
qregs[i] = vir_uniform_ui(c, instr->value[i].u32);
2205
2206
_mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
2207
}
2208
2209
static void
2210
ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
2211
{
2212
unsigned image_index = nir_src_as_uint(instr->src[0]);
2213
bool is_array = nir_intrinsic_image_array(instr);
2214
2215
assert(nir_src_as_uint(instr->src[1]) == 0);
2216
2217
ntq_store_dest(c, &instr->dest, 0,
2218
vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));
2219
if (instr->num_components > 1) {
2220
ntq_store_dest(c, &instr->dest, 1,
2221
vir_uniform(c,
2222
instr->num_components == 2 && is_array ?
2223
QUNIFORM_IMAGE_ARRAY_SIZE :
2224
QUNIFORM_IMAGE_HEIGHT,
2225
image_index));
2226
}
2227
if (instr->num_components > 2) {
2228
ntq_store_dest(c, &instr->dest, 2,
2229
vir_uniform(c,
2230
is_array ?
2231
QUNIFORM_IMAGE_ARRAY_SIZE :
2232
QUNIFORM_IMAGE_DEPTH,
2233
image_index));
2234
}
2235
}
2236
2237
static void
2238
vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
2239
{
2240
assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
2241
2242
int rt = nir_src_as_uint(instr->src[0]);
2243
assert(rt < V3D_MAX_DRAW_BUFFERS);
2244
2245
int sample_index = nir_intrinsic_base(instr);
2246
assert(sample_index < V3D_MAX_SAMPLES);
2247
2248
int component = nir_intrinsic_component(instr);
2249
assert(component < 4);
2250
2251
/* We need to emit our TLB reads after we have acquired the scoreboard
2252
* lock, or the GPU will hang. Usually, we do our scoreboard locking on
2253
* the last thread switch to improve parallelism; however, that is only
2254
* guaranteed to happen before the tlb color writes.
2255
*
2256
* To fix that, we make sure we always emit a thread switch before the
2257
* first tlb color read. If that happens to be the last thread switch
2258
* we emit, then everything is fine, but otherwise, if any code after
2259
* this point needs to emit additional thread switches, then we will
2260
* switch the strategy to locking the scoreboard on the first thread
2261
* switch instead -- see vir_emit_thrsw().
2262
*/
2263
if (!c->emitted_tlb_load) {
2264
if (!c->last_thrsw_at_top_level) {
2265
assert(c->devinfo->ver >= 41);
2266
vir_emit_thrsw(c);
2267
}
2268
2269
c->emitted_tlb_load = true;
2270
}
2271
2272
struct qreg *color_reads_for_sample =
2273
&c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
2274
2275
if (color_reads_for_sample[component].file == QFILE_NULL) {
2276
enum pipe_format rt_format = c->fs_key->color_fmt[rt].format;
2277
int num_components =
2278
util_format_get_nr_components(rt_format);
2279
2280
const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt);
2281
if (swap_rb)
2282
num_components = MAX2(num_components, 3);
2283
2284
nir_variable *var = c->output_color_var[rt];
2285
enum glsl_base_type type = glsl_get_base_type(var->type);
2286
2287
bool is_int_format = type == GLSL_TYPE_INT ||
2288
type == GLSL_TYPE_UINT;
2289
2290
bool is_32b_tlb_format = is_int_format ||
2291
(c->fs_key->f32_color_rb & (1 << rt));
2292
2293
int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1;
2294
2295
uint32_t conf = 0xffffff00;
2296
conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE :
2297
TLB_SAMPLE_MODE_PER_PIXEL;
2298
conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
2299
2300
if (is_32b_tlb_format) {
2301
/* The F32 vs I32 distinction was dropped in 4.2. */
2302
conf |= (c->devinfo->ver < 42 && is_int_format) ?
2303
TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR;
2304
2305
conf |= ((num_components - 1) <<
2306
TLB_VEC_SIZE_MINUS_1_SHIFT);
2307
} else {
2308
conf |= TLB_TYPE_F16_COLOR;
2309
conf |= TLB_F16_SWAP_HI_LO;
2310
2311
if (num_components >= 3)
2312
conf |= TLB_VEC_SIZE_4_F16;
2313
else
2314
conf |= TLB_VEC_SIZE_2_F16;
2315
}
2316
2317
2318
for (int i = 0; i < num_samples; i++) {
2319
struct qreg r, g, b, a;
2320
if (is_32b_tlb_format) {
2321
r = conf != 0xffffffff && i == 0 ?
2322
vir_TLBU_COLOR_READ(c, conf) :
2323
vir_TLB_COLOR_READ(c);
2324
if (num_components >= 2)
2325
g = vir_TLB_COLOR_READ(c);
2326
if (num_components >= 3)
2327
b = vir_TLB_COLOR_READ(c);
2328
if (num_components >= 4)
2329
a = vir_TLB_COLOR_READ(c);
2330
} else {
2331
struct qreg rg = conf != 0xffffffff && i == 0 ?
2332
vir_TLBU_COLOR_READ(c, conf) :
2333
vir_TLB_COLOR_READ(c);
2334
r = vir_FMOV(c, rg);
2335
vir_set_unpack(c->defs[r.index], 0,
2336
V3D_QPU_UNPACK_L);
2337
g = vir_FMOV(c, rg);
2338
vir_set_unpack(c->defs[g.index], 0,
2339
V3D_QPU_UNPACK_H);
2340
2341
if (num_components > 2) {
2342
struct qreg ba = vir_TLB_COLOR_READ(c);
2343
b = vir_FMOV(c, ba);
2344
vir_set_unpack(c->defs[b.index], 0,
2345
V3D_QPU_UNPACK_L);
2346
a = vir_FMOV(c, ba);
2347
vir_set_unpack(c->defs[a.index], 0,
2348
V3D_QPU_UNPACK_H);
2349
}
2350
}
2351
2352
struct qreg *color_reads =
2353
&c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4];
2354
2355
color_reads[0] = swap_rb ? b : r;
2356
if (num_components >= 2)
2357
color_reads[1] = g;
2358
if (num_components >= 3)
2359
color_reads[2] = swap_rb ? r : b;
2360
if (num_components >= 4)
2361
color_reads[3] = a;
2362
}
2363
}
2364
2365
assert(color_reads_for_sample[component].file != QFILE_NULL);
2366
ntq_store_dest(c, &instr->dest, 0,
2367
vir_MOV(c, color_reads_for_sample[component]));
2368
}
2369
2370
static void
2371
ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
2372
{
2373
if (nir_src_is_const(instr->src[0])) {
2374
int offset = (nir_intrinsic_base(instr) +
2375
nir_src_as_uint(instr->src[0]));
2376
assert(offset % 4 == 0);
2377
/* We need dwords */
2378
offset = offset / 4;
2379
for (int i = 0; i < instr->num_components; i++) {
2380
ntq_store_dest(c, &instr->dest, i,
2381
vir_uniform(c, QUNIFORM_UNIFORM,
2382
offset + i));
2383
}
2384
} else {
2385
ntq_emit_tmu_general(c, instr, false);
2386
}
2387
}
2388
2389
static void
2390
ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
2391
{
2392
/* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset).
2393
*
2394
* Right now the driver sets PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR even
2395
* if we don't support non-uniform offsets because we also set the
2396
* lower_all_io_to_temps option in the NIR compiler. This ensures that
2397
* any indirect indexing on in/out variables is turned into indirect
2398
* indexing on temporary variables instead, that we handle by lowering
2399
* to scratch. If we implement non-uniform offset here we might be able
2400
* to avoid the temp and scratch lowering, which involves copying from
2401
* the input to the temp variable, possibly making code more optimal.
2402
*/
2403
unsigned offset =
2404
nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);
2405
2406
if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) {
2407
/* Emit the LDVPM directly now, rather than at the top
2408
* of the shader like we did for V3D 3.x (which needs
2409
* vpmsetup when not just taking the next offset).
2410
*
2411
* Note that delaying like this may introduce stalls,
2412
* as LDVPMV takes a minimum of 1 instruction but may
2413
* be slower if the VPM unit is busy with another QPU.
2414
*/
2415
int index = 0;
2416
if (BITSET_TEST(c->s->info.system_values_read,
2417
SYSTEM_VALUE_INSTANCE_ID)) {
2418
index++;
2419
}
2420
if (BITSET_TEST(c->s->info.system_values_read,
2421
SYSTEM_VALUE_BASE_INSTANCE)) {
2422
index++;
2423
}
2424
if (BITSET_TEST(c->s->info.system_values_read,
2425
SYSTEM_VALUE_VERTEX_ID)) {
2426
index++;
2427
}
2428
for (int i = 0; i < offset; i++)
2429
index += c->vattr_sizes[i];
2430
index += nir_intrinsic_component(instr);
2431
for (int i = 0; i < instr->num_components; i++) {
2432
struct qreg vpm_offset = vir_uniform_ui(c, index++);
2433
ntq_store_dest(c, &instr->dest, i,
2434
vir_LDVPMV_IN(c, vpm_offset));
2435
}
2436
} else {
2437
for (int i = 0; i < instr->num_components; i++) {
2438
int comp = nir_intrinsic_component(instr) + i;
2439
ntq_store_dest(c, &instr->dest, i,
2440
vir_MOV(c, c->inputs[offset * 4 + comp]));
2441
}
2442
}
2443
}
2444
2445
static void
2446
ntq_emit_per_sample_color_write(struct v3d_compile *c,
2447
nir_intrinsic_instr *instr)
2448
{
2449
assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d);
2450
2451
unsigned rt = nir_src_as_uint(instr->src[1]);
2452
assert(rt < V3D_MAX_DRAW_BUFFERS);
2453
2454
unsigned sample_idx = nir_intrinsic_base(instr);
2455
assert(sample_idx < V3D_MAX_SAMPLES);
2456
2457
unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4;
2458
for (int i = 0; i < instr->num_components; i++) {
2459
c->sample_colors[offset + i] =
2460
vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2461
}
2462
}
2463
2464
static void
2465
ntq_emit_color_write(struct v3d_compile *c,
2466
nir_intrinsic_instr *instr)
2467
{
2468
unsigned offset = (nir_intrinsic_base(instr) +
2469
nir_src_as_uint(instr->src[1])) * 4 +
2470
nir_intrinsic_component(instr);
2471
for (int i = 0; i < instr->num_components; i++) {
2472
c->outputs[offset + i] =
2473
vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2474
}
2475
}
2476
2477
static void
2478
emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2479
{
2480
assert(instr->num_components == 1);
2481
2482
struct qreg offset = ntq_get_src(c, instr->src[1], 0);
2483
2484
uint32_t base_offset = nir_intrinsic_base(instr);
2485
2486
if (base_offset)
2487
offset = vir_ADD(c, vir_uniform_ui(c, base_offset), offset);
2488
2489
/* Usually, for VS or FS, we only emit outputs once at program end so
2490
* our VPM writes are never in non-uniform control flow, but this
2491
* is not true for GS, where we are emitting multiple vertices.
2492
*/
2493
if (vir_in_nonuniform_control_flow(c)) {
2494
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
2495
V3D_QPU_PF_PUSHZ);
2496
}
2497
2498
struct qreg val = ntq_get_src(c, instr->src[0], 0);
2499
2500
/* The offset isn’t necessarily dynamically uniform for a geometry
2501
* shader. This can happen if the shader sometimes doesn’t emit one of
2502
* the vertices. In that case subsequent vertices will be written to
2503
* different offsets in the VPM and we need to use the scatter write
2504
* instruction to have a different offset for each lane.
2505
*/
2506
bool is_uniform_offset =
2507
!vir_in_nonuniform_control_flow(c) &&
2508
!nir_src_is_divergent(instr->src[1]);
2509
vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2510
2511
if (vir_in_nonuniform_control_flow(c)) {
2512
struct qinst *last_inst =
2513
(struct qinst *)c->cur_block->instructions.prev;
2514
vir_set_cond(last_inst, V3D_QPU_COND_IFA);
2515
}
2516
}
2517
2518
static void
2519
emit_store_output_vs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2520
{
2521
assert(c->s->info.stage == MESA_SHADER_VERTEX);
2522
assert(instr->num_components == 1);
2523
2524
uint32_t base = nir_intrinsic_base(instr);
2525
struct qreg val = ntq_get_src(c, instr->src[0], 0);
2526
2527
if (nir_src_is_const(instr->src[1])) {
2528
vir_VPM_WRITE(c, val,
2529
base + nir_src_as_uint(instr->src[1]));
2530
} else {
2531
struct qreg offset = vir_ADD(c,
2532
ntq_get_src(c, instr->src[1], 1),
2533
vir_uniform_ui(c, base));
2534
bool is_uniform_offset =
2535
!vir_in_nonuniform_control_flow(c) &&
2536
!nir_src_is_divergent(instr->src[1]);
2537
vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2538
}
2539
}
2540
2541
static void
2542
ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
2543
{
2544
if (c->s->info.stage == MESA_SHADER_FRAGMENT)
2545
ntq_emit_color_write(c, instr);
2546
else if (c->s->info.stage == MESA_SHADER_GEOMETRY)
2547
emit_store_output_gs(c, instr);
2548
else
2549
emit_store_output_vs(c, instr);
2550
}
2551
2552
/**
2553
* This implementation is based on v3d_sample_{x,y}_offset() from
2554
* v3d_sample_offset.h.
2555
*/
2556
static void
2557
ntq_get_sample_offset(struct v3d_compile *c, struct qreg sample_idx,
2558
struct qreg *sx, struct qreg *sy)
2559
{
2560
sample_idx = vir_ITOF(c, sample_idx);
2561
2562
struct qreg offset_x =
2563
vir_FADD(c, vir_uniform_f(c, -0.125f),
2564
vir_FMUL(c, sample_idx,
2565
vir_uniform_f(c, 0.5f)));
2566
vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(),
2567
vir_uniform_f(c, 2.0f), sample_idx),
2568
V3D_QPU_PF_PUSHC);
2569
offset_x = vir_SEL(c, V3D_QPU_COND_IFA,
2570
vir_FSUB(c, offset_x, vir_uniform_f(c, 1.25f)),
2571
offset_x);
2572
2573
struct qreg offset_y =
2574
vir_FADD(c, vir_uniform_f(c, -0.375f),
2575
vir_FMUL(c, sample_idx,
2576
vir_uniform_f(c, 0.25f)));
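/* Illustrative note: the formulas above are offset_x = -0.125 + 0.5 * i
 * (wrapped back by 1.25 once the index reaches 2) and offset_y = -0.375 +
 * 0.25 * i, which for samples 0..3 works out to offsets of roughly
 * (-0.125, -0.375), (0.375, -0.125), (-0.375, 0.125) and (0.125, 0.375)
 * from the pixel center.
 */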
2577
*sx = offset_x;
2578
*sy = offset_y;
2579
}
2580
2581
/**
2582
* This implementation is based on get_centroid_offset() from fep.c.
2583
*/
2584
static void
2585
ntq_get_barycentric_centroid(struct v3d_compile *c,
2586
struct qreg *out_x,
2587
struct qreg *out_y)
2588
{
2589
struct qreg sample_mask;
2590
if (c->output_sample_mask_index != -1)
2591
sample_mask = c->outputs[c->output_sample_mask_index];
2592
else
2593
sample_mask = vir_MSF(c);
2594
2595
struct qreg i0 = vir_uniform_ui(c, 0);
2596
struct qreg i1 = vir_uniform_ui(c, 1);
2597
struct qreg i2 = vir_uniform_ui(c, 2);
2598
struct qreg i3 = vir_uniform_ui(c, 3);
2599
struct qreg i4 = vir_uniform_ui(c, 4);
2600
struct qreg i8 = vir_uniform_ui(c, 8);
2601
2602
/* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */
2603
struct qreg F = vir_uniform_ui(c, 0);
2604
struct qreg T = vir_uniform_ui(c, ~0);
2605
struct qreg s0 = vir_XOR(c, vir_AND(c, sample_mask, i1), i1);
2606
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
2607
s0 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2608
struct qreg s1 = vir_XOR(c, vir_AND(c, sample_mask, i2), i2);
2609
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
2610
s1 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2611
struct qreg s2 = vir_XOR(c, vir_AND(c, sample_mask, i4), i4);
2612
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
2613
s2 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2614
struct qreg s3 = vir_XOR(c, vir_AND(c, sample_mask, i8), i8);
2615
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ);
2616
s3 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2617
2618
/* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */
2619
struct qreg sample_idx = i3;
2620
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
2621
sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i1, sample_idx);
2622
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
2623
sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i2, sample_idx);
2624
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
2625
sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i0, sample_idx);
2626
2627
/* Get offset at selected sample index */
2628
struct qreg offset_x, offset_y;
2629
ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
2630
2631
/* Select pixel center [offset=(0,0)] if two opposing samples (or none)
2632
* are selected.
2633
*/
2634
struct qreg s0_and_s3 = vir_AND(c, s0, s3);
2635
struct qreg s1_and_s2 = vir_AND(c, s1, s2);
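/* Illustrative example: with a sample_mask of 0x6 (samples 1 and 2
 * covered), s1 and s2 are TRUE, so sample 2 is selected above; but because
 * s1/s2 form an opposing pair, the code below falls back to the pixel
 * center.
 */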
2636
2637
struct qreg use_center = vir_XOR(c, sample_mask, vir_uniform_ui(c, 0));
2638
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
2639
use_center = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2640
use_center = vir_OR(c, use_center, s0_and_s3);
2641
use_center = vir_OR(c, use_center, s1_and_s2);
2642
2643
struct qreg zero = vir_uniform_f(c, 0.0f);
2644
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
2645
offset_x = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_x);
2646
offset_y = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_y);
2647
2648
*out_x = offset_x;
2649
*out_y = offset_y;
2650
}
2651
2652
static struct qreg
2653
ntq_emit_load_interpolated_input(struct v3d_compile *c,
2654
struct qreg p,
2655
struct qreg C,
2656
struct qreg offset_x,
2657
struct qreg offset_y,
2658
unsigned mode)
2659
{
2660
if (mode == INTERP_MODE_FLAT)
2661
return C;
2662
2663
struct qreg sample_offset_x =
2664
vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
2665
struct qreg sample_offset_y =
2666
vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
2667
2668
struct qreg scaleX =
2669
vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_x),
2670
offset_x);
2671
struct qreg scaleY =
2672
vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_y),
2673
offset_y);
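/* scaleX/scaleY are the displacements from the position where the input
 * was originally evaluated (the current sample position within the pixel)
 * to the requested position (pixel center plus the caller-supplied
 * offset); the derivatives below re-evaluate the varying at that point.
 */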
2674
2675
struct qreg pInterp =
2676
vir_FADD(c, p, vir_FADD(c, vir_FMUL(c, vir_FDX(c, p), scaleX),
2677
vir_FMUL(c, vir_FDY(c, p), scaleY)));
2678
2679
if (mode == INTERP_MODE_NOPERSPECTIVE)
2680
return vir_FADD(c, pInterp, C);
2681
2682
struct qreg w = c->payload_w;
2683
struct qreg wInterp =
2684
vir_FADD(c, w, vir_FADD(c, vir_FMUL(c, vir_FDX(c, w), scaleX),
2685
vir_FMUL(c, vir_FDY(c, w), scaleY)));
2686
2687
return vir_FADD(c, vir_FMUL(c, pInterp, wInterp), C);
2688
}
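/* Emits a single ldunifa, which reads 32 bits from the address currently
 * held in the unifa register and then auto-increments that address by 4
 * bytes. Passing a NULL result turns this into a dummy read used only to
 * advance the address.
 */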
2689
2690
static void
2691
emit_ldunifa(struct v3d_compile *c, struct qreg *result)
2692
{
2693
struct qinst *ldunifa =
2694
vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef);
2695
ldunifa->qpu.sig.ldunifa = true;
2696
if (result)
2697
*result = vir_emit_def(c, ldunifa);
2698
else
2699
vir_emit_nondef(c, ldunifa);
2700
c->current_unifa_offset += 4;
2701
}
2702
2703
static void
2704
ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
2705
{
2706
/* Every ldunifa auto-increments the unifa address by 4 bytes, so our
2707
* current unifa offset is 4 bytes ahead of the offset of the last load.
2708
*/
2709
static const int32_t max_unifa_skip_dist =
2710
MAX_UNIFA_SKIP_DISTANCE - 4;
2711
2712
bool dynamic_src = !nir_src_is_const(instr->src[1]);
2713
uint32_t const_offset =
2714
dynamic_src ? 0 : nir_src_as_uint(instr->src[1]);
2715
2716
/* On OpenGL, QUNIFORM_UBO_ADDR takes a UBO index
2717
* shifted up by 1 (0 is gallium's constant buffer 0).
2718
*/
2719
uint32_t index = nir_src_as_uint(instr->src[0]);
2720
if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
2721
index++;
2722
2723
/* We can only keep track of the last unifa address we used with
2724
* constant offset loads. If the new load targets the same UBO and
2725
* is close enough to the previous load, we can skip the unifa register
2726
* write by emitting dummy ldunifa instructions to update the unifa
2727
* address.
2728
*/
2729
bool skip_unifa = false;
2730
uint32_t ldunifa_skips = 0;
2731
if (dynamic_src) {
2732
c->current_unifa_block = NULL;
2733
} else if (c->cur_block == c->current_unifa_block &&
2734
c->current_unifa_index == index &&
2735
c->current_unifa_offset <= const_offset &&
2736
c->current_unifa_offset + max_unifa_skip_dist >= const_offset) {
2737
skip_unifa = true;
2738
ldunifa_skips = (const_offset - c->current_unifa_offset) / 4;
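/* Illustrative example: if the previous constant-offset load from this
 * UBO left the unifa address at offset 16 and the new load starts at
 * offset 24, ldunifa_skips is 2 and two dummy ldunifa reads advance the
 * address without writing the unifa register again.
 */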
2739
} else {
2740
c->current_unifa_block = c->cur_block;
2741
c->current_unifa_index = index;
2742
c->current_unifa_offset = const_offset;
2743
}
2744
2745
if (!skip_unifa) {
2746
struct qreg base_offset =
2747
vir_uniform(c, QUNIFORM_UBO_ADDR,
2748
v3d_unit_data_create(index, const_offset));
2749
2750
struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);
2751
if (!dynamic_src) {
2752
vir_MOV_dest(c, unifa, base_offset);
2753
} else {
2754
vir_ADD_dest(c, unifa, base_offset,
2755
ntq_get_src(c, instr->src[1], 0));
2756
}
2757
} else {
2758
for (int i = 0; i < ldunifa_skips; i++)
2759
emit_ldunifa(c, NULL);
2760
}
2761
2762
for (uint32_t i = 0; i < nir_intrinsic_dest_components(instr); i++) {
2763
struct qreg data;
2764
emit_ldunifa(c, &data);
2765
ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data));
2766
}
2767
}
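/* The local invocation index is packed into the top bits of cs_payload[1];
 * c->local_invocation_index_bits says how many bits were reserved for it,
 * so shifting right by (32 - bits) recovers the value. For example, with 8
 * reserved bits an index of 5 arrives as 0x05000000 and shifts down to 5.
 */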
2768
2769
static inline struct qreg
2770
emit_load_local_invocation_index(struct v3d_compile *c)
2771
{
2772
return vir_SHR(c, c->cs_payload[1],
2773
vir_uniform_ui(c, 32 - c->local_invocation_index_bits));
2774
}
2775
2776
/* Various subgroup operations rely on the A flags, so this helper ensures that
2777
* the A flags represent the currently active lanes in the subgroup.
2778
*/
2779
static void
2780
set_a_flags_for_subgroup(struct v3d_compile *c)
2781
{
2782
/* MSF returns 0 for disabled lanes in compute shaders so
2783
* PUSHZ will set A=1 for disabled lanes. We want the inverse
2784
* of this, but since we don't have any means to negate the A flags
2785
* directly, we do it by repeating the same operation
2786
* with NORZ (A = ~A & ~Z).
2787
*/
2788
assert(c->s->info.stage == MESA_SHADER_COMPUTE);
2789
vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
2790
vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_UF_NORZ);
2791
2792
/* If we are under non-uniform control flow we also need to
2793
* AND the A flags with the current execute mask.
2794
*/
2795
if (vir_in_nonuniform_control_flow(c)) {
2796
const uint32_t bidx = c->cur_block->index;
2797
vir_set_uf(c, vir_XOR_dest(c, vir_nop_reg(),
2798
c->execute,
2799
vir_uniform_ui(c, bidx)),
2800
V3D_QPU_UF_ANDZ);
2801
}
2802
}
2803
2804
static void
2805
ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
2806
{
2807
switch (instr->intrinsic) {
2808
case nir_intrinsic_load_uniform:
2809
ntq_emit_load_uniform(c, instr);
2810
break;
2811
2812
case nir_intrinsic_load_ubo:
2813
if (!nir_src_is_divergent(instr->src[1]))
2814
ntq_emit_load_ubo_unifa(c, instr);
2815
else
2816
ntq_emit_tmu_general(c, instr, false);
2817
break;
2818
2819
case nir_intrinsic_ssbo_atomic_add:
2820
case nir_intrinsic_ssbo_atomic_imin:
2821
case nir_intrinsic_ssbo_atomic_umin:
2822
case nir_intrinsic_ssbo_atomic_imax:
2823
case nir_intrinsic_ssbo_atomic_umax:
2824
case nir_intrinsic_ssbo_atomic_and:
2825
case nir_intrinsic_ssbo_atomic_or:
2826
case nir_intrinsic_ssbo_atomic_xor:
2827
case nir_intrinsic_ssbo_atomic_exchange:
2828
case nir_intrinsic_ssbo_atomic_comp_swap:
2829
case nir_intrinsic_load_ssbo:
2830
case nir_intrinsic_store_ssbo:
2831
ntq_emit_tmu_general(c, instr, false);
2832
break;
2833
2834
case nir_intrinsic_shared_atomic_add:
2835
case nir_intrinsic_shared_atomic_imin:
2836
case nir_intrinsic_shared_atomic_umin:
2837
case nir_intrinsic_shared_atomic_imax:
2838
case nir_intrinsic_shared_atomic_umax:
2839
case nir_intrinsic_shared_atomic_and:
2840
case nir_intrinsic_shared_atomic_or:
2841
case nir_intrinsic_shared_atomic_xor:
2842
case nir_intrinsic_shared_atomic_exchange:
2843
case nir_intrinsic_shared_atomic_comp_swap:
2844
case nir_intrinsic_load_shared:
2845
case nir_intrinsic_store_shared:
2846
case nir_intrinsic_load_scratch:
2847
case nir_intrinsic_store_scratch:
2848
ntq_emit_tmu_general(c, instr, true);
2849
break;
2850
2851
case nir_intrinsic_image_load:
2852
case nir_intrinsic_image_store:
2853
case nir_intrinsic_image_atomic_add:
2854
case nir_intrinsic_image_atomic_imin:
2855
case nir_intrinsic_image_atomic_umin:
2856
case nir_intrinsic_image_atomic_imax:
2857
case nir_intrinsic_image_atomic_umax:
2858
case nir_intrinsic_image_atomic_and:
2859
case nir_intrinsic_image_atomic_or:
2860
case nir_intrinsic_image_atomic_xor:
2861
case nir_intrinsic_image_atomic_exchange:
2862
case nir_intrinsic_image_atomic_comp_swap:
2863
v3d40_vir_emit_image_load_store(c, instr);
2864
break;
2865
2866
case nir_intrinsic_get_ssbo_size:
2867
ntq_store_dest(c, &instr->dest, 0,
2868
vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,
2869
nir_src_comp_as_uint(instr->src[0], 0)));
2870
break;
2871
2872
case nir_intrinsic_get_ubo_size:
2873
ntq_store_dest(c, &instr->dest, 0,
2874
vir_uniform(c, QUNIFORM_GET_UBO_SIZE,
2875
nir_src_comp_as_uint(instr->src[0], 0)));
2876
break;
2877
2878
case nir_intrinsic_load_user_clip_plane:
2879
for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
2880
ntq_store_dest(c, &instr->dest, i,
2881
vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
2882
nir_intrinsic_ucp_id(instr) *
2883
4 + i));
2884
}
2885
break;
2886
2887
case nir_intrinsic_load_viewport_x_scale:
2888
ntq_store_dest(c, &instr->dest, 0,
2889
vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0));
2890
break;
2891
2892
case nir_intrinsic_load_viewport_y_scale:
2893
ntq_store_dest(c, &instr->dest, 0,
2894
vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));
2895
break;
2896
2897
case nir_intrinsic_load_viewport_z_scale:
2898
ntq_store_dest(c, &instr->dest, 0,
2899
vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));
2900
break;
2901
2902
case nir_intrinsic_load_viewport_z_offset:
2903
ntq_store_dest(c, &instr->dest, 0,
2904
vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));
2905
break;
2906
2907
case nir_intrinsic_load_line_coord:
2908
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x));
2909
break;
2910
2911
case nir_intrinsic_load_line_width:
2912
ntq_store_dest(c, &instr->dest, 0,
2913
vir_uniform(c, QUNIFORM_LINE_WIDTH, 0));
2914
break;
2915
2916
case nir_intrinsic_load_aa_line_width:
2917
ntq_store_dest(c, &instr->dest, 0,
2918
vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0));
2919
break;
2920
2921
case nir_intrinsic_load_sample_mask_in:
2922
ntq_store_dest(c, &instr->dest, 0, vir_MSF(c));
2923
break;
2924
2925
case nir_intrinsic_load_helper_invocation:
2926
vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
2927
struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
2928
ntq_store_dest(c, &instr->dest, 0, qdest);
2929
break;
2930
2931
case nir_intrinsic_load_front_face:
2932
/* The register contains 0 (front) or 1 (back), and we need to
2933
* turn it into a NIR bool where true means front.
2934
*/
2935
ntq_store_dest(c, &instr->dest, 0,
2936
vir_ADD(c,
2937
vir_uniform_ui(c, -1),
2938
vir_REVF(c)));
2939
break;
2940
2941
case nir_intrinsic_load_base_instance:
2942
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->biid));
2943
break;
2944
2945
case nir_intrinsic_load_instance_id:
2946
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid));
2947
break;
2948
2949
case nir_intrinsic_load_vertex_id:
2950
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
2951
break;
2952
2953
case nir_intrinsic_load_tlb_color_v3d:
2954
vir_emit_tlb_color_read(c, instr);
2955
break;
2956
2957
case nir_intrinsic_load_input:
2958
ntq_emit_load_input(c, instr);
2959
break;
2960
2961
case nir_intrinsic_store_tlb_sample_color_v3d:
2962
ntq_emit_per_sample_color_write(c, instr);
2963
break;
2964
2965
case nir_intrinsic_store_output:
2966
ntq_emit_store_output(c, instr);
2967
break;
2968
2969
case nir_intrinsic_image_size:
2970
ntq_emit_image_size(c, instr);
2971
break;
2972
2973
case nir_intrinsic_discard:
2974
ntq_flush_tmu(c);
2975
2976
if (vir_in_nonuniform_control_flow(c)) {
2977
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
2978
V3D_QPU_PF_PUSHZ);
2979
vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
2980
vir_uniform_ui(c, 0)),
2981
V3D_QPU_COND_IFA);
2982
} else {
2983
vir_SETMSF_dest(c, vir_nop_reg(),
2984
vir_uniform_ui(c, 0));
2985
}
2986
break;
2987
2988
case nir_intrinsic_discard_if: {
2989
ntq_flush_tmu(c);
2990
2991
enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);
2992
2993
if (vir_in_nonuniform_control_flow(c)) {
2994
struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(),
2995
c->execute);
2996
if (cond == V3D_QPU_COND_IFA) {
2997
vir_set_uf(c, exec_flag, V3D_QPU_UF_ANDZ);
2998
} else {
2999
vir_set_uf(c, exec_flag, V3D_QPU_UF_NORNZ);
3000
cond = V3D_QPU_COND_IFA;
3001
}
3002
}
3003
3004
vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
3005
vir_uniform_ui(c, 0)), cond);
3006
3007
break;
3008
}
3009
3010
case nir_intrinsic_memory_barrier:
3011
case nir_intrinsic_memory_barrier_buffer:
3012
case nir_intrinsic_memory_barrier_image:
3013
case nir_intrinsic_memory_barrier_shared:
3014
case nir_intrinsic_memory_barrier_tcs_patch:
3015
case nir_intrinsic_group_memory_barrier:
3016
/* We don't do any instruction scheduling of these NIR
3017
* instructions between each other, so we just need to make
3018
* sure that the TMU operations before the barrier are flushed
3019
* before the ones after the barrier.
3020
*/
3021
ntq_flush_tmu(c);
3022
break;
3023
3024
case nir_intrinsic_control_barrier:
3025
/* Emit a TSY op to get all invocations in the workgroup
3026
* (actually supergroup) to block until the last invocation
3027
* reaches the TSY op.
3028
*/
3029
ntq_flush_tmu(c);
3030
3031
if (c->devinfo->ver >= 42) {
3032
vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
3033
V3D_QPU_WADDR_SYNCB));
3034
} else {
3035
struct qinst *sync =
3036
vir_BARRIERID_dest(c,
3037
vir_reg(QFILE_MAGIC,
3038
V3D_QPU_WADDR_SYNCU));
3039
sync->uniform =
3040
vir_get_uniform_index(c, QUNIFORM_CONSTANT,
3041
0xffffff00 |
3042
V3D_TSY_WAIT_INC_CHECK);
3043
3044
}
3045
3046
/* The blocking of a TSY op only happens at the next thread
3047
* switch. No texturing may be outstanding at the time of a
3048
* TSY blocking operation.
3049
*/
3050
vir_emit_thrsw(c);
3051
break;
3052
3053
case nir_intrinsic_load_num_workgroups:
3054
for (int i = 0; i < 3; i++) {
3055
ntq_store_dest(c, &instr->dest, i,
3056
vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,
3057
i));
3058
}
3059
break;
3060
3061
case nir_intrinsic_load_workgroup_id: {
3062
struct qreg x = vir_AND(c, c->cs_payload[0],
3063
vir_uniform_ui(c, 0xffff));
3064
3065
struct qreg y = vir_SHR(c, c->cs_payload[0],
3066
vir_uniform_ui(c, 16));
3067
3068
struct qreg z = vir_AND(c, c->cs_payload[1],
3069
vir_uniform_ui(c, 0xffff));
3070
3071
/* We only support dispatch base in Vulkan */
3072
if (c->key->environment == V3D_ENVIRONMENT_VULKAN) {
3073
x = vir_ADD(c, x,
3074
vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0));
3075
y = vir_ADD(c, y,
3076
vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1));
3077
z = vir_ADD(c, z,
3078
vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2));
3079
}
3080
3081
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, x));
3082
ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, y));
3083
ntq_store_dest(c, &instr->dest, 2, vir_MOV(c, z));
3084
break;
3085
}
3086
3087
case nir_intrinsic_load_local_invocation_index:
3088
ntq_store_dest(c, &instr->dest, 0,
3089
emit_load_local_invocation_index(c));
3090
break;
3091
3092
case nir_intrinsic_load_subgroup_id: {
3093
/* This is basically the batch index, which is the Local
3094
* Invocation Index divided by the SIMD width.
3095
*/
3096
STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS));
3097
const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1;
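/* E.g. with V3D_CHANNELS == 16 lanes, divide_shift is 4 and the subgroup
 * id is the local invocation index >> 4.
 */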
3098
struct qreg lii = emit_load_local_invocation_index(c);
3099
ntq_store_dest(c, &instr->dest, 0,
3100
vir_SHR(c, lii,
3101
vir_uniform_ui(c, divide_shift)));
3102
break;
3103
}
3104
3105
case nir_intrinsic_load_per_vertex_input: {
3106
/* The vertex shader writes all its used outputs into
3107
* consecutive VPM offsets, so if any output component is
3108
* unused, its VPM offset is used by the next used
3109
* component. This means that we can't assume that each
3110
* location will use 4 consecutive scalar offsets in the VPM
3111
* and we need to compute the VPM offset for each input by
3112
* going through the inputs and finding the one that matches
3113
* our location and component.
3114
*
3115
* col = vertex index, row = varying index
3116
*/
3117
assert(nir_src_is_const(instr->src[1]));
3118
uint32_t location =
3119
nir_intrinsic_io_semantics(instr).location +
3120
nir_src_as_uint(instr->src[1]);
3121
uint32_t component = nir_intrinsic_component(instr);
3122
3123
int32_t row_idx = -1;
3124
for (int i = 0; i < c->num_inputs; i++) {
3125
struct v3d_varying_slot slot = c->input_slots[i];
3126
if (v3d_slot_get_slot(slot) == location &&
3127
v3d_slot_get_component(slot) == component) {
3128
row_idx = i;
3129
break;
3130
}
3131
}
3132
3133
assert(row_idx != -1);
3134
3135
struct qreg col = ntq_get_src(c, instr->src[0], 0);
3136
for (int i = 0; i < instr->num_components; i++) {
3137
struct qreg row = vir_uniform_ui(c, row_idx++);
3138
ntq_store_dest(c, &instr->dest, i,
3139
vir_LDVPMG_IN(c, row, col));
3140
}
3141
break;
3142
}
3143
3144
case nir_intrinsic_emit_vertex:
3145
case nir_intrinsic_end_primitive:
3146
unreachable("Should have been lowered in v3d_nir_lower_io");
3147
break;
3148
3149
case nir_intrinsic_load_primitive_id: {
3150
/* gl_PrimitiveIdIn is written by the GBG in the first word of
3151
* the VPM output header. According to the docs, we should read this
3152
* using ldvpm(v,d)_in (See Table 71).
3153
*/
3154
assert(c->s->info.stage == MESA_SHADER_GEOMETRY);
3155
ntq_store_dest(c, &instr->dest, 0,
3156
vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
3157
break;
3158
}
3159
3160
case nir_intrinsic_load_invocation_id:
3161
ntq_store_dest(c, &instr->dest, 0, vir_IID(c));
3162
break;
3163
3164
case nir_intrinsic_load_fb_layers_v3d:
3165
ntq_store_dest(c, &instr->dest, 0,
3166
vir_uniform(c, QUNIFORM_FB_LAYERS, 0));
3167
break;
3168
3169
case nir_intrinsic_load_sample_id:
3170
ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c));
3171
break;
3172
3173
case nir_intrinsic_load_sample_pos:
3174
ntq_store_dest(c, &instr->dest, 0,
3175
vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))));
3176
ntq_store_dest(c, &instr->dest, 1,
3177
vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))));
3178
break;
3179
3180
case nir_intrinsic_load_barycentric_at_offset:
3181
ntq_store_dest(c, &instr->dest, 0,
3182
vir_MOV(c, ntq_get_src(c, instr->src[0], 0)));
3183
ntq_store_dest(c, &instr->dest, 1,
3184
vir_MOV(c, ntq_get_src(c, instr->src[0], 1)));
3185
break;
3186
3187
case nir_intrinsic_load_barycentric_pixel:
3188
ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f));
3189
ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f));
3190
break;
3191
3192
case nir_intrinsic_load_barycentric_at_sample: {
3193
if (!c->fs_key->msaa) {
3194
ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f));
3195
ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f));
3196
return;
3197
}
3198
3199
struct qreg offset_x, offset_y;
3200
struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0);
3201
ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
3202
3203
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x));
3204
ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y));
3205
break;
3206
}
3207
3208
case nir_intrinsic_load_barycentric_sample: {
3209
struct qreg offset_x =
3210
vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
3211
struct qreg offset_y =
3212
vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
3213
3214
ntq_store_dest(c, &instr->dest, 0,
3215
vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f)));
3216
ntq_store_dest(c, &instr->dest, 1,
3217
vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f)));
3218
break;
3219
}
3220
3221
case nir_intrinsic_load_barycentric_centroid: {
3222
struct qreg offset_x, offset_y;
3223
ntq_get_barycentric_centroid(c, &offset_x, &offset_y);
3224
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x));
3225
ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y));
3226
break;
3227
}
3228
3229
case nir_intrinsic_load_interpolated_input: {
3230
assert(nir_src_is_const(instr->src[1]));
3231
const uint32_t offset = nir_src_as_uint(instr->src[1]);
3232
3233
for (int i = 0; i < instr->num_components; i++) {
3234
const uint32_t input_idx =
3235
(nir_intrinsic_base(instr) + offset) * 4 +
3236
nir_intrinsic_component(instr) + i;
3237
3238
/* If we are not in MSAA or if we are not interpolating
3239
* a user varying, just return the pre-computed
3240
* interpolated input.
3241
*/
3242
if (!c->fs_key->msaa ||
3243
c->interp[input_idx].vp.file == QFILE_NULL) {
3244
ntq_store_dest(c, &instr->dest, i,
3245
vir_MOV(c, c->inputs[input_idx]));
3246
continue;
3247
}
3248
3249
/* Otherwise compute interpolation at the specified
3250
* offset.
3251
*/
3252
struct qreg p = c->interp[input_idx].vp;
3253
struct qreg C = c->interp[input_idx].C;
3254
unsigned interp_mode = c->interp[input_idx].mode;
3255
3256
struct qreg offset_x = ntq_get_src(c, instr->src[0], 0);
3257
struct qreg offset_y = ntq_get_src(c, instr->src[0], 1);
3258
3259
struct qreg result =
3260
ntq_emit_load_interpolated_input(c, p, C,
3261
offset_x, offset_y,
3262
interp_mode);
3263
ntq_store_dest(c, &instr->dest, i, result);
3264
}
3265
break;
3266
}
3267
3268
case nir_intrinsic_load_subgroup_size:
3269
ntq_store_dest(c, &instr->dest, 0,
3270
vir_uniform_ui(c, V3D_CHANNELS));
3271
break;
3272
3273
case nir_intrinsic_load_subgroup_invocation:
3274
ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
3275
break;
3276
3277
case nir_intrinsic_elect: {
3278
set_a_flags_for_subgroup(c);
3279
struct qreg first = vir_FLAFIRST(c);
3280
3281
/* Produce a boolean result from Flafirst */
3282
vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
3283
first, vir_uniform_ui(c, 1)),
3284
V3D_QPU_PF_PUSHZ);
3285
struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
3286
ntq_store_dest(c, &instr->dest, 0, result);
3287
break;
3288
}
3289
3290
case nir_intrinsic_load_num_subgroups:
3291
unreachable("Should have been lowered");
3292
break;
3293
3294
default:
3295
fprintf(stderr, "Unknown intrinsic: ");
3296
nir_print_instr(&instr->instr, stderr);
3297
fprintf(stderr, "\n");
3298
break;
3299
}
3300
}
3301
3302
/* Clears (activates) the execute flags for any channels whose jump target
3303
* matches this block.
3304
*
3305
* XXX perf: Could we be using flpush/flpop somehow for our execution channel
3306
* enabling?
3307
*
3308
*/
3309
static void
3310
ntq_activate_execute_for_block(struct v3d_compile *c)
3311
{
3312
vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
3313
c->execute, vir_uniform_ui(c, c->cur_block->index)),
3314
V3D_QPU_PF_PUSHZ);
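/* c->execute holds 0 for channels that are currently running and the index
 * of the block a channel is waiting to resume at otherwise, so the XOR
 * above sets A for exactly the channels whose target is this block and the
 * conditional MOV below re-activates them.
 */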
3315
3316
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
3317
}
3318
3319
static void
3320
ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
3321
{
3322
nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
3323
bool empty_else_block =
3324
(nir_else_block == nir_if_last_else_block(if_stmt) &&
3325
exec_list_is_empty(&nir_else_block->instr_list));
3326
3327
struct qblock *then_block = vir_new_block(c);
3328
struct qblock *after_block = vir_new_block(c);
3329
struct qblock *else_block;
3330
if (empty_else_block)
3331
else_block = after_block;
3332
else
3333
else_block = vir_new_block(c);
3334
3335
/* Check if this if statement is really just a conditional jump with
3336
* the form:
3337
*
3338
* if (cond) {
3339
* break/continue;
3340
* } else {
3341
* }
3342
*
3343
* In which case we can skip the jump to ELSE we emit before the THEN
3344
* block and instead just emit the break/continue directly.
3345
*/
3346
nir_jump_instr *conditional_jump = NULL;
3347
if (empty_else_block) {
3348
nir_block *nir_then_block = nir_if_first_then_block(if_stmt);
3349
struct nir_instr *inst = nir_block_first_instr(nir_then_block);
3350
if (inst && inst->type == nir_instr_type_jump)
3351
conditional_jump = nir_instr_as_jump(inst);
3352
}
3353
3354
/* Set up the flags for the IF condition (taking the THEN branch). */
3355
enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
3356
3357
if (!conditional_jump) {
3358
/* Jump to ELSE. */
3359
struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
3360
V3D_QPU_BRANCH_COND_ANYNA :
3361
V3D_QPU_BRANCH_COND_ANYA);
3362
/* Pixels that were not dispatched or have been discarded
3363
* should not contribute to the ANYA/ANYNA condition.
3364
*/
3365
branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
3366
3367
vir_link_blocks(c->cur_block, else_block);
3368
vir_link_blocks(c->cur_block, then_block);
3369
3370
/* Process the THEN block. */
3371
vir_set_emit_block(c, then_block);
3372
ntq_emit_cf_list(c, &if_stmt->then_list);
3373
3374
if (!empty_else_block) {
3375
/* At the end of the THEN block, jump to ENDIF, unless
3376
* the block ended in a break or continue.
3377
*/
3378
if (!c->cur_block->branch_emitted) {
3379
vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
3380
vir_link_blocks(c->cur_block, after_block);
3381
}
3382
3383
/* Emit the else block. */
3384
vir_set_emit_block(c, else_block);
3385
ntq_emit_cf_list(c, &if_stmt->else_list);
3386
}
3387
} else {
3388
/* Emit the conditional jump directly.
3389
*
3390
* Use ALL with breaks and ANY with continues to ensure that
3391
* we always break and never continue when all lanes have been
3392
* disabled (for example because of discards) to prevent
3393
* infinite loops.
3394
*/
3395
assert(conditional_jump &&
3396
(conditional_jump->type == nir_jump_continue ||
3397
conditional_jump->type == nir_jump_break));
3398
3399
struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
3400
(conditional_jump->type == nir_jump_break ?
3401
V3D_QPU_BRANCH_COND_ALLA :
3402
V3D_QPU_BRANCH_COND_ANYA) :
3403
(conditional_jump->type == nir_jump_break ?
3404
V3D_QPU_BRANCH_COND_ALLNA :
3405
V3D_QPU_BRANCH_COND_ANYNA));
3406
branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
3407
3408
vir_link_blocks(c->cur_block,
3409
conditional_jump->type == nir_jump_break ?
3410
c->loop_break_block :
3411
c->loop_cont_block);
3412
}
3413
3414
vir_link_blocks(c->cur_block, after_block);
3415
3416
vir_set_emit_block(c, after_block);
3417
}
3418
3419
static void
3420
ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
3421
{
3422
nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
3423
bool empty_else_block =
3424
(nir_else_block == nir_if_last_else_block(if_stmt) &&
3425
exec_list_is_empty(&nir_else_block->instr_list));
3426
3427
struct qblock *then_block = vir_new_block(c);
3428
struct qblock *after_block = vir_new_block(c);
3429
struct qblock *else_block;
3430
if (empty_else_block)
3431
else_block = after_block;
3432
else
3433
else_block = vir_new_block(c);
3434
3435
bool was_uniform_control_flow = false;
3436
if (!vir_in_nonuniform_control_flow(c)) {
3437
c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
3438
was_uniform_control_flow = true;
3439
}
3440
3441
/* Set up the flags for the IF condition (taking the THEN branch). */
3442
enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
3443
3444
/* Update the flags+cond to mean "Taking the ELSE branch (!cond) and
3445
* was previously active (execute Z)" for updating the exec flags.
3446
*/
3447
if (was_uniform_control_flow) {
3448
cond = v3d_qpu_cond_invert(cond);
3449
} else {
3450
struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute);
3451
if (cond == V3D_QPU_COND_IFA) {
3452
vir_set_uf(c, inst, V3D_QPU_UF_NORNZ);
3453
} else {
3454
vir_set_uf(c, inst, V3D_QPU_UF_ANDZ);
3455
cond = V3D_QPU_COND_IFA;
3456
}
3457
}
3458
3459
vir_MOV_cond(c, cond,
3460
c->execute,
3461
vir_uniform_ui(c, else_block->index));
3462
3463
/* Jump to ELSE if nothing is active for THEN, otherwise fall
3464
* through.
3465
*/
3466
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
3467
vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
3468
vir_link_blocks(c->cur_block, else_block);
3469
vir_link_blocks(c->cur_block, then_block);
3470
3471
/* Process the THEN block. */
3472
vir_set_emit_block(c, then_block);
3473
ntq_emit_cf_list(c, &if_stmt->then_list);
3474
3475
if (!empty_else_block) {
3476
/* Handle the end of the THEN block. First, all currently
3477
* active channels update their execute flags to point to
3478
* ENDIF
3479
*/
3480
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3481
V3D_QPU_PF_PUSHZ);
3482
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
3483
vir_uniform_ui(c, after_block->index));
3484
3485
/* If everything points at ENDIF, then jump there immediately. */
3486
vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
3487
c->execute,
3488
vir_uniform_ui(c, after_block->index)),
3489
V3D_QPU_PF_PUSHZ);
3490
vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
3491
vir_link_blocks(c->cur_block, after_block);
3492
vir_link_blocks(c->cur_block, else_block);
3493
3494
vir_set_emit_block(c, else_block);
3495
ntq_activate_execute_for_block(c);
3496
ntq_emit_cf_list(c, &if_stmt->else_list);
3497
}
3498
3499
vir_link_blocks(c->cur_block, after_block);
3500
3501
vir_set_emit_block(c, after_block);
3502
if (was_uniform_control_flow)
3503
c->execute = c->undef;
3504
else
3505
ntq_activate_execute_for_block(c);
3506
}
3507
3508
static void
3509
ntq_emit_if(struct v3d_compile *c, nir_if *nif)
3510
{
3511
bool was_in_control_flow = c->in_control_flow;
3512
c->in_control_flow = true;
3513
if (!vir_in_nonuniform_control_flow(c) &&
3514
!nir_src_is_divergent(nif->condition)) {
3515
ntq_emit_uniform_if(c, nif);
3516
} else {
3517
ntq_emit_nonuniform_if(c, nif);
3518
}
3519
c->in_control_flow = was_in_control_flow;
3520
}
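/* Handles break/continue under non-uniform control flow: rather than
 * branching, each still-active channel records its jump target block in
 * c->execute and gets re-activated when execution reaches that block (see
 * ntq_activate_execute_for_block()).
 */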
3521
3522
static void
3523
ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
3524
{
3525
switch (jump->type) {
3526
case nir_jump_break:
3527
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3528
V3D_QPU_PF_PUSHZ);
3529
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
3530
vir_uniform_ui(c, c->loop_break_block->index));
3531
break;
3532
3533
case nir_jump_continue:
3534
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3535
V3D_QPU_PF_PUSHZ);
3536
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
3537
vir_uniform_ui(c, c->loop_cont_block->index));
3538
break;
3539
3540
case nir_jump_return:
3541
unreachable("All returns should be lowered\n");
3542
break;
3543
3544
case nir_jump_halt:
3545
case nir_jump_goto:
3546
case nir_jump_goto_if:
3547
unreachable("not supported\n");
3548
break;
3549
}
3550
}
3551
3552
static void
ntq_emit_uniform_jump(struct v3d_compile *c, nir_jump_instr *jump)
{
        switch (jump->type) {
        case nir_jump_break:
                vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
                vir_link_blocks(c->cur_block, c->loop_break_block);
                c->cur_block->branch_emitted = true;
                break;
        case nir_jump_continue:
                vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
                vir_link_blocks(c->cur_block, c->loop_cont_block);
                c->cur_block->branch_emitted = true;
                break;

        case nir_jump_return:
                unreachable("All returns should be lowered\n");
                break;

        case nir_jump_halt:
        case nir_jump_goto:
        case nir_jump_goto_if:
                unreachable("not supported\n");
                break;
        }
}

static void
ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
{
        switch (instr->type) {
        case nir_instr_type_alu:
                ntq_emit_alu(c, nir_instr_as_alu(instr));
                break;

        case nir_instr_type_intrinsic:
                ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
                break;

        case nir_instr_type_load_const:
                ntq_emit_load_const(c, nir_instr_as_load_const(instr));
                break;

        case nir_instr_type_ssa_undef:
                unreachable("Should've been lowered by nir_lower_undef_to_zero");
                break;

        case nir_instr_type_tex:
                ntq_emit_tex(c, nir_instr_as_tex(instr));
                break;

        case nir_instr_type_jump:
                /* Always flush TMU before jumping to another block, for the
                 * same reasons as in ntq_emit_block.
                 */
                ntq_flush_tmu(c);
                if (vir_in_nonuniform_control_flow(c))
                        ntq_emit_jump(c, nir_instr_as_jump(instr));
                else
                        ntq_emit_uniform_jump(c, nir_instr_as_jump(instr));
                break;

        default:
                fprintf(stderr, "Unknown NIR instr type: ");
                nir_print_instr(instr, stderr);
                fprintf(stderr, "\n");
                abort();
        }
}

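/* Emits every instruction in a NIR block, then flushes any outstanding TMU
 * operations so they complete within the block that issued them.
 */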
static void
ntq_emit_block(struct v3d_compile *c, nir_block *block)
{
        nir_foreach_instr(instr, block) {
                ntq_emit_instr(c, instr);
        }

        /* Always process pending TMU operations in the same block they were
         * emitted: we can't emit TMU operations in a block and then emit a
         * thread switch and LDTMU/TMUWT for them in another block, possibly
         * under control flow.
         */
        ntq_flush_tmu(c);
}

static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);

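/* Emits a loop whose iteration count may diverge between channels: each
 * channel tracks its target block in c->execute, and the back-edge branch is
 * taken while any channel still points at the continue block.
 */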
static void
ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop)
{
        bool was_uniform_control_flow = false;
        if (!vir_in_nonuniform_control_flow(c)) {
                c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
                was_uniform_control_flow = true;
        }

        c->loop_cont_block = vir_new_block(c);
        c->loop_break_block = vir_new_block(c);

        vir_link_blocks(c->cur_block, c->loop_cont_block);
        vir_set_emit_block(c, c->loop_cont_block);
        ntq_activate_execute_for_block(c);

        ntq_emit_cf_list(c, &loop->body);

        /* Re-enable any previous continues now, so our ANYA check below
         * works.
         *
         * XXX: Use the .ORZ flags update, instead.
         */
        vir_set_pf(c, vir_XOR_dest(c,
                                   vir_nop_reg(),
                                   c->execute,
                                   vir_uniform_ui(c, c->loop_cont_block->index)),
                   V3D_QPU_PF_PUSHZ);
        vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));

        vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);

        struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
        /* Pixels that were not dispatched or have been discarded should not
         * contribute to looping again.
         */
        branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
        vir_link_blocks(c->cur_block, c->loop_cont_block);
        vir_link_blocks(c->cur_block, c->loop_break_block);

        vir_set_emit_block(c, c->loop_break_block);
        if (was_uniform_control_flow)
                c->execute = c->undef;
        else
                ntq_activate_execute_for_block(c);
}

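/* Emits a loop with a non-divergent body: no execute masking is needed, and
 * the back-edge is just an unconditional branch to the continue block.
 */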
static void
ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop)
{
        c->loop_cont_block = vir_new_block(c);
        c->loop_break_block = vir_new_block(c);

        vir_link_blocks(c->cur_block, c->loop_cont_block);
        vir_set_emit_block(c, c->loop_cont_block);

        ntq_emit_cf_list(c, &loop->body);

        if (!c->cur_block->branch_emitted) {
                vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
                vir_link_blocks(c->cur_block, c->loop_cont_block);
        }

        vir_set_emit_block(c, c->loop_break_block);
}

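/* Emits a NIR loop, saving and restoring the enclosing loop's continue/break
 * blocks and picking the uniform or non-uniform lowering depending on
 * divergence.
 */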
static void
ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
{
        bool was_in_control_flow = c->in_control_flow;
        c->in_control_flow = true;

        struct qblock *save_loop_cont_block = c->loop_cont_block;
        struct qblock *save_loop_break_block = c->loop_break_block;

        if (vir_in_nonuniform_control_flow(c) || loop->divergent) {
                ntq_emit_nonuniform_loop(c, loop);
        } else {
                ntq_emit_uniform_loop(c, loop);
        }

        c->loop_break_block = save_loop_break_block;
        c->loop_cont_block = save_loop_cont_block;

        c->loops++;

        c->in_control_flow = was_in_control_flow;
}

static void
ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)
{
        fprintf(stderr, "FUNCTIONS not handled.\n");
        abort();
}

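/* Walks a NIR control flow list and emits each node (block, if, loop or
 * function) in order.
 */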
static void
ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
{
        foreach_list_typed(nir_cf_node, node, node, list) {
                switch (node->type) {
                case nir_cf_node_block:
                        ntq_emit_block(c, nir_cf_node_as_block(node));
                        break;

                case nir_cf_node_if:
                        ntq_emit_if(c, nir_cf_node_as_if(node));
                        break;

                case nir_cf_node_loop:
                        ntq_emit_loop(c, nir_cf_node_as_loop(node));
                        break;

                case nir_cf_node_function:
                        ntq_emit_function(c, nir_cf_node_as_function(node));
                        break;

                default:
                        fprintf(stderr, "Unknown NIR node type\n");
                        abort();
                }
        }
}

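/* Emits a NIR function implementation: sets up its registers, then emits its
 * body.
 */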
static void
ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
{
        ntq_setup_registers(c, &impl->registers);
        ntq_emit_cf_list(c, &impl->body);
}

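/* Per-stage setup (fragment payload registers and implicit varyings, compute
 * payload decoding, scratch/spill base), followed by input/output setup and
 * emission of the main function's body.
 */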
static void
nir_to_vir(struct v3d_compile *c)
{
        switch (c->s->info.stage) {
        case MESA_SHADER_FRAGMENT:
                c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
                c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
                c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));

                /* V3D 4.x can disable implicit varyings if they are not used */
                c->fs_uses_primitive_id =
                        nir_find_variable_with_location(c->s, nir_var_shader_in,
                                                        VARYING_SLOT_PRIMITIVE_ID);
                if (c->fs_uses_primitive_id && !c->fs_key->has_gs) {
                        c->primitive_id =
                                emit_fragment_varying(c, NULL, -1, 0, 0);
                }

                if (c->fs_key->is_points &&
                    (c->devinfo->ver < 40 || program_reads_point_coord(c))) {
                        c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0);
                        c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0);
                        c->uses_implicit_point_line_varyings = true;
                } else if (c->fs_key->is_lines &&
                           (c->devinfo->ver < 40 ||
                            BITSET_TEST(c->s->info.system_values_read,
                                        SYSTEM_VALUE_LINE_COORD))) {
                        c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0);
                        c->uses_implicit_point_line_varyings = true;
                }

                c->force_per_sample_msaa =
                        c->s->info.fs.uses_sample_qualifier ||
                        BITSET_TEST(c->s->info.system_values_read,
                                    SYSTEM_VALUE_SAMPLE_ID) ||
                        BITSET_TEST(c->s->info.system_values_read,
                                    SYSTEM_VALUE_SAMPLE_POS);
                break;
        case MESA_SHADER_COMPUTE:
                /* Set up the TSO for barriers, assuming we do some. */
                if (c->devinfo->ver < 42) {
                        vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
                                                      V3D_QPU_WADDR_SYNC));
                }

                c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
                c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));

                /* Set up the division between gl_LocalInvocationIndex and
                 * wg_in_mem in the payload reg.
                 */
                int wg_size = (c->s->info.workgroup_size[0] *
                               c->s->info.workgroup_size[1] *
                               c->s->info.workgroup_size[2]);
                c->local_invocation_index_bits =
                        ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1;
                assert(c->local_invocation_index_bits <= 8);

                if (c->s->info.shared_size) {
                        struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1],
                                                        vir_uniform_ui(c, 16));
                        if (c->s->info.workgroup_size[0] != 1 ||
                            c->s->info.workgroup_size[1] != 1 ||
                            c->s->info.workgroup_size[2] != 1) {
                                int wg_bits = (16 -
                                               c->local_invocation_index_bits);
                                int wg_mask = (1 << wg_bits) - 1;
                                wg_in_mem = vir_AND(c, wg_in_mem,
                                                    vir_uniform_ui(c, wg_mask));
                        }
                        struct qreg shared_per_wg =
                                vir_uniform_ui(c, c->s->info.shared_size);

                        c->cs_shared_offset =
                                vir_ADD(c,
                                        vir_uniform(c, QUNIFORM_SHARED_OFFSET, 0),
                                        vir_UMUL(c, wg_in_mem, shared_per_wg));
                }
                break;
        default:
                break;
        }

        if (c->s->scratch_size) {
                v3d_setup_spill_base(c);
                c->spill_size += V3D_CHANNELS * c->s->scratch_size;
        }

        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                ntq_setup_vs_inputs(c);
                break;
        case MESA_SHADER_GEOMETRY:
                ntq_setup_gs_inputs(c);
                break;
        case MESA_SHADER_FRAGMENT:
                ntq_setup_fs_inputs(c);
                break;
        case MESA_SHADER_COMPUTE:
                break;
        default:
                unreachable("unsupported shader stage");
        }

        ntq_setup_outputs(c);

        /* Find the main function and emit the body. */
        nir_foreach_function(function, c->s) {
                assert(strcmp(function->name, "main") == 0);
                assert(function->impl);
                ntq_emit_impl(c, function->impl);
        }
}

/**
 * When demoting a shader down to single-threaded, removes the THRSW
 * instructions (one will still be inserted at v3d_vir_to_qpu() for the
 * program end).
 */
static void
vir_remove_thrsw(struct v3d_compile *c)
{
        vir_for_each_block(block, c) {
                vir_for_each_inst_safe(inst, block) {
                        if (inst->qpu.sig.thrsw)
                                vir_remove_instruction(c, inst);
                }
        }

        c->last_thrsw = NULL;
}

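/* Ensures the program ends with a usable "last THRSW": on pre-4.1 hardware it
 * may instead demote the shader to single-threaded, and otherwise it emits or
 * marks the THRSW that will be paired up at QPU emit time.
 */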
void
vir_emit_last_thrsw(struct v3d_compile *c)
{
        /* On V3D before 4.1, we need a TMU op to be outstanding when thread
         * switching, so disable threads if we didn't do any TMU ops (each of
         * which would have emitted a THRSW).
         */
        if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {
                c->threads = 1;
                if (c->last_thrsw)
                        vir_remove_thrsw(c);
                return;
        }

        /* If we're threaded and the last THRSW was in conditional code, then
         * we need to emit another one so that we can flag it as the last
         * thrsw.
         */
        if (c->last_thrsw && !c->last_thrsw_at_top_level) {
                assert(c->devinfo->ver >= 41);
                vir_emit_thrsw(c);
        }

        /* If we're threaded, then we need to mark the last THRSW instruction
         * so we can emit a pair of them at QPU emit time.
         *
         * For V3D 4.x, we can spawn the non-fragment shaders already in the
         * post-last-THRSW state, so we can skip this.
         */
        if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {
                assert(c->devinfo->ver >= 41);
                vir_emit_thrsw(c);
        }

        if (c->last_thrsw)
                c->last_thrsw->is_last_thrsw = true;
}

/* There's a flag in the shader for "center W is needed for reasons other than
 * non-centroid varyings", so we just walk the program after VIR optimization
 * to see if it's used.  It should be harmless to set even if we only use
 * center W for varyings.
 */
static void
vir_check_payload_w(struct v3d_compile *c)
{
        if (c->s->info.stage != MESA_SHADER_FRAGMENT)
                return;

        vir_for_each_inst_inorder(inst, c) {
                for (int i = 0; i < vir_get_nsrc(inst); i++) {
                        if (inst->src[i].file == QFILE_REG &&
                            inst->src[i].index == 0) {
                                c->uses_center_w = true;
                                return;
                        }
                }
        }
}

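/* Top-level NIR-to-VIR entry point: emits VIR for the shader, runs VIR
 * optimization, then register-allocates, halving the thread count on failure
 * until allocation succeeds or the minimum thread count is reached, and
 * finally hands the result to v3d_vir_to_qpu().
 */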
void
v3d_nir_to_vir(struct v3d_compile *c)
{
        if (V3D_DEBUG & (V3D_DEBUG_NIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d NIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                nir_print_shader(c->s, stderr);
        }

        nir_to_vir(c);

        /* Emit the last THRSW before STVPM and TLB writes. */
        vir_emit_last_thrsw(c);

        switch (c->s->info.stage) {
        case MESA_SHADER_FRAGMENT:
                emit_frag_end(c);
                break;
        case MESA_SHADER_GEOMETRY:
                emit_geom_end(c);
                break;
        case MESA_SHADER_VERTEX:
                emit_vert_end(c);
                break;
        case MESA_SHADER_COMPUTE:
                break;
        default:
                unreachable("bad stage");
        }

        if (V3D_DEBUG & (V3D_DEBUG_VIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        vir_optimize(c);

        vir_check_payload_w(c);

        /* XXX perf: On VC4, we do a VIR-level instruction scheduling here.
         * We used that on that platform to pipeline TMU writes and reduce the
         * number of thread switches, as well as try (mostly successfully) to
         * reduce maximum register pressure to allow more threads.  We should
         * do something of that sort for V3D -- either instruction scheduling
         * here, or delay the THRSW and LDTMUs from our texture instructions
         * until the results are needed.
         */

        if (V3D_DEBUG & (V3D_DEBUG_VIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        /* Attempt to allocate registers for the temporaries.  If we fail,
         * reduce thread count and try again.
         */
        int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
        struct qpu_reg *temp_registers;
        while (true) {
                bool spilled;
                temp_registers = v3d_register_allocate(c, &spilled);
                if (spilled)
                        continue;

                if (temp_registers)
                        break;

                if (c->threads == min_threads &&
                    (V3D_DEBUG & V3D_DEBUG_RA)) {
                        fprintf(stderr,
                                "Failed to register allocate using %s\n",
                                c->fallback_scheduler ?
                                "the fallback scheduler" :
                                "the normal scheduler");

                        vir_dump(c);

                        char *shaderdb;
                        int ret = v3d_shaderdb_dump(c, &shaderdb);
                        if (ret > 0) {
                                fprintf(stderr, "%s\n", shaderdb);
                                free(shaderdb);
                        }
                }

                if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) {
                        if (V3D_DEBUG & V3D_DEBUG_PERF) {
                                fprintf(stderr,
                                        "Failed to register allocate %s at "
                                        "%d threads.\n", vir_get_stage_name(c),
                                        c->threads);
                        }
                        c->compilation_result =
                                V3D_COMPILATION_FAILED_REGISTER_ALLOCATION;
                        return;
                }

                c->spill_count = 0;
                c->threads /= 2;

                if (c->threads == 1)
                        vir_remove_thrsw(c);
        }

        if (c->spills &&
            (V3D_DEBUG & (V3D_DEBUG_VIR |
                          v3d_debug_flag_for_shader_stage(c->s->info.stage)))) {
                fprintf(stderr, "%s prog %d/%d spilled VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        v3d_vir_to_qpu(c, temp_registers);
}