Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/vc4/vc4_program.c
4570 views
1
/*
2
* Copyright (c) 2014 Scott Mansell
3
* Copyright © 2014 Broadcom
4
*
5
* Permission is hereby granted, free of charge, to any person obtaining a
6
* copy of this software and associated documentation files (the "Software"),
7
* to deal in the Software without restriction, including without limitation
8
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
* and/or sell copies of the Software, and to permit persons to whom the
10
* Software is furnished to do so, subject to the following conditions:
11
*
12
* The above copyright notice and this permission notice (including the next
13
* paragraph) shall be included in all copies or substantial portions of the
14
* Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22
* IN THE SOFTWARE.
23
*/
24
25
#include <inttypes.h>
26
#include "util/format/u_format.h"
27
#include "util/crc32.h"
28
#include "util/u_helpers.h"
29
#include "util/u_math.h"
30
#include "util/u_memory.h"
31
#include "util/ralloc.h"
32
#include "util/hash_table.h"
33
#include "tgsi/tgsi_dump.h"
34
#include "tgsi/tgsi_parse.h"
35
#include "compiler/nir/nir.h"
36
#include "compiler/nir/nir_builder.h"
37
#include "compiler/nir_types.h"
38
#include "nir/tgsi_to_nir.h"
39
#include "vc4_context.h"
40
#include "vc4_qpu.h"
41
#include "vc4_qir.h"
42
43
static struct qreg
44
ntq_get_src(struct vc4_compile *c, nir_src src, int i);
45
static void
46
ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list);
47
48
static int
49
type_size(const struct glsl_type *type, bool bindless)
50
{
51
return glsl_count_attribute_slots(type, false);
52
}
53
54
static void
55
resize_qreg_array(struct vc4_compile *c,
56
struct qreg **regs,
57
uint32_t *size,
58
uint32_t decl_size)
59
{
60
if (*size >= decl_size)
61
return;
62
63
uint32_t old_size = *size;
64
*size = MAX2(*size * 2, decl_size);
65
*regs = reralloc(c, *regs, struct qreg, *size);
66
if (!*regs) {
67
fprintf(stderr, "Malloc failure\n");
68
abort();
69
}
70
71
for (uint32_t i = old_size; i < *size; i++)
72
(*regs)[i] = c->undef;
73
}
74
75
static void
76
ntq_emit_thrsw(struct vc4_compile *c)
77
{
78
if (!c->fs_threaded)
79
return;
80
81
/* Always thread switch after each texture operation for now.
82
*
83
* We could do better by batching a bunch of texture fetches up and
84
* then doing one thread switch and collecting all their results
85
* afterward.
86
*/
87
qir_emit_nondef(c, qir_inst(QOP_THRSW, c->undef,
88
c->undef, c->undef));
89
c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
90
}
91
92
static struct qreg
93
indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
94
{
95
struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);
96
97
/* Clamp to [0, array size). Note that MIN/MAX are signed. */
98
uint32_t range = nir_intrinsic_range(intr);
99
indirect_offset = qir_MAX(c, indirect_offset, qir_uniform_ui(c, 0));
100
indirect_offset = qir_MIN_NOIMM(c, indirect_offset,
101
qir_uniform_ui(c, range - 4));
102
103
qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
104
indirect_offset,
105
qir_uniform(c, QUNIFORM_UBO0_ADDR,
106
nir_intrinsic_base(intr)));
107
108
c->num_texture_samples++;
109
110
ntq_emit_thrsw(c);
111
112
return qir_TEX_RESULT(c);
113
}
114
115
static struct qreg
116
vc4_ubo_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
117
{
118
ASSERTED int buffer_index = nir_src_as_uint(intr->src[0]);
119
assert(buffer_index == 1);
120
assert(c->stage == QSTAGE_FRAG);
121
122
struct qreg offset = ntq_get_src(c, intr->src[1], 0);
123
124
/* Clamp to [0, array size). Note that MIN/MAX are signed. */
125
offset = qir_MAX(c, offset, qir_uniform_ui(c, 0));
126
offset = qir_MIN_NOIMM(c, offset,
127
qir_uniform_ui(c, c->fs_key->ubo_1_size - 4));
128
129
qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
130
offset,
131
qir_uniform(c, QUNIFORM_UBO1_ADDR, 0));
132
133
c->num_texture_samples++;
134
135
ntq_emit_thrsw(c);
136
137
return qir_TEX_RESULT(c);
138
}
139
140
nir_ssa_def *
141
vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz)
142
{
143
switch (swiz) {
144
default:
145
case PIPE_SWIZZLE_NONE:
146
fprintf(stderr, "warning: unknown swizzle\n");
147
FALLTHROUGH;
148
case PIPE_SWIZZLE_0:
149
return nir_imm_float(b, 0.0);
150
case PIPE_SWIZZLE_1:
151
return nir_imm_float(b, 1.0);
152
case PIPE_SWIZZLE_X:
153
case PIPE_SWIZZLE_Y:
154
case PIPE_SWIZZLE_Z:
155
case PIPE_SWIZZLE_W:
156
return srcs[swiz];
157
}
158
}
159
160
static struct qreg *
161
ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def)
162
{
163
struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
164
def->num_components);
165
_mesa_hash_table_insert(c->def_ht, def, qregs);
166
return qregs;
167
}
168
169
/**
170
* This function is responsible for getting QIR results into the associated
171
* storage for a NIR instruction.
172
*
173
* If it's a NIR SSA def, then we just set the associated hash table entry to
174
* the new result.
175
*
176
* If it's a NIR reg, then we need to update the existing qreg assigned to the
177
* NIR destination with the incoming value. To do that without introducing
178
* new MOVs, we require that the incoming qreg either be a uniform, or be
179
* SSA-defined by the previous QIR instruction in the block and rewritable by
180
* this function. That lets us sneak ahead and insert the SF flag beforehand
181
* (knowing that the previous instruction doesn't depend on flags) and rewrite
182
* its destination to be the NIR reg's destination
183
*/
184
static void
185
ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan,
186
struct qreg result)
187
{
188
struct qinst *last_inst = NULL;
189
if (!list_is_empty(&c->cur_block->instructions))
190
last_inst = (struct qinst *)c->cur_block->instructions.prev;
191
192
assert(result.file == QFILE_UNIF ||
193
(result.file == QFILE_TEMP &&
194
last_inst && last_inst == c->defs[result.index]));
195
196
if (dest->is_ssa) {
197
assert(chan < dest->ssa.num_components);
198
199
struct qreg *qregs;
200
struct hash_entry *entry =
201
_mesa_hash_table_search(c->def_ht, &dest->ssa);
202
203
if (entry)
204
qregs = entry->data;
205
else
206
qregs = ntq_init_ssa_def(c, &dest->ssa);
207
208
qregs[chan] = result;
209
} else {
210
nir_register *reg = dest->reg.reg;
211
assert(dest->reg.base_offset == 0);
212
assert(reg->num_array_elems == 0);
213
struct hash_entry *entry =
214
_mesa_hash_table_search(c->def_ht, reg);
215
struct qreg *qregs = entry->data;
216
217
/* Insert a MOV if the source wasn't an SSA def in the
218
* previous instruction.
219
*/
220
if (result.file == QFILE_UNIF) {
221
result = qir_MOV(c, result);
222
last_inst = c->defs[result.index];
223
}
224
225
/* We know they're both temps, so just rewrite index. */
226
c->defs[last_inst->dst.index] = NULL;
227
last_inst->dst.index = qregs[chan].index;
228
229
/* If we're in control flow, then make this update of the reg
230
* conditional on the execution mask.
231
*/
232
if (c->execute.file != QFILE_NULL) {
233
last_inst->dst.index = qregs[chan].index;
234
235
/* Set the flags to the current exec mask. To insert
236
* the SF, we temporarily remove our SSA instruction.
237
*/
238
list_del(&last_inst->link);
239
qir_SF(c, c->execute);
240
list_addtail(&last_inst->link,
241
&c->cur_block->instructions);
242
243
last_inst->cond = QPU_COND_ZS;
244
last_inst->cond_is_exec_mask = true;
245
}
246
}
247
}
248
249
static struct qreg *
250
ntq_get_dest(struct vc4_compile *c, nir_dest *dest)
251
{
252
if (dest->is_ssa) {
253
struct qreg *qregs = ntq_init_ssa_def(c, &dest->ssa);
254
for (int i = 0; i < dest->ssa.num_components; i++)
255
qregs[i] = c->undef;
256
return qregs;
257
} else {
258
nir_register *reg = dest->reg.reg;
259
assert(dest->reg.base_offset == 0);
260
assert(reg->num_array_elems == 0);
261
struct hash_entry *entry =
262
_mesa_hash_table_search(c->def_ht, reg);
263
return entry->data;
264
}
265
}
266
267
static struct qreg
268
ntq_get_src(struct vc4_compile *c, nir_src src, int i)
269
{
270
struct hash_entry *entry;
271
if (src.is_ssa) {
272
entry = _mesa_hash_table_search(c->def_ht, src.ssa);
273
assert(i < src.ssa->num_components);
274
} else {
275
nir_register *reg = src.reg.reg;
276
entry = _mesa_hash_table_search(c->def_ht, reg);
277
assert(reg->num_array_elems == 0);
278
assert(src.reg.base_offset == 0);
279
assert(i < reg->num_components);
280
}
281
282
struct qreg *qregs = entry->data;
283
return qregs[i];
284
}
285
286
static struct qreg
287
ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr,
288
unsigned src)
289
{
290
assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
291
unsigned chan = ffs(instr->dest.write_mask) - 1;
292
struct qreg r = ntq_get_src(c, instr->src[src].src,
293
instr->src[src].swizzle[chan]);
294
295
assert(!instr->src[src].abs);
296
assert(!instr->src[src].negate);
297
298
return r;
299
};
300
301
static inline struct qreg
302
qir_SAT(struct vc4_compile *c, struct qreg val)
303
{
304
return qir_FMAX(c,
305
qir_FMIN(c, val, qir_uniform_f(c, 1.0)),
306
qir_uniform_f(c, 0.0));
307
}
308
309
static struct qreg
310
ntq_rcp(struct vc4_compile *c, struct qreg x)
311
{
312
struct qreg r = qir_RCP(c, x);
313
314
/* Apply a Newton-Raphson step to improve the accuracy. */
315
r = qir_FMUL(c, r, qir_FSUB(c,
316
qir_uniform_f(c, 2.0),
317
qir_FMUL(c, x, r)));
318
319
return r;
320
}
321
322
static struct qreg
323
ntq_rsq(struct vc4_compile *c, struct qreg x)
324
{
325
struct qreg r = qir_RSQ(c, x);
326
327
/* Apply a Newton-Raphson step to improve the accuracy. */
328
r = qir_FMUL(c, r, qir_FSUB(c,
329
qir_uniform_f(c, 1.5),
330
qir_FMUL(c,
331
qir_uniform_f(c, 0.5),
332
qir_FMUL(c, x,
333
qir_FMUL(c, r, r)))));
334
335
return r;
336
}
337
338
static struct qreg
339
ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1)
340
{
341
struct qreg src0_hi = qir_SHR(c, src0,
342
qir_uniform_ui(c, 24));
343
struct qreg src1_hi = qir_SHR(c, src1,
344
qir_uniform_ui(c, 24));
345
346
struct qreg hilo = qir_MUL24(c, src0_hi, src1);
347
struct qreg lohi = qir_MUL24(c, src0, src1_hi);
348
struct qreg lolo = qir_MUL24(c, src0, src1);
349
350
return qir_ADD(c, lolo, qir_SHL(c,
351
qir_ADD(c, hilo, lohi),
352
qir_uniform_ui(c, 24)));
353
}
354
355
static struct qreg
356
ntq_scale_depth_texture(struct vc4_compile *c, struct qreg src)
357
{
358
struct qreg depthf = qir_ITOF(c, qir_SHR(c, src,
359
qir_uniform_ui(c, 8)));
360
return qir_FMUL(c, depthf, qir_uniform_f(c, 1.0f/0xffffff));
361
}
362
363
/**
364
* Emits a lowered TXF_MS from an MSAA texture.
365
*
366
* The addressing math has been lowered in NIR, and now we just need to read
367
* it like a UBO.
368
*/
369
static void
370
ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
371
{
372
uint32_t tile_width = 32;
373
uint32_t tile_height = 32;
374
uint32_t tile_size = (tile_height * tile_width *
375
VC4_MAX_SAMPLES * sizeof(uint32_t));
376
377
unsigned unit = instr->texture_index;
378
uint32_t w = align(c->key->tex[unit].msaa_width, tile_width);
379
uint32_t w_tiles = w / tile_width;
380
uint32_t h = align(c->key->tex[unit].msaa_height, tile_height);
381
uint32_t h_tiles = h / tile_height;
382
uint32_t size = w_tiles * h_tiles * tile_size;
383
384
struct qreg addr;
385
assert(instr->num_srcs == 1);
386
assert(instr->src[0].src_type == nir_tex_src_coord);
387
addr = ntq_get_src(c, instr->src[0].src, 0);
388
389
/* Perform the clamping required by kernel validation. */
390
addr = qir_MAX(c, addr, qir_uniform_ui(c, 0));
391
addr = qir_MIN_NOIMM(c, addr, qir_uniform_ui(c, size - 4));
392
393
qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
394
addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit));
395
396
ntq_emit_thrsw(c);
397
398
struct qreg tex = qir_TEX_RESULT(c);
399
c->num_texture_samples++;
400
401
enum pipe_format format = c->key->tex[unit].format;
402
if (util_format_is_depth_or_stencil(format)) {
403
struct qreg scaled = ntq_scale_depth_texture(c, tex);
404
for (int i = 0; i < 4; i++)
405
ntq_store_dest(c, &instr->dest, i, qir_MOV(c, scaled));
406
} else {
407
for (int i = 0; i < 4; i++)
408
ntq_store_dest(c, &instr->dest, i,
409
qir_UNPACK_8_F(c, tex, i));
410
}
411
}
412
413
static void
414
ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
415
{
416
struct qreg s, t, r, lod, compare;
417
bool is_txb = false, is_txl = false;
418
unsigned unit = instr->texture_index;
419
420
if (instr->op == nir_texop_txf) {
421
ntq_emit_txf(c, instr);
422
return;
423
}
424
425
for (unsigned i = 0; i < instr->num_srcs; i++) {
426
switch (instr->src[i].src_type) {
427
case nir_tex_src_coord:
428
s = ntq_get_src(c, instr->src[i].src, 0);
429
if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D)
430
t = qir_uniform_f(c, 0.5);
431
else
432
t = ntq_get_src(c, instr->src[i].src, 1);
433
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
434
r = ntq_get_src(c, instr->src[i].src, 2);
435
break;
436
case nir_tex_src_bias:
437
lod = ntq_get_src(c, instr->src[i].src, 0);
438
is_txb = true;
439
break;
440
case nir_tex_src_lod:
441
lod = ntq_get_src(c, instr->src[i].src, 0);
442
is_txl = true;
443
break;
444
case nir_tex_src_comparator:
445
compare = ntq_get_src(c, instr->src[i].src, 0);
446
break;
447
default:
448
unreachable("unknown texture source");
449
}
450
}
451
452
if (c->stage != QSTAGE_FRAG && !is_txl) {
453
/* From the GLSL 1.20 spec:
454
*
455
* "If it is mip-mapped and running on the vertex shader,
456
* then the base texture is used."
457
*/
458
is_txl = true;
459
lod = qir_uniform_ui(c, 0);
460
}
461
462
if (c->key->tex[unit].force_first_level) {
463
lod = qir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, unit);
464
is_txl = true;
465
is_txb = false;
466
}
467
468
struct qreg texture_u[] = {
469
qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0, unit),
470
qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, unit),
471
qir_uniform(c, QUNIFORM_CONSTANT, 0),
472
qir_uniform(c, QUNIFORM_CONSTANT, 0),
473
};
474
uint32_t next_texture_u = 0;
475
476
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE || is_txl) {
477
texture_u[2] = qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P2,
478
unit | (is_txl << 16));
479
}
480
481
struct qinst *tmu;
482
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
483
tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), r);
484
tmu->src[qir_get_tex_uniform_src(tmu)] =
485
texture_u[next_texture_u++];
486
} else if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
487
c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP ||
488
c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
489
c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
490
tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0),
491
qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR,
492
unit));
493
tmu->src[qir_get_tex_uniform_src(tmu)] =
494
texture_u[next_texture_u++];
495
}
496
497
if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP) {
498
s = qir_SAT(c, s);
499
}
500
501
if (c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
502
t = qir_SAT(c, t);
503
}
504
505
tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_T, 0), t);
506
tmu->src[qir_get_tex_uniform_src(tmu)] =
507
texture_u[next_texture_u++];
508
509
if (is_txl || is_txb) {
510
tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_B, 0), lod);
511
tmu->src[qir_get_tex_uniform_src(tmu)] =
512
texture_u[next_texture_u++];
513
}
514
515
tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_S, 0), s);
516
tmu->src[qir_get_tex_uniform_src(tmu)] = texture_u[next_texture_u++];
517
518
c->num_texture_samples++;
519
520
ntq_emit_thrsw(c);
521
522
struct qreg tex = qir_TEX_RESULT(c);
523
524
enum pipe_format format = c->key->tex[unit].format;
525
526
struct qreg *dest = ntq_get_dest(c, &instr->dest);
527
if (util_format_is_depth_or_stencil(format)) {
528
struct qreg normalized = ntq_scale_depth_texture(c, tex);
529
struct qreg depth_output;
530
531
struct qreg u0 = qir_uniform_f(c, 0.0f);
532
struct qreg u1 = qir_uniform_f(c, 1.0f);
533
if (c->key->tex[unit].compare_mode) {
534
/* From the GL_ARB_shadow spec:
535
*
536
* "Let Dt (D subscript t) be the depth texture
537
* value, in the range [0, 1]. Let R be the
538
* interpolated texture coordinate clamped to the
539
* range [0, 1]."
540
*/
541
compare = qir_SAT(c, compare);
542
543
switch (c->key->tex[unit].compare_func) {
544
case PIPE_FUNC_NEVER:
545
depth_output = qir_uniform_f(c, 0.0f);
546
break;
547
case PIPE_FUNC_ALWAYS:
548
depth_output = u1;
549
break;
550
case PIPE_FUNC_EQUAL:
551
qir_SF(c, qir_FSUB(c, compare, normalized));
552
depth_output = qir_SEL(c, QPU_COND_ZS, u1, u0);
553
break;
554
case PIPE_FUNC_NOTEQUAL:
555
qir_SF(c, qir_FSUB(c, compare, normalized));
556
depth_output = qir_SEL(c, QPU_COND_ZC, u1, u0);
557
break;
558
case PIPE_FUNC_GREATER:
559
qir_SF(c, qir_FSUB(c, compare, normalized));
560
depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
561
break;
562
case PIPE_FUNC_GEQUAL:
563
qir_SF(c, qir_FSUB(c, normalized, compare));
564
depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
565
break;
566
case PIPE_FUNC_LESS:
567
qir_SF(c, qir_FSUB(c, compare, normalized));
568
depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
569
break;
570
case PIPE_FUNC_LEQUAL:
571
qir_SF(c, qir_FSUB(c, normalized, compare));
572
depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
573
break;
574
}
575
} else {
576
depth_output = normalized;
577
}
578
579
for (int i = 0; i < 4; i++)
580
dest[i] = depth_output;
581
} else {
582
for (int i = 0; i < 4; i++)
583
dest[i] = qir_UNPACK_8_F(c, tex, i);
584
}
585
}
586
587
/**
588
* Computes x - floor(x), which is tricky because our FTOI truncates (rounds
589
* to zero).
590
*/
591
static struct qreg
592
ntq_ffract(struct vc4_compile *c, struct qreg src)
593
{
594
struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
595
struct qreg diff = qir_FSUB(c, src, trunc);
596
qir_SF(c, diff);
597
598
qir_FADD_dest(c, diff,
599
diff, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
600
601
return qir_MOV(c, diff);
602
}
603
604
/**
605
* Computes floor(x), which is tricky because our FTOI truncates (rounds to
606
* zero).
607
*/
608
static struct qreg
609
ntq_ffloor(struct vc4_compile *c, struct qreg src)
610
{
611
struct qreg result = qir_ITOF(c, qir_FTOI(c, src));
612
613
/* This will be < 0 if we truncated and the truncation was of a value
614
* that was < 0 in the first place.
615
*/
616
qir_SF(c, qir_FSUB(c, src, result));
617
618
struct qinst *sub = qir_FSUB_dest(c, result,
619
result, qir_uniform_f(c, 1.0));
620
sub->cond = QPU_COND_NS;
621
622
return qir_MOV(c, result);
623
}
624
625
/**
626
* Computes ceil(x), which is tricky because our FTOI truncates (rounds to
627
* zero).
628
*/
629
static struct qreg
630
ntq_fceil(struct vc4_compile *c, struct qreg src)
631
{
632
struct qreg result = qir_ITOF(c, qir_FTOI(c, src));
633
634
/* This will be < 0 if we truncated and the truncation was of a value
635
* that was > 0 in the first place.
636
*/
637
qir_SF(c, qir_FSUB(c, result, src));
638
639
qir_FADD_dest(c, result,
640
result, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
641
642
return qir_MOV(c, result);
643
}
644
645
static struct qreg
646
ntq_shrink_sincos_input_range(struct vc4_compile *c, struct qreg x)
647
{
648
/* Since we're using a Taylor approximation, we want to have a small
649
* number of coefficients and take advantage of sin/cos repeating
650
* every 2pi. We keep our x as close to 0 as we can, since the series
651
* will be less accurate as |x| increases. (Also, be careful of
652
* shifting the input x value to be tricky with sin/cos relations,
653
* because getting accurate values for x==0 is very important for SDL
654
* rendering)
655
*/
656
struct qreg scaled_x =
657
qir_FMUL(c, x,
658
qir_uniform_f(c, 1.0f / (M_PI * 2.0f)));
659
/* Note: FTOI truncates toward 0. */
660
struct qreg x_frac = qir_FSUB(c, scaled_x,
661
qir_ITOF(c, qir_FTOI(c, scaled_x)));
662
/* Map [0.5, 1] to [-0.5, 0] */
663
qir_SF(c, qir_FSUB(c, x_frac, qir_uniform_f(c, 0.5)));
664
qir_FSUB_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NC;
665
/* Map [-1, -0.5] to [0, 0.5] */
666
qir_SF(c, qir_FADD(c, x_frac, qir_uniform_f(c, 0.5)));
667
qir_FADD_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
668
669
return x_frac;
670
}
671
672
static struct qreg
673
ntq_fsin(struct vc4_compile *c, struct qreg src)
674
{
675
float coeff[] = {
676
2.0 * M_PI,
677
-pow(2.0 * M_PI, 3) / (3 * 2 * 1),
678
pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1),
679
-pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1),
680
pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
681
};
682
683
struct qreg x = ntq_shrink_sincos_input_range(c, src);
684
struct qreg x2 = qir_FMUL(c, x, x);
685
struct qreg sum = qir_FMUL(c, x, qir_uniform_f(c, coeff[0]));
686
for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
687
x = qir_FMUL(c, x, x2);
688
sum = qir_FADD(c,
689
sum,
690
qir_FMUL(c,
691
x,
692
qir_uniform_f(c, coeff[i])));
693
}
694
return sum;
695
}
696
697
static struct qreg
698
ntq_fcos(struct vc4_compile *c, struct qreg src)
699
{
700
float coeff[] = {
701
1.0f,
702
-pow(2.0 * M_PI, 2) / (2 * 1),
703
pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1),
704
-pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1),
705
pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
706
-pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
707
};
708
709
struct qreg x_frac = ntq_shrink_sincos_input_range(c, src);
710
struct qreg sum = qir_uniform_f(c, coeff[0]);
711
struct qreg x2 = qir_FMUL(c, x_frac, x_frac);
712
struct qreg x = x2; /* Current x^2, x^4, or x^6 */
713
for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
714
if (i != 1)
715
x = qir_FMUL(c, x, x2);
716
717
sum = qir_FADD(c, qir_FMUL(c,
718
x,
719
qir_uniform_f(c, coeff[i])),
720
sum);
721
}
722
return sum;
723
}
724
725
static struct qreg
726
ntq_fsign(struct vc4_compile *c, struct qreg src)
727
{
728
struct qreg t = qir_get_temp(c);
729
730
qir_SF(c, src);
731
qir_MOV_dest(c, t, qir_uniform_f(c, 0.0));
732
qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC;
733
qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS;
734
return qir_MOV(c, t);
735
}
736
737
static void
738
emit_vertex_input(struct vc4_compile *c, int attr)
739
{
740
enum pipe_format format = c->vs_key->attr_formats[attr];
741
uint32_t attr_size = util_format_get_blocksize(format);
742
743
c->vattr_sizes[attr] = align(attr_size, 4);
744
for (int i = 0; i < align(attr_size, 4) / 4; i++) {
745
c->inputs[attr * 4 + i] =
746
qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i));
747
c->num_inputs++;
748
}
749
}
750
751
static void
752
emit_fragcoord_input(struct vc4_compile *c, int attr)
753
{
754
c->inputs[attr * 4 + 0] = qir_ITOF(c, qir_reg(QFILE_FRAG_X, 0));
755
c->inputs[attr * 4 + 1] = qir_ITOF(c, qir_reg(QFILE_FRAG_Y, 0));
756
c->inputs[attr * 4 + 2] =
757
qir_FMUL(c,
758
qir_ITOF(c, qir_FRAG_Z(c)),
759
qir_uniform_f(c, 1.0 / 0xffffff));
760
c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c));
761
}
762
763
static struct qreg
764
emit_fragment_varying(struct vc4_compile *c, gl_varying_slot slot,
765
uint8_t swizzle)
766
{
767
uint32_t i = c->num_input_slots++;
768
struct qreg vary = {
769
QFILE_VARY,
770
i
771
};
772
773
if (c->num_input_slots >= c->input_slots_array_size) {
774
c->input_slots_array_size =
775
MAX2(4, c->input_slots_array_size * 2);
776
777
c->input_slots = reralloc(c, c->input_slots,
778
struct vc4_varying_slot,
779
c->input_slots_array_size);
780
}
781
782
c->input_slots[i].slot = slot;
783
c->input_slots[i].swizzle = swizzle;
784
785
return qir_VARY_ADD_C(c, qir_FMUL(c, vary, qir_FRAG_W(c)));
786
}
787
788
static void
789
emit_fragment_input(struct vc4_compile *c, int attr, gl_varying_slot slot)
790
{
791
for (int i = 0; i < 4; i++) {
792
c->inputs[attr * 4 + i] =
793
emit_fragment_varying(c, slot, i);
794
c->num_inputs++;
795
}
796
}
797
798
static void
799
add_output(struct vc4_compile *c,
800
uint32_t decl_offset,
801
uint8_t slot,
802
uint8_t swizzle)
803
{
804
uint32_t old_array_size = c->outputs_array_size;
805
resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
806
decl_offset + 1);
807
808
if (old_array_size != c->outputs_array_size) {
809
c->output_slots = reralloc(c,
810
c->output_slots,
811
struct vc4_varying_slot,
812
c->outputs_array_size);
813
}
814
815
c->output_slots[decl_offset].slot = slot;
816
c->output_slots[decl_offset].swizzle = swizzle;
817
}
818
819
static bool
820
ntq_src_is_only_ssa_def_user(nir_src *src)
821
{
822
if (!src->is_ssa)
823
return false;
824
825
if (!list_is_empty(&src->ssa->if_uses))
826
return false;
827
828
return (src->ssa->uses.next == &src->use_link &&
829
src->ssa->uses.next->next == &src->ssa->uses);
830
}
831
832
/**
833
* In general, emits a nir_pack_unorm_4x8 as a series of MOVs with the pack
834
* bit set.
835
*
836
* However, as an optimization, it tries to find the instructions generating
837
* the sources to be packed and just emit the pack flag there, if possible.
838
*/
839
static void
840
ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
841
{
842
struct qreg result = qir_get_temp(c);
843
struct nir_alu_instr *vec4 = NULL;
844
845
/* If packing from a vec4 op (as expected), identify it so that we can
846
* peek back at what generated its sources.
847
*/
848
if (instr->src[0].src.is_ssa &&
849
instr->src[0].src.ssa->parent_instr->type == nir_instr_type_alu &&
850
nir_instr_as_alu(instr->src[0].src.ssa->parent_instr)->op ==
851
nir_op_vec4) {
852
vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
853
}
854
855
/* If the pack is replicating the same channel 4 times, use the 8888
856
* pack flag. This is common for blending using the alpha
857
* channel.
858
*/
859
if (instr->src[0].swizzle[0] == instr->src[0].swizzle[1] &&
860
instr->src[0].swizzle[0] == instr->src[0].swizzle[2] &&
861
instr->src[0].swizzle[0] == instr->src[0].swizzle[3]) {
862
struct qreg rep = ntq_get_src(c,
863
instr->src[0].src,
864
instr->src[0].swizzle[0]);
865
ntq_store_dest(c, &instr->dest.dest, 0, qir_PACK_8888_F(c, rep));
866
return;
867
}
868
869
for (int i = 0; i < 4; i++) {
870
int swiz = instr->src[0].swizzle[i];
871
struct qreg src;
872
if (vec4) {
873
src = ntq_get_src(c, vec4->src[swiz].src,
874
vec4->src[swiz].swizzle[0]);
875
} else {
876
src = ntq_get_src(c, instr->src[0].src, swiz);
877
}
878
879
if (vec4 &&
880
ntq_src_is_only_ssa_def_user(&vec4->src[swiz].src) &&
881
src.file == QFILE_TEMP &&
882
c->defs[src.index] &&
883
qir_is_mul(c->defs[src.index]) &&
884
!c->defs[src.index]->dst.pack) {
885
struct qinst *rewrite = c->defs[src.index];
886
c->defs[src.index] = NULL;
887
rewrite->dst = result;
888
rewrite->dst.pack = QPU_PACK_MUL_8A + i;
889
continue;
890
}
891
892
qir_PACK_8_F(c, result, src, i);
893
}
894
895
ntq_store_dest(c, &instr->dest.dest, 0, qir_MOV(c, result));
896
}
897
898
/** Handles sign-extended bitfield extracts for 16 bits. */
899
static struct qreg
900
ntq_emit_ibfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
901
struct qreg bits)
902
{
903
assert(bits.file == QFILE_UNIF &&
904
c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
905
c->uniform_data[bits.index] == 16);
906
907
assert(offset.file == QFILE_UNIF &&
908
c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
909
int offset_bit = c->uniform_data[offset.index];
910
assert(offset_bit % 16 == 0);
911
912
return qir_UNPACK_16_I(c, base, offset_bit / 16);
913
}
914
915
/** Handles unsigned bitfield extracts for 8 bits. */
916
static struct qreg
917
ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
918
struct qreg bits)
919
{
920
assert(bits.file == QFILE_UNIF &&
921
c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
922
c->uniform_data[bits.index] == 8);
923
924
assert(offset.file == QFILE_UNIF &&
925
c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
926
int offset_bit = c->uniform_data[offset.index];
927
assert(offset_bit % 8 == 0);
928
929
return qir_UNPACK_8_I(c, base, offset_bit / 8);
930
}
931
932
/**
933
* If compare_instr is a valid comparison instruction, emits the
934
* compare_instr's comparison and returns the sel_instr's return value based
935
* on the compare_instr's result.
936
*/
937
static bool
938
ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
939
nir_alu_instr *compare_instr,
940
nir_alu_instr *sel_instr)
941
{
942
enum qpu_cond cond;
943
944
switch (compare_instr->op) {
945
case nir_op_feq32:
946
case nir_op_ieq32:
947
case nir_op_seq:
948
cond = QPU_COND_ZS;
949
break;
950
case nir_op_fneu32:
951
case nir_op_ine32:
952
case nir_op_sne:
953
cond = QPU_COND_ZC;
954
break;
955
case nir_op_fge32:
956
case nir_op_ige32:
957
case nir_op_uge32:
958
case nir_op_sge:
959
cond = QPU_COND_NC;
960
break;
961
case nir_op_flt32:
962
case nir_op_ilt32:
963
case nir_op_slt:
964
cond = QPU_COND_NS;
965
break;
966
default:
967
return false;
968
}
969
970
struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
971
struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);
972
973
unsigned unsized_type =
974
nir_alu_type_get_base_type(nir_op_infos[compare_instr->op].input_types[0]);
975
if (unsized_type == nir_type_float)
976
qir_SF(c, qir_FSUB(c, src0, src1));
977
else
978
qir_SF(c, qir_SUB(c, src0, src1));
979
980
switch (sel_instr->op) {
981
case nir_op_seq:
982
case nir_op_sne:
983
case nir_op_sge:
984
case nir_op_slt:
985
*dest = qir_SEL(c, cond,
986
qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0));
987
break;
988
989
case nir_op_b32csel:
990
*dest = qir_SEL(c, cond,
991
ntq_get_alu_src(c, sel_instr, 1),
992
ntq_get_alu_src(c, sel_instr, 2));
993
break;
994
995
default:
996
*dest = qir_SEL(c, cond,
997
qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0));
998
break;
999
}
1000
1001
/* Make the temporary for nir_store_dest(). */
1002
*dest = qir_MOV(c, *dest);
1003
1004
return true;
1005
}
1006
1007
/**
1008
* Attempts to fold a comparison generating a boolean result into the
1009
* condition code for selecting between two values, instead of comparing the
1010
* boolean result against 0 to generate the condition code.
1011
*/
1012
static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr,
1013
struct qreg *src)
1014
{
1015
if (!instr->src[0].src.is_ssa)
1016
goto out;
1017
if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
1018
goto out;
1019
nir_alu_instr *compare =
1020
nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
1021
if (!compare)
1022
goto out;
1023
1024
struct qreg dest;
1025
if (ntq_emit_comparison(c, &dest, compare, instr))
1026
return dest;
1027
1028
out:
1029
qir_SF(c, src[0]);
1030
return qir_MOV(c, qir_SEL(c, QPU_COND_NS, src[1], src[2]));
1031
}
1032
1033
static struct qreg
1034
ntq_fddx(struct vc4_compile *c, struct qreg src)
1035
{
1036
/* Make sure that we have a bare temp to use for MUL rotation, so it
1037
* can be allocated to an accumulator.
1038
*/
1039
if (src.pack || src.file != QFILE_TEMP)
1040
src = qir_MOV(c, src);
1041
1042
struct qreg from_left = qir_ROT_MUL(c, src, 1);
1043
struct qreg from_right = qir_ROT_MUL(c, src, 15);
1044
1045
/* Distinguish left/right pixels of the quad. */
1046
qir_SF(c, qir_AND(c, qir_reg(QFILE_QPU_ELEMENT, 0),
1047
qir_uniform_ui(c, 1)));
1048
1049
return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
1050
qir_FSUB(c, from_right, src),
1051
qir_FSUB(c, src, from_left)));
1052
}
1053
1054
static struct qreg
1055
ntq_fddy(struct vc4_compile *c, struct qreg src)
1056
{
1057
if (src.pack || src.file != QFILE_TEMP)
1058
src = qir_MOV(c, src);
1059
1060
struct qreg from_bottom = qir_ROT_MUL(c, src, 2);
1061
struct qreg from_top = qir_ROT_MUL(c, src, 14);
1062
1063
/* Distinguish top/bottom pixels of the quad. */
1064
qir_SF(c, qir_AND(c,
1065
qir_reg(QFILE_QPU_ELEMENT, 0),
1066
qir_uniform_ui(c, 2)));
1067
1068
return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
1069
qir_FSUB(c, from_top, src),
1070
qir_FSUB(c, src, from_bottom)));
1071
}
1072
1073
static void
1074
ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
1075
{
1076
/* This should always be lowered to ALU operations for VC4. */
1077
assert(!instr->dest.saturate);
1078
1079
/* Vectors are special in that they have non-scalarized writemasks,
1080
* and just take the first swizzle channel for each argument in order
1081
* into each writemask channel.
1082
*/
1083
if (instr->op == nir_op_vec2 ||
1084
instr->op == nir_op_vec3 ||
1085
instr->op == nir_op_vec4) {
1086
struct qreg srcs[4];
1087
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1088
srcs[i] = ntq_get_src(c, instr->src[i].src,
1089
instr->src[i].swizzle[0]);
1090
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1091
ntq_store_dest(c, &instr->dest.dest, i,
1092
qir_MOV(c, srcs[i]));
1093
return;
1094
}
1095
1096
if (instr->op == nir_op_pack_unorm_4x8) {
1097
ntq_emit_pack_unorm_4x8(c, instr);
1098
return;
1099
}
1100
1101
if (instr->op == nir_op_unpack_unorm_4x8) {
1102
struct qreg src = ntq_get_src(c, instr->src[0].src,
1103
instr->src[0].swizzle[0]);
1104
for (int i = 0; i < 4; i++) {
1105
if (instr->dest.write_mask & (1 << i))
1106
ntq_store_dest(c, &instr->dest.dest, i,
1107
qir_UNPACK_8_F(c, src, i));
1108
}
1109
return;
1110
}
1111
1112
/* General case: We can just grab the one used channel per src. */
1113
struct qreg src[nir_op_infos[instr->op].num_inputs];
1114
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1115
src[i] = ntq_get_alu_src(c, instr, i);
1116
}
1117
1118
struct qreg result;
1119
1120
switch (instr->op) {
1121
case nir_op_mov:
1122
result = qir_MOV(c, src[0]);
1123
break;
1124
case nir_op_fmul:
1125
result = qir_FMUL(c, src[0], src[1]);
1126
break;
1127
case nir_op_fadd:
1128
result = qir_FADD(c, src[0], src[1]);
1129
break;
1130
case nir_op_fsub:
1131
result = qir_FSUB(c, src[0], src[1]);
1132
break;
1133
case nir_op_fmin:
1134
result = qir_FMIN(c, src[0], src[1]);
1135
break;
1136
case nir_op_fmax:
1137
result = qir_FMAX(c, src[0], src[1]);
1138
break;
1139
1140
case nir_op_f2i32:
1141
case nir_op_f2u32:
1142
result = qir_FTOI(c, src[0]);
1143
break;
1144
case nir_op_i2f32:
1145
case nir_op_u2f32:
1146
result = qir_ITOF(c, src[0]);
1147
break;
1148
case nir_op_b2f32:
1149
result = qir_AND(c, src[0], qir_uniform_f(c, 1.0));
1150
break;
1151
case nir_op_b2i32:
1152
result = qir_AND(c, src[0], qir_uniform_ui(c, 1));
1153
break;
1154
case nir_op_i2b32:
1155
case nir_op_f2b32:
1156
qir_SF(c, src[0]);
1157
result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC,
1158
qir_uniform_ui(c, ~0),
1159
qir_uniform_ui(c, 0)));
1160
break;
1161
1162
case nir_op_iadd:
1163
result = qir_ADD(c, src[0], src[1]);
1164
break;
1165
case nir_op_ushr:
1166
result = qir_SHR(c, src[0], src[1]);
1167
break;
1168
case nir_op_isub:
1169
result = qir_SUB(c, src[0], src[1]);
1170
break;
1171
case nir_op_ishr:
1172
result = qir_ASR(c, src[0], src[1]);
1173
break;
1174
case nir_op_ishl:
1175
result = qir_SHL(c, src[0], src[1]);
1176
break;
1177
case nir_op_imin:
1178
result = qir_MIN(c, src[0], src[1]);
1179
break;
1180
case nir_op_imax:
1181
result = qir_MAX(c, src[0], src[1]);
1182
break;
1183
case nir_op_iand:
1184
result = qir_AND(c, src[0], src[1]);
1185
break;
1186
case nir_op_ior:
1187
result = qir_OR(c, src[0], src[1]);
1188
break;
1189
case nir_op_ixor:
1190
result = qir_XOR(c, src[0], src[1]);
1191
break;
1192
case nir_op_inot:
1193
result = qir_NOT(c, src[0]);
1194
break;
1195
1196
case nir_op_imul:
1197
result = ntq_umul(c, src[0], src[1]);
1198
break;
1199
1200
case nir_op_seq:
1201
case nir_op_sne:
1202
case nir_op_sge:
1203
case nir_op_slt:
1204
case nir_op_feq32:
1205
case nir_op_fneu32:
1206
case nir_op_fge32:
1207
case nir_op_flt32:
1208
case nir_op_ieq32:
1209
case nir_op_ine32:
1210
case nir_op_ige32:
1211
case nir_op_uge32:
1212
case nir_op_ilt32:
1213
if (!ntq_emit_comparison(c, &result, instr, instr)) {
1214
fprintf(stderr, "Bad comparison instruction\n");
1215
}
1216
break;
1217
1218
case nir_op_b32csel:
1219
result = ntq_emit_bcsel(c, instr, src);
1220
break;
1221
case nir_op_fcsel:
1222
qir_SF(c, src[0]);
1223
result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, src[1], src[2]));
1224
break;
1225
1226
case nir_op_frcp:
1227
result = ntq_rcp(c, src[0]);
1228
break;
1229
case nir_op_frsq:
1230
result = ntq_rsq(c, src[0]);
1231
break;
1232
case nir_op_fexp2:
1233
result = qir_EXP2(c, src[0]);
1234
break;
1235
case nir_op_flog2:
1236
result = qir_LOG2(c, src[0]);
1237
break;
1238
1239
case nir_op_ftrunc:
1240
result = qir_ITOF(c, qir_FTOI(c, src[0]));
1241
break;
1242
case nir_op_fceil:
1243
result = ntq_fceil(c, src[0]);
1244
break;
1245
case nir_op_ffract:
1246
result = ntq_ffract(c, src[0]);
1247
break;
1248
case nir_op_ffloor:
1249
result = ntq_ffloor(c, src[0]);
1250
break;
1251
1252
case nir_op_fsin:
1253
result = ntq_fsin(c, src[0]);
1254
break;
1255
case nir_op_fcos:
1256
result = ntq_fcos(c, src[0]);
1257
break;
1258
1259
case nir_op_fsign:
1260
result = ntq_fsign(c, src[0]);
1261
break;
1262
1263
case nir_op_fabs:
1264
result = qir_FMAXABS(c, src[0], src[0]);
1265
break;
1266
case nir_op_iabs:
1267
result = qir_MAX(c, src[0],
1268
qir_SUB(c, qir_uniform_ui(c, 0), src[0]));
1269
break;
1270
1271
case nir_op_ibitfield_extract:
1272
result = ntq_emit_ibfe(c, src[0], src[1], src[2]);
1273
break;
1274
1275
case nir_op_ubitfield_extract:
1276
result = ntq_emit_ubfe(c, src[0], src[1], src[2]);
1277
break;
1278
1279
case nir_op_usadd_4x8_vc4:
1280
result = qir_V8ADDS(c, src[0], src[1]);
1281
break;
1282
1283
case nir_op_ussub_4x8_vc4:
1284
result = qir_V8SUBS(c, src[0], src[1]);
1285
break;
1286
1287
case nir_op_umin_4x8_vc4:
1288
result = qir_V8MIN(c, src[0], src[1]);
1289
break;
1290
1291
case nir_op_umax_4x8_vc4:
1292
result = qir_V8MAX(c, src[0], src[1]);
1293
break;
1294
1295
case nir_op_umul_unorm_4x8_vc4:
1296
result = qir_V8MULD(c, src[0], src[1]);
1297
break;
1298
1299
case nir_op_fddx:
1300
case nir_op_fddx_coarse:
1301
case nir_op_fddx_fine:
1302
result = ntq_fddx(c, src[0]);
1303
break;
1304
1305
case nir_op_fddy:
1306
case nir_op_fddy_coarse:
1307
case nir_op_fddy_fine:
1308
result = ntq_fddy(c, src[0]);
1309
break;
1310
1311
default:
1312
fprintf(stderr, "unknown NIR ALU inst: ");
1313
nir_print_instr(&instr->instr, stderr);
1314
fprintf(stderr, "\n");
1315
abort();
1316
}
1317
1318
/* We have a scalar result, so the instruction should only have a
1319
* single channel written to.
1320
*/
1321
assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
1322
ntq_store_dest(c, &instr->dest.dest,
1323
ffs(instr->dest.write_mask) - 1, result);
1324
}
1325
1326
static void
1327
emit_frag_end(struct vc4_compile *c)
1328
{
1329
struct qreg color;
1330
if (c->output_color_index != -1) {
1331
color = c->outputs[c->output_color_index];
1332
} else {
1333
color = qir_uniform_ui(c, 0);
1334
}
1335
1336
uint32_t discard_cond = QPU_COND_ALWAYS;
1337
if (c->s->info.fs.uses_discard) {
1338
qir_SF(c, c->discard);
1339
discard_cond = QPU_COND_ZS;
1340
}
1341
1342
if (c->fs_key->stencil_enabled) {
1343
qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
1344
qir_uniform(c, QUNIFORM_STENCIL, 0));
1345
if (c->fs_key->stencil_twoside) {
1346
qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
1347
qir_uniform(c, QUNIFORM_STENCIL, 1));
1348
}
1349
if (c->fs_key->stencil_full_writemasks) {
1350
qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
1351
qir_uniform(c, QUNIFORM_STENCIL, 2));
1352
}
1353
}
1354
1355
if (c->output_sample_mask_index != -1) {
1356
qir_MS_MASK(c, c->outputs[c->output_sample_mask_index]);
1357
}
1358
1359
if (c->fs_key->depth_enabled) {
1360
if (c->output_position_index != -1) {
1361
qir_FTOI_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),
1362
qir_FMUL(c,
1363
c->outputs[c->output_position_index],
1364
qir_uniform_f(c, 0xffffff)))->cond = discard_cond;
1365
} else {
1366
qir_MOV_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),
1367
qir_FRAG_Z(c))->cond = discard_cond;
1368
}
1369
}
1370
1371
if (!c->msaa_per_sample_output) {
1372
qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE, 0),
1373
color)->cond = discard_cond;
1374
} else {
1375
for (int i = 0; i < VC4_MAX_SAMPLES; i++) {
1376
qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE_MS, 0),
1377
c->sample_colors[i])->cond = discard_cond;
1378
}
1379
}
1380
}
1381
1382
static void
1383
emit_scaled_viewport_write(struct vc4_compile *c, struct qreg rcp_w)
1384
{
1385
struct qreg packed = qir_get_temp(c);
1386
1387
for (int i = 0; i < 2; i++) {
1388
struct qreg scale =
1389
qir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, 0);
1390
1391
struct qreg packed_chan = packed;
1392
packed_chan.pack = QPU_PACK_A_16A + i;
1393
1394
qir_FTOI_dest(c, packed_chan,
1395
qir_FMUL(c,
1396
qir_FMUL(c,
1397
c->outputs[c->output_position_index + i],
1398
scale),
1399
rcp_w));
1400
}
1401
1402
qir_VPM_WRITE(c, packed);
1403
}
1404
1405
static void
1406
emit_zs_write(struct vc4_compile *c, struct qreg rcp_w)
1407
{
1408
struct qreg zscale = qir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0);
1409
struct qreg zoffset = qir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0);
1410
1411
qir_VPM_WRITE(c, qir_FADD(c, qir_FMUL(c, qir_FMUL(c,
1412
c->outputs[c->output_position_index + 2],
1413
zscale),
1414
rcp_w),
1415
zoffset));
1416
}
1417
1418
static void
1419
emit_rcp_wc_write(struct vc4_compile *c, struct qreg rcp_w)
1420
{
1421
qir_VPM_WRITE(c, rcp_w);
1422
}
1423
1424
static void
1425
emit_point_size_write(struct vc4_compile *c)
1426
{
1427
struct qreg point_size;
1428
1429
if (c->output_point_size_index != -1)
1430
point_size = c->outputs[c->output_point_size_index];
1431
else
1432
point_size = qir_uniform_f(c, 1.0);
1433
1434
qir_VPM_WRITE(c, point_size);
1435
}
1436
1437
/**
1438
* Emits a VPM read of the stub vertex attribute set up by vc4_draw.c.
1439
*
1440
* The simulator insists that there be at least one vertex attribute, so
1441
* vc4_draw.c will emit one if it wouldn't have otherwise. The simulator also
1442
* insists that all vertex attributes loaded get read by the VS/CS, so we have
1443
* to consume it here.
1444
*/
1445
static void
1446
emit_stub_vpm_read(struct vc4_compile *c)
1447
{
1448
if (c->num_inputs)
1449
return;
1450
1451
c->vattr_sizes[0] = 4;
1452
(void)qir_MOV(c, qir_reg(QFILE_VPM, 0));
1453
c->num_inputs++;
1454
}
1455
1456
static void
1457
emit_vert_end(struct vc4_compile *c,
1458
struct vc4_varying_slot *fs_inputs,
1459
uint32_t num_fs_inputs)
1460
{
1461
struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]);
1462
1463
emit_stub_vpm_read(c);
1464
1465
emit_scaled_viewport_write(c, rcp_w);
1466
emit_zs_write(c, rcp_w);
1467
emit_rcp_wc_write(c, rcp_w);
1468
if (c->vs_key->per_vertex_point_size)
1469
emit_point_size_write(c);
1470
1471
for (int i = 0; i < num_fs_inputs; i++) {
1472
struct vc4_varying_slot *input = &fs_inputs[i];
1473
int j;
1474
1475
for (j = 0; j < c->num_outputs; j++) {
1476
struct vc4_varying_slot *output =
1477
&c->output_slots[j];
1478
1479
if (input->slot == output->slot &&
1480
input->swizzle == output->swizzle) {
1481
qir_VPM_WRITE(c, c->outputs[j]);
1482
break;
1483
}
1484
}
1485
/* Emit padding if we didn't find a declared VS output for
1486
* this FS input.
1487
*/
1488
if (j == c->num_outputs)
1489
qir_VPM_WRITE(c, qir_uniform_f(c, 0.0));
1490
}
1491
}
1492
1493
static void
1494
emit_coord_end(struct vc4_compile *c)
1495
{
1496
struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]);
1497
1498
emit_stub_vpm_read(c);
1499
1500
for (int i = 0; i < 4; i++)
1501
qir_VPM_WRITE(c, c->outputs[c->output_position_index + i]);
1502
1503
emit_scaled_viewport_write(c, rcp_w);
1504
emit_zs_write(c, rcp_w);
1505
emit_rcp_wc_write(c, rcp_w);
1506
if (c->vs_key->per_vertex_point_size)
1507
emit_point_size_write(c);
1508
}
1509
1510
static void
1511
vc4_optimize_nir(struct nir_shader *s)
1512
{
1513
bool progress;
1514
unsigned lower_flrp =
1515
(s->options->lower_flrp16 ? 16 : 0) |
1516
(s->options->lower_flrp32 ? 32 : 0) |
1517
(s->options->lower_flrp64 ? 64 : 0);
1518
1519
do {
1520
progress = false;
1521
1522
NIR_PASS_V(s, nir_lower_vars_to_ssa);
1523
NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
1524
NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
1525
NIR_PASS(progress, s, nir_copy_prop);
1526
NIR_PASS(progress, s, nir_opt_remove_phis);
1527
NIR_PASS(progress, s, nir_opt_dce);
1528
NIR_PASS(progress, s, nir_opt_dead_cf);
1529
NIR_PASS(progress, s, nir_opt_cse);
1530
NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
1531
NIR_PASS(progress, s, nir_opt_algebraic);
1532
NIR_PASS(progress, s, nir_opt_constant_folding);
1533
if (lower_flrp != 0) {
1534
bool lower_flrp_progress = false;
1535
1536
NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
1537
lower_flrp,
1538
false /* always_precise */);
1539
if (lower_flrp_progress) {
1540
NIR_PASS(progress, s, nir_opt_constant_folding);
1541
progress = true;
1542
}
1543
1544
/* Nothing should rematerialize any flrps, so we only
1545
* need to do this lowering once.
1546
*/
1547
lower_flrp = 0;
1548
}
1549
1550
NIR_PASS(progress, s, nir_opt_undef);
1551
NIR_PASS(progress, s, nir_opt_loop_unroll,
1552
nir_var_shader_in |
1553
nir_var_shader_out |
1554
nir_var_function_temp);
1555
} while (progress);
1556
}
1557
1558
static int
1559
driver_location_compare(const void *in_a, const void *in_b)
1560
{
1561
const nir_variable *const *a = in_a;
1562
const nir_variable *const *b = in_b;
1563
1564
return (*a)->data.driver_location - (*b)->data.driver_location;
1565
}
1566
1567
static void
1568
ntq_setup_inputs(struct vc4_compile *c)
1569
{
1570
unsigned num_entries = 0;
1571
nir_foreach_shader_in_variable(var, c->s)
1572
num_entries++;
1573
1574
nir_variable *vars[num_entries];
1575
1576
unsigned i = 0;
1577
nir_foreach_shader_in_variable(var, c->s)
1578
vars[i++] = var;
1579
1580
/* Sort the variables so that we emit the input setup in
1581
* driver_location order. This is required for VPM reads, whose data
1582
* is fetched into the VPM in driver_location (TGSI register index)
1583
* order.
1584
*/
1585
qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
1586
1587
for (unsigned i = 0; i < num_entries; i++) {
1588
nir_variable *var = vars[i];
1589
unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1590
unsigned loc = var->data.driver_location;
1591
1592
assert(array_len == 1);
1593
(void)array_len;
1594
resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
1595
(loc + 1) * 4);
1596
1597
if (c->stage == QSTAGE_FRAG) {
1598
if (var->data.location == VARYING_SLOT_POS) {
1599
emit_fragcoord_input(c, loc);
1600
} else if (util_varying_is_point_coord(var->data.location,
1601
c->fs_key->point_sprite_mask)) {
1602
c->inputs[loc * 4 + 0] = c->point_x;
1603
c->inputs[loc * 4 + 1] = c->point_y;
1604
} else {
1605
emit_fragment_input(c, loc, var->data.location);
1606
}
1607
} else {
1608
emit_vertex_input(c, loc);
1609
}
1610
}
1611
}
1612
1613
static void
1614
ntq_setup_outputs(struct vc4_compile *c)
1615
{
1616
nir_foreach_shader_out_variable(var, c->s) {
1617
unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1618
unsigned loc = var->data.driver_location * 4;
1619
1620
assert(array_len == 1);
1621
(void)array_len;
1622
1623
for (int i = 0; i < 4; i++)
1624
add_output(c, loc + i, var->data.location, i);
1625
1626
if (c->stage == QSTAGE_FRAG) {
1627
switch (var->data.location) {
1628
case FRAG_RESULT_COLOR:
1629
case FRAG_RESULT_DATA0:
1630
c->output_color_index = loc;
1631
break;
1632
case FRAG_RESULT_DEPTH:
1633
c->output_position_index = loc;
1634
break;
1635
case FRAG_RESULT_SAMPLE_MASK:
1636
c->output_sample_mask_index = loc;
1637
break;
1638
}
1639
} else {
1640
switch (var->data.location) {
1641
case VARYING_SLOT_POS:
1642
c->output_position_index = loc;
1643
break;
1644
case VARYING_SLOT_PSIZ:
1645
c->output_point_size_index = loc;
1646
break;
1647
}
1648
}
1649
}
1650
}
1651
1652
/**
1653
* Sets up the mapping from nir_register to struct qreg *.
1654
*
1655
* Each nir_register gets a struct qreg per 32-bit component being stored.
1656
*/
1657
static void
1658
ntq_setup_registers(struct vc4_compile *c, struct exec_list *list)
1659
{
1660
foreach_list_typed(nir_register, nir_reg, node, list) {
1661
unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
1662
struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
1663
array_len *
1664
nir_reg->num_components);
1665
1666
_mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
1667
1668
for (int i = 0; i < array_len * nir_reg->num_components; i++)
1669
qregs[i] = qir_get_temp(c);
1670
}
1671
}
1672
1673
static void
1674
ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr)
1675
{
1676
struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1677
for (int i = 0; i < instr->def.num_components; i++)
1678
qregs[i] = qir_uniform_ui(c, instr->value[i].u32);
1679
1680
_mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
1681
}
1682
1683
static void
1684
ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr)
1685
{
1686
struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1687
1688
/* QIR needs there to be *some* value, so pick 0 (same as for
1689
* ntq_setup_registers().
1690
*/
1691
for (int i = 0; i < instr->def.num_components; i++)
1692
qregs[i] = qir_uniform_ui(c, 0);
1693
}
1694
1695
static void
1696
ntq_emit_color_read(struct vc4_compile *c, nir_intrinsic_instr *instr)
1697
{
1698
assert(nir_src_as_uint(instr->src[0]) == 0);
1699
1700
/* Reads of the per-sample color need to be done in
1701
* order.
1702
*/
1703
int sample_index = (nir_intrinsic_base(instr) -
1704
VC4_NIR_TLB_COLOR_READ_INPUT);
1705
for (int i = 0; i <= sample_index; i++) {
1706
if (c->color_reads[i].file == QFILE_NULL) {
1707
c->color_reads[i] =
1708
qir_TLB_COLOR_READ(c);
1709
}
1710
}
1711
ntq_store_dest(c, &instr->dest, 0,
1712
qir_MOV(c, c->color_reads[sample_index]));
1713
}
1714
1715
static void
1716
ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr)
1717
{
1718
assert(instr->num_components == 1);
1719
assert(nir_src_is_const(instr->src[0]) &&
1720
"vc4 doesn't support indirect inputs");
1721
1722
if (c->stage == QSTAGE_FRAG &&
1723
nir_intrinsic_base(instr) >= VC4_NIR_TLB_COLOR_READ_INPUT) {
1724
ntq_emit_color_read(c, instr);
1725
return;
1726
}
1727
1728
uint32_t offset = nir_intrinsic_base(instr) +
1729
nir_src_as_uint(instr->src[0]);
1730
int comp = nir_intrinsic_component(instr);
1731
ntq_store_dest(c, &instr->dest, 0,
1732
qir_MOV(c, c->inputs[offset * 4 + comp]));
1733
}
1734
1735
static void
1736
ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
1737
{
1738
unsigned offset;
1739
1740
switch (instr->intrinsic) {
1741
case nir_intrinsic_load_uniform:
1742
assert(instr->num_components == 1);
1743
if (nir_src_is_const(instr->src[0])) {
1744
offset = nir_intrinsic_base(instr) +
1745
nir_src_as_uint(instr->src[0]);
1746
assert(offset % 4 == 0);
1747
/* We need dwords */
1748
offset = offset / 4;
1749
ntq_store_dest(c, &instr->dest, 0,
1750
qir_uniform(c, QUNIFORM_UNIFORM,
1751
offset));
1752
} else {
1753
ntq_store_dest(c, &instr->dest, 0,
1754
indirect_uniform_load(c, instr));
1755
}
1756
break;
1757
1758
case nir_intrinsic_load_ubo:
1759
assert(instr->num_components == 1);
1760
ntq_store_dest(c, &instr->dest, 0, vc4_ubo_load(c, instr));
1761
break;
1762
1763
case nir_intrinsic_load_user_clip_plane:
1764
for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
1765
ntq_store_dest(c, &instr->dest, i,
1766
qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
1767
nir_intrinsic_ucp_id(instr) *
1768
4 + i));
1769
}
1770
break;
1771
1772
case nir_intrinsic_load_blend_const_color_r_float:
1773
case nir_intrinsic_load_blend_const_color_g_float:
1774
case nir_intrinsic_load_blend_const_color_b_float:
1775
case nir_intrinsic_load_blend_const_color_a_float:
1776
ntq_store_dest(c, &instr->dest, 0,
1777
qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_X +
1778
(instr->intrinsic -
1779
nir_intrinsic_load_blend_const_color_r_float),
1780
0));
1781
break;
1782
1783
case nir_intrinsic_load_blend_const_color_rgba8888_unorm:
1784
ntq_store_dest(c, &instr->dest, 0,
1785
qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_RGBA,
1786
0));
1787
break;
1788
1789
case nir_intrinsic_load_blend_const_color_aaaa8888_unorm:
1790
ntq_store_dest(c, &instr->dest, 0,
1791
qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_AAAA,
1792
0));
1793
break;
1794
1795
case nir_intrinsic_load_sample_mask_in:
1796
ntq_store_dest(c, &instr->dest, 0,
1797
qir_uniform(c, QUNIFORM_SAMPLE_MASK, 0));
1798
break;
1799
1800
case nir_intrinsic_load_front_face:
1801
/* The register contains 0 (front) or 1 (back), and we need to
1802
* turn it into a NIR bool where true means front.
1803
*/
1804
ntq_store_dest(c, &instr->dest, 0,
1805
qir_ADD(c,
1806
qir_uniform_ui(c, -1),
1807
qir_reg(QFILE_FRAG_REV_FLAG, 0)));
1808
break;
1809
1810
case nir_intrinsic_load_input:
1811
ntq_emit_load_input(c, instr);
1812
break;
1813
1814
case nir_intrinsic_store_output:
1815
assert(nir_src_is_const(instr->src[1]) &&
1816
"vc4 doesn't support indirect outputs");
1817
offset = nir_intrinsic_base(instr) +
1818
nir_src_as_uint(instr->src[1]);
1819
1820
/* MSAA color outputs are the only case where we have an
1821
* output that's not lowered to being a store of a single 32
1822
* bit value.
1823
*/
1824
if (c->stage == QSTAGE_FRAG && instr->num_components == 4) {
1825
assert(offset == c->output_color_index);
1826
for (int i = 0; i < 4; i++) {
1827
c->sample_colors[i] =
1828
qir_MOV(c, ntq_get_src(c, instr->src[0],
1829
i));
1830
}
1831
} else {
1832
offset = offset * 4 + nir_intrinsic_component(instr);
1833
assert(instr->num_components == 1);
1834
c->outputs[offset] =
1835
qir_MOV(c, ntq_get_src(c, instr->src[0], 0));
1836
c->num_outputs = MAX2(c->num_outputs, offset + 1);
1837
}
1838
break;
1839
1840
case nir_intrinsic_discard:
1841
if (c->execute.file != QFILE_NULL) {
1842
qir_SF(c, c->execute);
1843
qir_MOV_cond(c, QPU_COND_ZS, c->discard,
1844
qir_uniform_ui(c, ~0));
1845
} else {
1846
qir_MOV_dest(c, c->discard, qir_uniform_ui(c, ~0));
1847
}
1848
break;
1849
1850
case nir_intrinsic_discard_if: {
1851
/* true (~0) if we're discarding */
1852
struct qreg cond = ntq_get_src(c, instr->src[0], 0);
1853
1854
if (c->execute.file != QFILE_NULL) {
1855
/* execute == 0 means the channel is active. Invert
1856
* the condition so that we can use zero as "executing
1857
* and discarding."
1858
*/
1859
qir_SF(c, qir_AND(c, c->execute, qir_NOT(c, cond)));
1860
qir_MOV_cond(c, QPU_COND_ZS, c->discard, cond);
1861
} else {
1862
qir_OR_dest(c, c->discard, c->discard,
1863
ntq_get_src(c, instr->src[0], 0));
1864
}
1865
1866
break;
1867
}
1868
1869
case nir_intrinsic_load_texture_rect_scaling: {
1870
assert(nir_src_is_const(instr->src[0]));
1871
int sampler = nir_src_as_int(instr->src[0]);
1872
1873
ntq_store_dest(c, &instr->dest, 0,
1874
qir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, sampler));
1875
ntq_store_dest(c, &instr->dest, 1,
1876
qir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, sampler));
1877
break;
1878
}
1879
1880
default:
1881
fprintf(stderr, "Unknown intrinsic: ");
1882
nir_print_instr(&instr->instr, stderr);
1883
fprintf(stderr, "\n");
1884
break;
1885
}
1886
}
1887
1888
/* Clears (activates) the execute flags for any channels whose jump target
1889
* matches this block.
1890
*/
1891
static void
1892
ntq_activate_execute_for_block(struct vc4_compile *c)
1893
{
1894
qir_SF(c, qir_SUB(c,
1895
c->execute,
1896
qir_uniform_ui(c, c->cur_block->index)));
1897
qir_MOV_cond(c, QPU_COND_ZS, c->execute, qir_uniform_ui(c, 0));
1898
}

static void
ntq_emit_if(struct vc4_compile *c, nir_if *if_stmt)
{
        if (!c->vc4->screen->has_control_flow) {
                fprintf(stderr,
                        "IF statement support requires updated kernel.\n");
                return;
        }

        nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
        bool empty_else_block =
                (nir_else_block == nir_if_last_else_block(if_stmt) &&
                 exec_list_is_empty(&nir_else_block->instr_list));

        struct qblock *then_block = qir_new_block(c);
        struct qblock *after_block = qir_new_block(c);
        struct qblock *else_block;
        if (empty_else_block)
                else_block = after_block;
        else
                else_block = qir_new_block(c);

        bool was_top_level = false;
        if (c->execute.file == QFILE_NULL) {
                c->execute = qir_MOV(c, qir_uniform_ui(c, 0));
                was_top_level = true;
        }

        /* Set ZS for executing (execute == 0) and jumping (if->condition ==
         * 0) channels, and then update execute flags for those to point to
         * the ELSE block.
         */
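        /* OR(execute, condition) is zero only for channels that are both
         * active (execute == 0) and have a false (zero) condition, i.e.
         * exactly the channels that must be redirected to the ELSE block.
         */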
        qir_SF(c, qir_OR(c,
                         c->execute,
                         ntq_get_src(c, if_stmt->condition, 0)));
        qir_MOV_cond(c, QPU_COND_ZS, c->execute,
                     qir_uniform_ui(c, else_block->index));

        /* Jump to ELSE if nothing is active for THEN, otherwise fall
         * through.
         */
        qir_SF(c, c->execute);
        qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZC);
        qir_link_blocks(c->cur_block, else_block);
        qir_link_blocks(c->cur_block, then_block);

        /* Process the THEN block. */
        qir_set_emit_block(c, then_block);
        ntq_emit_cf_list(c, &if_stmt->then_list);

        if (!empty_else_block) {
                /* Handle the end of the THEN block. First, all currently
                 * active channels update their execute flags to point to
                 * ENDIF
                 */
                qir_SF(c, c->execute);
                qir_MOV_cond(c, QPU_COND_ZS, c->execute,
                             qir_uniform_ui(c, after_block->index));

                /* If everything points at ENDIF, then jump there immediately. */
                qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, after_block->index)));
                qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS);
                qir_link_blocks(c->cur_block, after_block);
                qir_link_blocks(c->cur_block, else_block);

                qir_set_emit_block(c, else_block);
                ntq_activate_execute_for_block(c);
                ntq_emit_cf_list(c, &if_stmt->else_list);
        }

        qir_link_blocks(c->cur_block, after_block);

        qir_set_emit_block(c, after_block);
        if (was_top_level) {
                c->execute = c->undef;
                c->last_top_block = c->cur_block;
        } else {
                ntq_activate_execute_for_block(c);
        }
}

static void
ntq_emit_jump(struct vc4_compile *c, nir_jump_instr *jump)
{
        struct qblock *jump_block;
        switch (jump->type) {
        case nir_jump_break:
                jump_block = c->loop_break_block;
                break;
        case nir_jump_continue:
                jump_block = c->loop_cont_block;
                break;
        default:
                unreachable("Unsupported jump type\n");
        }

        qir_SF(c, c->execute);
        qir_MOV_cond(c, QPU_COND_ZS, c->execute,
                     qir_uniform_ui(c, jump_block->index));

        /* Jump to the destination block if everyone has taken the jump. */
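        /* Otherwise we fall through into a fresh block, with the jumping
         * channels masked off by their nonzero execute values.
         */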
        qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, jump_block->index)));
        qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS);
        struct qblock *new_block = qir_new_block(c);
        qir_link_blocks(c->cur_block, jump_block);
        qir_link_blocks(c->cur_block, new_block);
        qir_set_emit_block(c, new_block);
}

static void
ntq_emit_instr(struct vc4_compile *c, nir_instr *instr)
{
        switch (instr->type) {
        case nir_instr_type_alu:
                ntq_emit_alu(c, nir_instr_as_alu(instr));
                break;

        case nir_instr_type_intrinsic:
                ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
                break;

        case nir_instr_type_load_const:
                ntq_emit_load_const(c, nir_instr_as_load_const(instr));
                break;

        case nir_instr_type_ssa_undef:
                ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
                break;

        case nir_instr_type_tex:
                ntq_emit_tex(c, nir_instr_as_tex(instr));
                break;

        case nir_instr_type_jump:
                ntq_emit_jump(c, nir_instr_as_jump(instr));
                break;

        default:
                fprintf(stderr, "Unknown NIR instr type: ");
                nir_print_instr(instr, stderr);
                fprintf(stderr, "\n");
                abort();
        }
}

static void
ntq_emit_block(struct vc4_compile *c, nir_block *block)
{
        nir_foreach_instr(instr, block) {
                ntq_emit_instr(c, instr);
        }
}

static void ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list);

static void
ntq_emit_loop(struct vc4_compile *c, nir_loop *loop)
{
        if (!c->vc4->screen->has_control_flow) {
                fprintf(stderr,
                        "loop support requires updated kernel.\n");
                ntq_emit_cf_list(c, &loop->body);
                return;
        }

        bool was_top_level = false;
        if (c->execute.file == QFILE_NULL) {
                c->execute = qir_MOV(c, qir_uniform_ui(c, 0));
                was_top_level = true;
        }

        struct qblock *save_loop_cont_block = c->loop_cont_block;
        struct qblock *save_loop_break_block = c->loop_break_block;

        c->loop_cont_block = qir_new_block(c);
        c->loop_break_block = qir_new_block(c);

        qir_link_blocks(c->cur_block, c->loop_cont_block);
        qir_set_emit_block(c, c->loop_cont_block);
        ntq_activate_execute_for_block(c);

        ntq_emit_cf_list(c, &loop->body);

        /* If anything had explicitly continued, or is here at the end of the
         * loop, then we need to loop again. SF updates are masked by the
         * instruction's condition, so we can do the OR of the two conditions
         * within SF.
         */
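        /* The first SF sets Z for channels that reached the end of the body
         * (execute == 0); the conditional SUB then updates the flags only
         * for the remaining channels, setting Z where execute equals the
         * continue block's index, so BRANCH_ANY_ZS loops again if either
         * group is non-empty.
         */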
        qir_SF(c, c->execute);
        struct qinst *cont_check =
                qir_SUB_dest(c,
                             c->undef,
                             c->execute,
                             qir_uniform_ui(c, c->loop_cont_block->index));
        cont_check->cond = QPU_COND_ZC;
        cont_check->sf = true;

        qir_BRANCH(c, QPU_COND_BRANCH_ANY_ZS);
        qir_link_blocks(c->cur_block, c->loop_cont_block);
        qir_link_blocks(c->cur_block, c->loop_break_block);

        qir_set_emit_block(c, c->loop_break_block);
        if (was_top_level) {
                c->execute = c->undef;
                c->last_top_block = c->cur_block;
        } else {
                ntq_activate_execute_for_block(c);
        }

        c->loop_break_block = save_loop_break_block;
        c->loop_cont_block = save_loop_cont_block;
}

static void
ntq_emit_function(struct vc4_compile *c, nir_function_impl *func)
{
        fprintf(stderr, "FUNCTIONS not handled.\n");
        abort();
}

static void
ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list)
{
        foreach_list_typed(nir_cf_node, node, node, list) {
                switch (node->type) {
                case nir_cf_node_block:
                        ntq_emit_block(c, nir_cf_node_as_block(node));
                        break;

                case nir_cf_node_if:
                        ntq_emit_if(c, nir_cf_node_as_if(node));
                        break;

                case nir_cf_node_loop:
                        ntq_emit_loop(c, nir_cf_node_as_loop(node));
                        break;

                case nir_cf_node_function:
                        ntq_emit_function(c, nir_cf_node_as_function(node));
                        break;

                default:
                        fprintf(stderr, "Unknown NIR node type\n");
                        abort();
                }
        }
}

static void
ntq_emit_impl(struct vc4_compile *c, nir_function_impl *impl)
{
        ntq_setup_registers(c, &impl->registers);
        ntq_emit_cf_list(c, &impl->body);
}

static void
nir_to_qir(struct vc4_compile *c)
{
        if (c->stage == QSTAGE_FRAG && c->s->info.fs.uses_discard)
                c->discard = qir_MOV(c, qir_uniform_ui(c, 0));

        ntq_setup_inputs(c);
        ntq_setup_outputs(c);

        /* Find the main function and emit the body. */
        nir_foreach_function(function, c->s) {
                assert(strcmp(function->name, "main") == 0);
                assert(function->impl);
                ntq_emit_impl(c, function->impl);
        }
}
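
/* Compiler options handed to NIR: they request lowering of operations the
 * QPU can't do directly (fdiv, fpow, fsqrt, ffma, rotates, etc.) into
 * sequences of ops the backend does implement, and scalarization, since QIR
 * is a scalar IR.
 */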
static const nir_shader_compiler_options nir_options = {
        .lower_all_io_to_temps = true,
        .lower_extract_byte = true,
        .lower_extract_word = true,
        .lower_insert_byte = true,
        .lower_insert_word = true,
        .lower_fdiv = true,
        .lower_ffma16 = true,
        .lower_ffma32 = true,
        .lower_ffma64 = true,
        .lower_flrp32 = true,
        .lower_fmod = true,
        .lower_fpow = true,
        .lower_fsat = true,
        .lower_fsqrt = true,
        .lower_ldexp = true,
        .lower_fneg = true,
        .lower_ineg = true,
        .lower_rotate = true,
        .lower_to_scalar = true,
        .lower_umax = true,
        .lower_umin = true,
        .lower_isign = true,
        .has_fsub = true,
        .has_isub = true,
        .max_unroll_iterations = 32,
};

const void *
vc4_screen_get_compiler_options(struct pipe_screen *pscreen,
                                enum pipe_shader_ir ir,
                                enum pipe_shader_type shader)
{
        return &nir_options;
}

static int
count_nir_instrs(nir_shader *nir)
{
        int count = 0;
        nir_foreach_function(function, nir) {
                if (!function->impl)
                        continue;
                nir_foreach_block(block, function->impl) {
                        nir_foreach_instr(instr, block)
                                count++;
                }
        }
        return count;
}

static struct vc4_compile *
vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
               struct vc4_key *key, bool fs_threaded)
{
        struct vc4_compile *c = qir_compile_init();

        c->vc4 = vc4;
        c->stage = stage;
        c->shader_state = &key->shader_state->base;
        c->program_id = key->shader_state->program_id;
        c->variant_id =
                p_atomic_inc_return(&key->shader_state->compiled_variant_count);
        c->fs_threaded = fs_threaded;

        c->key = key;
        switch (stage) {
        case QSTAGE_FRAG:
                c->fs_key = (struct vc4_fs_key *)key;
                if (c->fs_key->is_points) {
                        c->point_x = emit_fragment_varying(c, ~0, 0);
                        c->point_y = emit_fragment_varying(c, ~0, 0);
                } else if (c->fs_key->is_lines) {
                        c->line_x = emit_fragment_varying(c, ~0, 0);
                }
                break;
        case QSTAGE_VERT:
                c->vs_key = (struct vc4_vs_key *)key;
                break;
        case QSTAGE_COORD:
                c->vs_key = (struct vc4_vs_key *)key;
                break;
        }

        c->s = nir_shader_clone(c, key->shader_state->base.ir.nir);

        if (stage == QSTAGE_FRAG) {
                NIR_PASS_V(c->s, vc4_nir_lower_blend, c);
        }

        struct nir_lower_tex_options tex_options = {
                .lower_txp = ~0,

                /* Apply swizzles to all samplers. */
                .swizzle_result = ~0,
        };

        /* Lower the format swizzle and ARB_texture_swizzle-style swizzle.
         * The format swizzling applies before sRGB decode, and
         * ARB_texture_swizzle is the last thing before returning the sample.
         */
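        /* Compose the two here: an ARB-style swizzle selecting R/G/B/A
         * (values 0-3) is remapped through the format's storage swizzle,
         * while constant zero/one selections (values > 3) pass through
         * unchanged.
         */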
        for (int i = 0; i < ARRAY_SIZE(key->tex); i++) {
                enum pipe_format format = c->key->tex[i].format;

                if (!format)
                        continue;

                const uint8_t *format_swizzle = vc4_get_format_swizzle(format);

                for (int j = 0; j < 4; j++) {
                        uint8_t arb_swiz = c->key->tex[i].swizzle[j];

                        if (arb_swiz <= 3) {
                                tex_options.swizzles[i][j] =
                                        format_swizzle[arb_swiz];
                        } else {
                                tex_options.swizzles[i][j] = arb_swiz;
                        }
                }

                if (util_format_is_srgb(format))
                        tex_options.lower_srgb |= (1 << i);
        }

        NIR_PASS_V(c->s, nir_lower_tex, &tex_options);

        if (c->key->ucp_enables) {
                if (stage == QSTAGE_FRAG) {
                        NIR_PASS_V(c->s, nir_lower_clip_fs,
                                   c->key->ucp_enables, false);
                } else {
                        NIR_PASS_V(c->s, nir_lower_clip_vs,
                                   c->key->ucp_enables, false, false, NULL);
                        NIR_PASS_V(c->s, nir_lower_io_to_scalar,
                                   nir_var_shader_out);
                }
        }

        /* FS input scalarizing must happen after nir_lower_two_sided_color,
         * which only handles a vec4 at a time. Similarly, VS output
         * scalarizing must happen after nir_lower_clip_vs.
         */
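        /* nir_lower_io_to_scalar splits vec4 I/O intrinsics into
         * per-component ones, which is why its ordering against the
         * vec4-based lowering passes above matters.
         */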
        if (c->stage == QSTAGE_FRAG)
                NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
        else
                NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);

        NIR_PASS_V(c->s, vc4_nir_lower_io, c);
        NIR_PASS_V(c->s, vc4_nir_lower_txf_ms, c);
        nir_lower_idiv_options idiv_options = {
                .imprecise_32bit_lowering = true,
                .allow_fp16 = true,
        };
        NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options);

        vc4_optimize_nir(c->s);

        /* Do late algebraic optimization to turn add(a, neg(b)) back into
         * subs, then the mandatory cleanup after algebraic. Note that it may
         * produce fnegs, and if so then we need to keep running to squash
         * fneg(fneg(a)).
         */
        bool more_late_algebraic = true;
        while (more_late_algebraic) {
                more_late_algebraic = false;
                NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
                NIR_PASS_V(c->s, nir_opt_constant_folding);
                NIR_PASS_V(c->s, nir_copy_prop);
                NIR_PASS_V(c->s, nir_opt_dce);
                NIR_PASS_V(c->s, nir_opt_cse);
        }

        NIR_PASS_V(c->s, nir_lower_bool_to_int32);

        NIR_PASS_V(c->s, nir_convert_from_ssa, true);

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d NIR instructions\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        count_nir_instrs(c->s));
        }

        if (vc4_debug & VC4_DEBUG_NIR) {
                fprintf(stderr, "%s prog %d/%d NIR:\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id);
                nir_print_shader(c->s, stderr);
        }

        nir_to_qir(c);

        switch (stage) {
        case QSTAGE_FRAG:
                /* FS threading requires that the thread execute
                 * QPU_SIG_LAST_THREAD_SWITCH exactly once before terminating
                 * (with no other THRSW afterwards, obviously). If we didn't
                 * fetch a texture at a top level block, this wouldn't be
                 * true.
                 */
                if (c->fs_threaded && !c->last_thrsw_at_top_level) {
                        c->failed = true;
                        return c;
                }

                emit_frag_end(c);
                break;
        case QSTAGE_VERT:
                emit_vert_end(c,
                              c->vs_key->fs_inputs->input_slots,
                              c->vs_key->fs_inputs->num_inputs);
                break;
        case QSTAGE_COORD:
                emit_coord_end(c);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QIR) {
                fprintf(stderr, "%s prog %d/%d pre-opt QIR:\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id);
                qir_dump(c);
                fprintf(stderr, "\n");
        }

        qir_optimize(c);
        qir_lower_uniforms(c);

        qir_schedule_instructions(c);
        qir_emit_uniform_stream_resets(c);

        if (vc4_debug & VC4_DEBUG_QIR) {
                fprintf(stderr, "%s prog %d/%d QIR:\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id);
                qir_dump(c);
                fprintf(stderr, "\n");
        }

        qir_reorder_uniforms(c);
        vc4_generate_code(vc4, c);

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d instructions\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        c->qpu_inst_count);
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d uniforms\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        c->num_uniforms);
        }

        ralloc_free(c->s);

        return c;
}

static void *
vc4_shader_state_create(struct pipe_context *pctx,
                        const struct pipe_shader_state *cso)
{
        struct vc4_context *vc4 = vc4_context(pctx);
        struct vc4_uncompiled_shader *so = CALLOC_STRUCT(vc4_uncompiled_shader);
        if (!so)
                return NULL;

        so->program_id = vc4->next_uncompiled_program_id++;

        nir_shader *s;

        if (cso->type == PIPE_SHADER_IR_NIR) {
                /* The backend takes ownership of the NIR shader on state
                 * creation.
                 */
                s = cso->ir.nir;
        } else {
                assert(cso->type == PIPE_SHADER_IR_TGSI);

                if (vc4_debug & VC4_DEBUG_TGSI) {
                        fprintf(stderr, "prog %d TGSI:\n",
                                so->program_id);
                        tgsi_dump(cso->tokens, 0);
                        fprintf(stderr, "\n");
                }
                s = tgsi_to_nir(cso->tokens, pctx->screen, false);
        }

        if (s->info.stage == MESA_SHADER_VERTEX)
                NIR_PASS_V(s, nir_lower_point_size, 1.0f, 0.0f);

        NIR_PASS_V(s, nir_lower_io,
                   nir_var_shader_in | nir_var_shader_out | nir_var_uniform,
                   type_size, (nir_lower_io_options)0);

        NIR_PASS_V(s, nir_lower_regs_to_ssa);
        NIR_PASS_V(s, nir_normalize_cubemap_coords);

        NIR_PASS_V(s, nir_lower_load_const_to_scalar);

        vc4_optimize_nir(s);

        NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);

        /* Garbage collect dead instructions */
        nir_sweep(s);

        so->base.type = PIPE_SHADER_IR_NIR;
        so->base.ir.nir = s;

        if (vc4_debug & VC4_DEBUG_NIR) {
                fprintf(stderr, "%s prog %d NIR:\n",
                        gl_shader_stage_name(s->info.stage),
                        so->program_id);
                nir_print_shader(s, stderr);
                fprintf(stderr, "\n");
        }

        return so;
}

static void
copy_uniform_state_to_shader(struct vc4_compiled_shader *shader,
                             struct vc4_compile *c)
{
        int count = c->num_uniforms;
        struct vc4_shader_uniform_info *uinfo = &shader->uniforms;

        uinfo->count = count;
        uinfo->data = ralloc_array(shader, uint32_t, count);
        memcpy(uinfo->data, c->uniform_data,
               count * sizeof(*uinfo->data));
        uinfo->contents = ralloc_array(shader, enum quniform_contents, count);
        memcpy(uinfo->contents, c->uniform_contents,
               count * sizeof(*uinfo->contents));
        uinfo->num_texture_samples = c->num_texture_samples;

        vc4_set_shader_uniform_dirty_flags(shader);
}

static void
vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c,
                             struct vc4_compiled_shader *shader)
{
        struct vc4_fs_inputs inputs;

        memset(&inputs, 0, sizeof(inputs));
        inputs.input_slots = ralloc_array(shader,
                                          struct vc4_varying_slot,
                                          c->num_input_slots);

        bool input_live[c->num_input_slots];

        memset(input_live, 0, sizeof(input_live));
        qir_for_each_inst_inorder(inst, c) {
                for (int i = 0; i < qir_get_nsrc(inst); i++) {
                        if (inst->src[i].file == QFILE_VARY)
                                input_live[inst->src[i].index] = true;
                }
        }

        for (int i = 0; i < c->num_input_slots; i++) {
                struct vc4_varying_slot *slot = &c->input_slots[i];

                if (!input_live[i])
                        continue;

                /* Skip non-VS-output inputs. */
                if (slot->slot == (uint8_t)~0)
                        continue;

                if (slot->slot == VARYING_SLOT_COL0 ||
                    slot->slot == VARYING_SLOT_COL1 ||
                    slot->slot == VARYING_SLOT_BFC0 ||
                    slot->slot == VARYING_SLOT_BFC1) {
                        shader->color_inputs |= (1 << inputs.num_inputs);
                }

                inputs.input_slots[inputs.num_inputs] = *slot;
                inputs.num_inputs++;
        }
        shader->num_inputs = inputs.num_inputs;

        /* Add our set of inputs to the set of all inputs seen. This way, we
         * can have a single pointer that identifies an FS inputs set,
         * allowing VS to avoid recompiling when the FS is recompiled (or a
         * new one is bound using separate shader objects) but the inputs
         * don't change.
         */
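        /* Identity is by content: fs_inputs_hash()/fs_inputs_compare() below
         * hash and compare the slot array, so FS variants with the same set
         * of live inputs share one vc4_fs_inputs pointer and the VS key only
         * has to store that pointer.
         */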
        struct set_entry *entry = _mesa_set_search(vc4->fs_inputs_set, &inputs);
        if (entry) {
                shader->fs_inputs = entry->key;
                ralloc_free(inputs.input_slots);
        } else {
                struct vc4_fs_inputs *alloc_inputs;

                alloc_inputs = rzalloc(vc4->fs_inputs_set, struct vc4_fs_inputs);
                memcpy(alloc_inputs, &inputs, sizeof(inputs));
                ralloc_steal(alloc_inputs, inputs.input_slots);
                _mesa_set_add(vc4->fs_inputs_set, alloc_inputs);

                shader->fs_inputs = alloc_inputs;
        }
}

static struct vc4_compiled_shader *
vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
                        struct vc4_key *key)
{
        struct hash_table *ht;
        uint32_t key_size;
        bool try_threading;

        if (stage == QSTAGE_FRAG) {
                ht = vc4->fs_cache;
                key_size = sizeof(struct vc4_fs_key);
                try_threading = vc4->screen->has_threaded_fs;
        } else {
                ht = vc4->vs_cache;
                key_size = sizeof(struct vc4_vs_key);
                try_threading = false;
        }

        struct vc4_compiled_shader *shader;
        struct hash_entry *entry = _mesa_hash_table_search(ht, key);
        if (entry)
                return entry->data;

        struct vc4_compile *c = vc4_shader_ntq(vc4, stage, key, try_threading);
        /* If the FS failed to compile threaded, fall back to single threaded. */
        if (try_threading && c->failed) {
                qir_compile_destroy(c);
                c = vc4_shader_ntq(vc4, stage, key, false);
        }

        shader = rzalloc(NULL, struct vc4_compiled_shader);

        shader->program_id = vc4->next_compiled_program_id++;
        if (stage == QSTAGE_FRAG) {
                vc4_setup_compiled_fs_inputs(vc4, c, shader);

                /* Note: the temporary clone in c->s has been freed. */
                nir_shader *orig_shader = key->shader_state->base.ir.nir;
                if (orig_shader->info.outputs_written & (1 << FRAG_RESULT_DEPTH))
                        shader->disable_early_z = true;
        } else {
                shader->num_inputs = c->num_inputs;

                shader->vattr_offsets[0] = 0;
                for (int i = 0; i < 8; i++) {
                        shader->vattr_offsets[i + 1] =
                                shader->vattr_offsets[i] + c->vattr_sizes[i];

                        if (c->vattr_sizes[i])
                                shader->vattrs_live |= (1 << i);
                }
        }

        shader->failed = c->failed;
        if (c->failed) {
                shader->failed = true;
        } else {
                copy_uniform_state_to_shader(shader, c);
                shader->bo = vc4_bo_alloc_shader(vc4->screen, c->qpu_insts,
                                                 c->qpu_inst_count *
                                                 sizeof(uint64_t));
        }

        shader->fs_threaded = c->fs_threaded;

        if ((vc4_debug & VC4_DEBUG_SHADERDB) && stage == QSTAGE_FRAG) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d FS threads\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        1 + shader->fs_threaded);
        }

        qir_compile_destroy(c);

        struct vc4_key *dup_key;
        dup_key = rzalloc_size(shader, key_size); /* TODO: don't use rzalloc */
        memcpy(dup_key, key, key_size);
        _mesa_hash_table_insert(ht, dup_key, shader);

        return shader;
}

static void
vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key,
                     struct vc4_texture_stateobj *texstate)
{
        for (int i = 0; i < texstate->num_textures; i++) {
                struct pipe_sampler_view *sampler = texstate->textures[i];
                struct vc4_sampler_view *vc4_sampler = vc4_sampler_view(sampler);
                struct pipe_sampler_state *sampler_state =
                        texstate->samplers[i];

                if (!sampler)
                        continue;

                key->tex[i].format = sampler->format;
                key->tex[i].swizzle[0] = sampler->swizzle_r;
                key->tex[i].swizzle[1] = sampler->swizzle_g;
                key->tex[i].swizzle[2] = sampler->swizzle_b;
                key->tex[i].swizzle[3] = sampler->swizzle_a;

                if (sampler->texture->nr_samples > 1) {
                        key->tex[i].msaa_width = sampler->texture->width0;
                        key->tex[i].msaa_height = sampler->texture->height0;
                } else if (sampler) {
                        key->tex[i].compare_mode = sampler_state->compare_mode;
                        key->tex[i].compare_func = sampler_state->compare_func;
                        key->tex[i].wrap_s = sampler_state->wrap_s;
                        key->tex[i].wrap_t = sampler_state->wrap_t;
                        key->tex[i].force_first_level =
                                vc4_sampler->force_first_level;
                }
        }

        key->ucp_enables = vc4->rasterizer->base.clip_plane_enable;
}

static void
vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode)
{
        struct vc4_job *job = vc4->job;
        struct vc4_fs_key local_key;
        struct vc4_fs_key *key = &local_key;

        if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE |
                            VC4_DIRTY_BLEND |
                            VC4_DIRTY_FRAMEBUFFER |
                            VC4_DIRTY_ZSA |
                            VC4_DIRTY_RASTERIZER |
                            VC4_DIRTY_SAMPLE_MASK |
                            VC4_DIRTY_FRAGTEX |
                            VC4_DIRTY_UNCOMPILED_FS |
                            VC4_DIRTY_UBO_1_SIZE))) {
                return;
        }

        memset(key, 0, sizeof(*key));
        vc4_setup_shared_key(vc4, &key->base, &vc4->fragtex);
        key->base.shader_state = vc4->prog.bind_fs;
        key->is_points = (prim_mode == PIPE_PRIM_POINTS);
        key->is_lines = (prim_mode >= PIPE_PRIM_LINES &&
                         prim_mode <= PIPE_PRIM_LINE_STRIP);
        key->blend = vc4->blend->rt[0];
        if (vc4->blend->logicop_enable) {
                key->logicop_func = vc4->blend->logicop_func;
        } else {
                key->logicop_func = PIPE_LOGICOP_COPY;
        }
        if (job->msaa) {
                key->msaa = vc4->rasterizer->base.multisample;
                key->sample_coverage = (vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1);
                key->sample_alpha_to_coverage = vc4->blend->alpha_to_coverage;
                key->sample_alpha_to_one = vc4->blend->alpha_to_one;
        }

        if (vc4->framebuffer.cbufs[0])
                key->color_format = vc4->framebuffer.cbufs[0]->format;

        key->stencil_enabled = vc4->zsa->stencil_uniforms[0] != 0;
        key->stencil_twoside = vc4->zsa->stencil_uniforms[1] != 0;
        key->stencil_full_writemasks = vc4->zsa->stencil_uniforms[2] != 0;
        key->depth_enabled = (vc4->zsa->base.depth_enabled ||
                              key->stencil_enabled);

        if (key->is_points) {
                key->point_sprite_mask =
                        vc4->rasterizer->base.sprite_coord_enable;
                key->point_coord_upper_left =
                        (vc4->rasterizer->base.sprite_coord_mode ==
                         PIPE_SPRITE_COORD_UPPER_LEFT);
        }

        key->ubo_1_size = vc4->constbuf[PIPE_SHADER_FRAGMENT].cb[1].buffer_size;

        struct vc4_compiled_shader *old_fs = vc4->prog.fs;
        vc4->prog.fs = vc4_get_compiled_shader(vc4, QSTAGE_FRAG, &key->base);
        if (vc4->prog.fs == old_fs)
                return;

        vc4->dirty |= VC4_DIRTY_COMPILED_FS;

        if (vc4->rasterizer->base.flatshade &&
            (!old_fs || vc4->prog.fs->color_inputs != old_fs->color_inputs)) {
                vc4->dirty |= VC4_DIRTY_FLAT_SHADE_FLAGS;
        }

        if (!old_fs || vc4->prog.fs->fs_inputs != old_fs->fs_inputs)
                vc4->dirty |= VC4_DIRTY_FS_INPUTS;
}

static void
vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode)
{
        struct vc4_vs_key local_key;
        struct vc4_vs_key *key = &local_key;

        if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE |
                            VC4_DIRTY_RASTERIZER |
                            VC4_DIRTY_VERTTEX |
                            VC4_DIRTY_VTXSTATE |
                            VC4_DIRTY_UNCOMPILED_VS |
                            VC4_DIRTY_FS_INPUTS))) {
                return;
        }

        memset(key, 0, sizeof(*key));
        vc4_setup_shared_key(vc4, &key->base, &vc4->verttex);
        key->base.shader_state = vc4->prog.bind_vs;
        key->fs_inputs = vc4->prog.fs->fs_inputs;

        for (int i = 0; i < ARRAY_SIZE(key->attr_formats); i++)
                key->attr_formats[i] = vc4->vtx->pipe[i].src_format;

        key->per_vertex_point_size =
                (prim_mode == PIPE_PRIM_POINTS &&
                 vc4->rasterizer->base.point_size_per_vertex);

        struct vc4_compiled_shader *vs =
                vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base);
        if (vs != vc4->prog.vs) {
                vc4->prog.vs = vs;
                vc4->dirty |= VC4_DIRTY_COMPILED_VS;
        }

        key->is_coord = true;
        /* Coord shaders don't care what the FS inputs are. */
        key->fs_inputs = NULL;
        struct vc4_compiled_shader *cs =
                vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base);
        if (cs != vc4->prog.cs) {
                vc4->prog.cs = cs;
                vc4->dirty |= VC4_DIRTY_COMPILED_CS;
        }
}

bool
vc4_update_compiled_shaders(struct vc4_context *vc4, uint8_t prim_mode)
{
        vc4_update_compiled_fs(vc4, prim_mode);
        vc4_update_compiled_vs(vc4, prim_mode);

        return !(vc4->prog.cs->failed ||
                 vc4->prog.vs->failed ||
                 vc4->prog.fs->failed);
}
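
/* The shader caches hash and memcmp() the whole key struct, including any
 * padding, so callers must fully zero their keys (see the memset() in
 * vc4_update_compiled_fs()/vs() above) before filling them in.
 */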
static uint32_t
fs_cache_hash(const void *key)
{
        return _mesa_hash_data(key, sizeof(struct vc4_fs_key));
}

static uint32_t
vs_cache_hash(const void *key)
{
        return _mesa_hash_data(key, sizeof(struct vc4_vs_key));
}

static bool
fs_cache_compare(const void *key1, const void *key2)
{
        return memcmp(key1, key2, sizeof(struct vc4_fs_key)) == 0;
}

static bool
vs_cache_compare(const void *key1, const void *key2)
{
        return memcmp(key1, key2, sizeof(struct vc4_vs_key)) == 0;
}

static uint32_t
fs_inputs_hash(const void *key)
{
        const struct vc4_fs_inputs *inputs = key;

        return _mesa_hash_data(inputs->input_slots,
                               sizeof(*inputs->input_slots) *
                               inputs->num_inputs);
}

static bool
fs_inputs_compare(const void *key1, const void *key2)
{
        const struct vc4_fs_inputs *inputs1 = key1;
        const struct vc4_fs_inputs *inputs2 = key2;

        return (inputs1->num_inputs == inputs2->num_inputs &&
                memcmp(inputs1->input_slots,
                       inputs2->input_slots,
                       sizeof(*inputs1->input_slots) *
                       inputs1->num_inputs) == 0);
}

static void
delete_from_cache_if_matches(struct hash_table *ht,
                             struct vc4_compiled_shader **last_compile,
                             struct hash_entry *entry,
                             struct vc4_uncompiled_shader *so)
{
        const struct vc4_key *key = entry->key;

        if (key->shader_state == so) {
                struct vc4_compiled_shader *shader = entry->data;
                _mesa_hash_table_remove(ht, entry);
                vc4_bo_unreference(&shader->bo);

                if (shader == *last_compile)
                        *last_compile = NULL;

                ralloc_free(shader);
        }
}
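
/* Cache entries are removed while iterating below; this relies on Mesa's
 * util hash table marking removed entries as deleted rather than rehashing,
 * so hash_table_foreach() remains valid across the removal.
 */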
static void
vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso)
{
        struct vc4_context *vc4 = vc4_context(pctx);
        struct vc4_uncompiled_shader *so = hwcso;

        hash_table_foreach(vc4->fs_cache, entry) {
                delete_from_cache_if_matches(vc4->fs_cache, &vc4->prog.fs,
                                             entry, so);
        }
        hash_table_foreach(vc4->vs_cache, entry) {
                delete_from_cache_if_matches(vc4->vs_cache, &vc4->prog.vs,
                                             entry, so);
        }

        ralloc_free(so->base.ir.nir);
        free(so);
}

static void
vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso)
{
        struct vc4_context *vc4 = vc4_context(pctx);
        vc4->prog.bind_fs = hwcso;
        vc4->dirty |= VC4_DIRTY_UNCOMPILED_FS;
}

static void
vc4_vp_state_bind(struct pipe_context *pctx, void *hwcso)
{
        struct vc4_context *vc4 = vc4_context(pctx);
        vc4->prog.bind_vs = hwcso;
        vc4->dirty |= VC4_DIRTY_UNCOMPILED_VS;
}

void
vc4_program_init(struct pipe_context *pctx)
{
        struct vc4_context *vc4 = vc4_context(pctx);

        pctx->create_vs_state = vc4_shader_state_create;
        pctx->delete_vs_state = vc4_shader_state_delete;

        pctx->create_fs_state = vc4_shader_state_create;
        pctx->delete_fs_state = vc4_shader_state_delete;

        pctx->bind_fs_state = vc4_fp_state_bind;
        pctx->bind_vs_state = vc4_vp_state_bind;

        vc4->fs_cache = _mesa_hash_table_create(pctx, fs_cache_hash,
                                                fs_cache_compare);
        vc4->vs_cache = _mesa_hash_table_create(pctx, vs_cache_hash,
                                                vs_cache_compare);
        vc4->fs_inputs_set = _mesa_set_create(pctx, fs_inputs_hash,
                                              fs_inputs_compare);
}

void
vc4_program_fini(struct pipe_context *pctx)
{
        struct vc4_context *vc4 = vc4_context(pctx);

        hash_table_foreach(vc4->fs_cache, entry) {
                struct vc4_compiled_shader *shader = entry->data;
                vc4_bo_unreference(&shader->bo);
                ralloc_free(shader);
                _mesa_hash_table_remove(vc4->fs_cache, entry);
        }

        hash_table_foreach(vc4->vs_cache, entry) {
                struct vc4_compiled_shader *shader = entry->data;
                vc4_bo_unreference(&shader->bo);
                ralloc_free(shader);
                _mesa_hash_table_remove(vc4->vs_cache, entry);
        }
}