Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/vc4/vc4_program.c
4570 views
1
/*
2
* Copyright (c) 2014 Scott Mansell
3
* Copyright © 2014 Broadcom
4
*
5
* Permission is hereby granted, free of charge, to any person obtaining a
6
* copy of this software and associated documentation files (the "Software"),
7
* to deal in the Software without restriction, including without limitation
8
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
* and/or sell copies of the Software, and to permit persons to whom the
10
* Software is furnished to do so, subject to the following conditions:
11
*
12
* The above copyright notice and this permission notice (including the next
13
* paragraph) shall be included in all copies or substantial portions of the
14
* Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22
* IN THE SOFTWARE.
23
*/
24
25
#include <inttypes.h>
26
#include "util/format/u_format.h"
27
#include "util/crc32.h"
28
#include "util/u_helpers.h"
29
#include "util/u_math.h"
30
#include "util/u_memory.h"
31
#include "util/ralloc.h"
32
#include "util/hash_table.h"
33
#include "tgsi/tgsi_dump.h"
34
#include "tgsi/tgsi_parse.h"
35
#include "compiler/nir/nir.h"
36
#include "compiler/nir/nir_builder.h"
37
#include "compiler/nir_types.h"
38
#include "nir/tgsi_to_nir.h"
39
#include "vc4_context.h"
40
#include "vc4_qpu.h"
41
#include "vc4_qir.h"
42
43
static struct qreg
44
ntq_get_src(struct vc4_compile *c, nir_src src, int i);
45
static void
46
ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list);
47
48
static int
49
type_size(const struct glsl_type *type, bool bindless)
50
{
51
return glsl_count_attribute_slots(type, false);
52
}
53
54
static void
55
resize_qreg_array(struct vc4_compile *c,
56
struct qreg **regs,
57
uint32_t *size,
58
uint32_t decl_size)
59
{
60
if (*size >= decl_size)
61
return;
62
63
uint32_t old_size = *size;
64
*size = MAX2(*size * 2, decl_size);
65
*regs = reralloc(c, *regs, struct qreg, *size);
66
if (!*regs) {
67
fprintf(stderr, "Malloc failure\n");
68
abort();
69
}
70
71
for (uint32_t i = old_size; i < *size; i++)
72
(*regs)[i] = c->undef;
73
}
74
75
static void
76
ntq_emit_thrsw(struct vc4_compile *c)
77
{
78
if (!c->fs_threaded)
79
return;
80
81
/* Always thread switch after each texture operation for now.
82
*
83
* We could do better by batching a bunch of texture fetches up and
84
* then doing one thread switch and collecting all their results
85
* afterward.
86
*/
87
qir_emit_nondef(c, qir_inst(QOP_THRSW, c->undef,
88
c->undef, c->undef));
89
c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
90
}
91
92
static struct qreg
93
indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
94
{
95
struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);
96
97
/* Clamp to [0, array size). Note that MIN/MAX are signed. */
98
uint32_t range = nir_intrinsic_range(intr);
99
indirect_offset = qir_MAX(c, indirect_offset, qir_uniform_ui(c, 0));
100
indirect_offset = qir_MIN_NOIMM(c, indirect_offset,
101
qir_uniform_ui(c, range - 4));
102
103
qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
104
indirect_offset,
105
qir_uniform(c, QUNIFORM_UBO0_ADDR,
106
nir_intrinsic_base(intr)));
107
108
c->num_texture_samples++;
109
110
ntq_emit_thrsw(c);
111
112
return qir_TEX_RESULT(c);
113
}
114
115
static struct qreg
116
vc4_ubo_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
117
{
118
ASSERTED int buffer_index = nir_src_as_uint(intr->src[0]);
119
assert(buffer_index == 1);
120
assert(c->stage == QSTAGE_FRAG);
121
122
struct qreg offset = ntq_get_src(c, intr->src[1], 0);
123
124
/* Clamp to [0, array size). Note that MIN/MAX are signed. */
125
offset = qir_MAX(c, offset, qir_uniform_ui(c, 0));
126
offset = qir_MIN_NOIMM(c, offset,
127
qir_uniform_ui(c, c->fs_key->ubo_1_size - 4));
128
129
qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
130
offset,
131
qir_uniform(c, QUNIFORM_UBO1_ADDR, 0));
132
133
c->num_texture_samples++;
134
135
ntq_emit_thrsw(c);
136
137
return qir_TEX_RESULT(c);
138
}
139
140
nir_ssa_def *
141
vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz)
142
{
143
switch (swiz) {
144
default:
145
case PIPE_SWIZZLE_NONE:
146
fprintf(stderr, "warning: unknown swizzle\n");
147
FALLTHROUGH;
148
case PIPE_SWIZZLE_0:
149
return nir_imm_float(b, 0.0);
150
case PIPE_SWIZZLE_1:
151
return nir_imm_float(b, 1.0);
152
case PIPE_SWIZZLE_X:
153
case PIPE_SWIZZLE_Y:
154
case PIPE_SWIZZLE_Z:
155
case PIPE_SWIZZLE_W:
156
return srcs[swiz];
157
}
158
}
159
160
static struct qreg *
161
ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def)
162
{
163
struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
164
def->num_components);
165
_mesa_hash_table_insert(c->def_ht, def, qregs);
166
return qregs;
167
}
168
169
/**
170
* This function is responsible for getting QIR results into the associated
171
* storage for a NIR instruction.
172
*
173
* If it's a NIR SSA def, then we just set the associated hash table entry to
174
* the new result.
175
*
176
* If it's a NIR reg, then we need to update the existing qreg assigned to the
177
* NIR destination with the incoming value. To do that without introducing
178
* new MOVs, we require that the incoming qreg either be a uniform, or be
179
* SSA-defined by the previous QIR instruction in the block and rewritable by
180
* this function. That lets us sneak ahead and insert the SF flag beforehand
181
* (knowing that the previous instruction doesn't depend on flags) and rewrite
182
* its destination to be the NIR reg's destination
183
*/
184
static void
185
ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan,
186
struct qreg result)
187
{
188
struct qinst *last_inst = NULL;
189
if (!list_is_empty(&c->cur_block->instructions))
190
last_inst = (struct qinst *)c->cur_block->instructions.prev;
191
192
assert(result.file == QFILE_UNIF ||
193
(result.file == QFILE_TEMP &&
194
last_inst && last_inst == c->defs[result.index]));
195
196
if (dest->is_ssa) {
197
assert(chan < dest->ssa.num_components);
198
199
struct qreg *qregs;
200
struct hash_entry *entry =
201
_mesa_hash_table_search(c->def_ht, &dest->ssa);
202
203
if (entry)
204
qregs = entry->data;
205
else
206
qregs = ntq_init_ssa_def(c, &dest->ssa);
207
208
qregs[chan] = result;
209
} else {
210
nir_register *reg = dest->reg.reg;
211
assert(dest->reg.base_offset == 0);
212
assert(reg->num_array_elems == 0);
213
struct hash_entry *entry =
214
_mesa_hash_table_search(c->def_ht, reg);
215
struct qreg *qregs = entry->data;
216
217
/* Insert a MOV if the source wasn't an SSA def in the
218
* previous instruction.
219
*/
220
if (result.file == QFILE_UNIF) {
221
result = qir_MOV(c, result);
222
last_inst = c->defs[result.index];
223
}
224
225
/* We know they're both temps, so just rewrite index. */
226
c->defs[last_inst->dst.index] = NULL;
227
last_inst->dst.index = qregs[chan].index;
228
229
/* If we're in control flow, then make this update of the reg
230
* conditional on the execution mask.
231
*/
232
if (c->execute.file != QFILE_NULL) {
233
last_inst->dst.index = qregs[chan].index;
234
235
/* Set the flags to the current exec mask. To insert
236
* the SF, we temporarily remove our SSA instruction.
237
*/
238
list_del(&last_inst->link);
239
qir_SF(c, c->execute);
240
list_addtail(&last_inst->link,
241
&c->cur_block->instructions);
242
243
last_inst->cond = QPU_COND_ZS;
244
last_inst->cond_is_exec_mask = true;
245
}
246
}
247
}
248
249
static struct qreg *
250
ntq_get_dest(struct vc4_compile *c, nir_dest *dest)
251
{
252
if (dest->is_ssa) {
253
struct qreg *qregs = ntq_init_ssa_def(c, &dest->ssa);
254
for (int i = 0; i < dest->ssa.num_components; i++)
255
qregs[i] = c->undef;
256
return qregs;
257
} else {
258
nir_register *reg = dest->reg.reg;
259
assert(dest->reg.base_offset == 0);
260
assert(reg->num_array_elems == 0);
261
struct hash_entry *entry =
262
_mesa_hash_table_search(c->def_ht, reg);
263
return entry->data;
264
}
265
}
266
267
static struct qreg
268
ntq_get_src(struct vc4_compile *c, nir_src src, int i)
269
{
270
struct hash_entry *entry;
271
if (src.is_ssa) {
272
entry = _mesa_hash_table_search(c->def_ht, src.ssa);
273
assert(i < src.ssa->num_components);
274
} else {
275
nir_register *reg = src.reg.reg;
276
entry = _mesa_hash_table_search(c->def_ht, reg);
277
assert(reg->num_array_elems == 0);
278
assert(src.reg.base_offset == 0);
279
assert(i < reg->num_components);
280
}
281
282
struct qreg *qregs = entry->data;
283
return qregs[i];
284
}
285
286
static struct qreg
287
ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr,
288
unsigned src)
289
{
290
assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
291
unsigned chan = ffs(instr->dest.write_mask) - 1;
292
struct qreg r = ntq_get_src(c, instr->src[src].src,
293
instr->src[src].swizzle[chan]);
294
295
assert(!instr->src[src].abs);
296
assert(!instr->src[src].negate);
297
298
return r;
299
};
300
301
static inline struct qreg
302
qir_SAT(struct vc4_compile *c, struct qreg val)
303
{
304
return qir_FMAX(c,
305
qir_FMIN(c, val, qir_uniform_f(c, 1.0)),
306
qir_uniform_f(c, 0.0));
307
}
308
309
static struct qreg
310
ntq_rcp(struct vc4_compile *c, struct qreg x)
311
{
312
struct qreg r = qir_RCP(c, x);
313
314
/* Apply a Newton-Raphson step to improve the accuracy. */
315
r = qir_FMUL(c, r, qir_FSUB(c,
316
qir_uniform_f(c, 2.0),
317
qir_FMUL(c, x, r)));
318
319
return r;
320
}
321
322
static struct qreg
323
ntq_rsq(struct vc4_compile *c, struct qreg x)
324
{
325
struct qreg r = qir_RSQ(c, x);
326
327
/* Apply a Newton-Raphson step to improve the accuracy. */
328
r = qir_FMUL(c, r, qir_FSUB(c,
329
qir_uniform_f(c, 1.5),
330
qir_FMUL(c,
331
qir_uniform_f(c, 0.5),
332
qir_FMUL(c, x,
333
qir_FMUL(c, r, r)))));
334
335
return r;
336
}
337
338
static struct qreg
339
ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1)
340
{
341
struct qreg src0_hi = qir_SHR(c, src0,
342
qir_uniform_ui(c, 24));
343
struct qreg src1_hi = qir_SHR(c, src1,
344
qir_uniform_ui(c, 24));
345
346
struct qreg hilo = qir_MUL24(c, src0_hi, src1);
347
struct qreg lohi = qir_MUL24(c, src0, src1_hi);
348
struct qreg lolo = qir_MUL24(c, src0, src1);
349
350
return qir_ADD(c, lolo, qir_SHL(c,
351
qir_ADD(c, hilo, lohi),
352
qir_uniform_ui(c, 24)));
353
}
354
355
static struct qreg
356
ntq_scale_depth_texture(struct vc4_compile *c, struct qreg src)
357
{
358
struct qreg depthf = qir_ITOF(c, qir_SHR(c, src,
359
qir_uniform_ui(c, 8)));
360
return qir_FMUL(c, depthf, qir_uniform_f(c, 1.0f/0xffffff));
361
}
362
363
/**
364
* Emits a lowered TXF_MS from an MSAA texture.
365
*
366
* The addressing math has been lowered in NIR, and now we just need to read
367
* it like a UBO.
368
*/
369
static void
370
ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
371
{
372
uint32_t tile_width = 32;
373
uint32_t tile_height = 32;
374
uint32_t tile_size = (tile_height * tile_width *
375
VC4_MAX_SAMPLES * sizeof(uint32_t));
376
377
unsigned unit = instr->texture_index;
378
uint32_t w = align(c->key->tex[unit].msaa_width, tile_width);
379
uint32_t w_tiles = w / tile_width;
380
uint32_t h = align(c->key->tex[unit].msaa_height, tile_height);
381
uint32_t h_tiles = h / tile_height;
382
uint32_t size = w_tiles * h_tiles * tile_size;
383
384
struct qreg addr;
385
assert(instr->num_srcs == 1);
386
assert(instr->src[0].src_type == nir_tex_src_coord);
387
addr = ntq_get_src(c, instr->src[0].src, 0);
388
389
/* Perform the clamping required by kernel validation. */
390
addr = qir_MAX(c, addr, qir_uniform_ui(c, 0));
391
addr = qir_MIN_NOIMM(c, addr, qir_uniform_ui(c, size - 4));
392
393
qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
394
addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit));
395
396
ntq_emit_thrsw(c);
397
398
struct qreg tex = qir_TEX_RESULT(c);
399
c->num_texture_samples++;
400
401
enum pipe_format format = c->key->tex[unit].format;
402
if (util_format_is_depth_or_stencil(format)) {
403
struct qreg scaled = ntq_scale_depth_texture(c, tex);
404
for (int i = 0; i < 4; i++)
405
ntq_store_dest(c, &instr->dest, i, qir_MOV(c, scaled));
406
} else {
407
for (int i = 0; i < 4; i++)
408
ntq_store_dest(c, &instr->dest, i,
409
qir_UNPACK_8_F(c, tex, i));
410
}
411
}
412
413
static void
414
ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
415
{
416
struct qreg s, t, r, lod, compare;
417
bool is_txb = false, is_txl = false;
418
unsigned unit = instr->texture_index;
419
420
if (instr->op == nir_texop_txf) {
421
ntq_emit_txf(c, instr);
422
return;
423
}
424
425
for (unsigned i = 0; i < instr->num_srcs; i++) {
426
switch (instr->src[i].src_type) {
427
case nir_tex_src_coord:
428
s = ntq_get_src(c, instr->src[i].src, 0);
429
if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D)
430
t = qir_uniform_f(c, 0.5);
431
else
432
t = ntq_get_src(c, instr->src[i].src, 1);
433
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
434
r = ntq_get_src(c, instr->src[i].src, 2);
435
break;
436
case nir_tex_src_bias:
437
lod = ntq_get_src(c, instr->src[i].src, 0);
438
is_txb = true;
439
break;
440
case nir_tex_src_lod:
441
lod = ntq_get_src(c, instr->src[i].src, 0);
442
is_txl = true;
443
break;
444
case nir_tex_src_comparator:
445
compare = ntq_get_src(c, instr->src[i].src, 0);
446
break;
447
default:
448
unreachable("unknown texture source");
449
}
450
}
451
452
if (c->stage != QSTAGE_FRAG && !is_txl) {
453
/* From the GLSL 1.20 spec:
454
*
455
* "If it is mip-mapped and running on the vertex shader,
456
* then the base texture is used."
457
*/
458
is_txl = true;
459
lod = qir_uniform_ui(c, 0);
460
}
461
462
if (c->key->tex[unit].force_first_level) {
463
lod = qir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, unit);
464
is_txl = true;
465
is_txb = false;
466
}
467
468
struct qreg texture_u[] = {
469
qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0, unit),
470
qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, unit),
471
qir_uniform(c, QUNIFORM_CONSTANT, 0),
472
qir_uniform(c, QUNIFORM_CONSTANT, 0),
473
};
474
uint32_t next_texture_u = 0;
475
476
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE || is_txl) {
477
texture_u[2] = qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P2,
478
unit | (is_txl << 16));
479
}
480
481
struct qinst *tmu;
482
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
483
tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), r);
484
tmu->src[qir_get_tex_uniform_src(tmu)] =
485
texture_u[next_texture_u++];
486
} else if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
487
c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP ||
488
c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
489
c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
490
tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0),
491
qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR,
492
unit));
493
tmu->src[qir_get_tex_uniform_src(tmu)] =
494
texture_u[next_texture_u++];
495
}
496
497
if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP) {
498
s = qir_SAT(c, s);
499
}
500
501
if (c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
502
t = qir_SAT(c, t);
503
}
504
505
tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_T, 0), t);
506
tmu->src[qir_get_tex_uniform_src(tmu)] =
507
texture_u[next_texture_u++];
508
509
if (is_txl || is_txb) {
510
tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_B, 0), lod);
511
tmu->src[qir_get_tex_uniform_src(tmu)] =
512
texture_u[next_texture_u++];
513
}
514
515
tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_S, 0), s);
516
tmu->src[qir_get_tex_uniform_src(tmu)] = texture_u[next_texture_u++];
517
518
c->num_texture_samples++;
519
520
ntq_emit_thrsw(c);
521
522
struct qreg tex = qir_TEX_RESULT(c);
523
524
enum pipe_format format = c->key->tex[unit].format;
525
526
struct qreg *dest = ntq_get_dest(c, &instr->dest);
527
if (util_format_is_depth_or_stencil(format)) {
528
struct qreg normalized = ntq_scale_depth_texture(c, tex);
529
struct qreg depth_output;
530
531
struct qreg u0 = qir_uniform_f(c, 0.0f);
532
struct qreg u1 = qir_uniform_f(c, 1.0f);
533
if (c->key->tex[unit].compare_mode) {
534
/* From the GL_ARB_shadow spec:
535
*
536
* "Let Dt (D subscript t) be the depth texture
537
* value, in the range [0, 1]. Let R be the
538
* interpolated texture coordinate clamped to the
539
* range [0, 1]."
540
*/
541
compare = qir_SAT(c, compare);
542
543
switch (c->key->tex[unit].compare_func) {
544
case PIPE_FUNC_NEVER:
545
depth_output = qir_uniform_f(c, 0.0f);
546
break;
547
case PIPE_FUNC_ALWAYS:
548
depth_output = u1;
549
break;
550
case PIPE_FUNC_EQUAL:
551
qir_SF(c, qir_FSUB(c, compare, normalized));
552
depth_output = qir_SEL(c, QPU_COND_ZS, u1, u0);
553
break;
554
case PIPE_FUNC_NOTEQUAL:
555
qir_SF(c, qir_FSUB(c, compare, normalized));
556
depth_output = qir_SEL(c, QPU_COND_ZC, u1, u0);
557
break;
558
case PIPE_FUNC_GREATER:
559
qir_SF(c, qir_FSUB(c, compare, normalized));
560
depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
561
break;
562
case PIPE_FUNC_GEQUAL:
563
qir_SF(c, qir_FSUB(c, normalized, compare));
564
depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
565
break;
566
case PIPE_FUNC_LESS:
567
qir_SF(c, qir_FSUB(c, compare, normalized));
568
depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
569
break;
570
case PIPE_FUNC_LEQUAL:
571
qir_SF(c, qir_FSUB(c, normalized, compare));
572
depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
573
break;
574
}
575
} else {
576
depth_output = normalized;
577
}
578
579
for (int i = 0; i < 4; i++)
580
dest[i] = depth_output;
581
} else {
582
for (int i = 0; i < 4; i++)
583
dest[i] = qir_UNPACK_8_F(c, tex, i);
584
}
585
}
586
587
/**
588
* Computes x - floor(x), which is tricky because our FTOI truncates (rounds
589
* to zero).
590
*/
591
static struct qreg
592
ntq_ffract(struct vc4_compile *c, struct qreg src)
593
{
594
struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
595
struct qreg diff = qir_FSUB(c, src, trunc);
596
qir_SF(c, diff);
597
598
qir_FADD_dest(c, diff,
599
diff, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
600
601
return qir_MOV(c, diff);
602
}
603
604
/**
605
* Computes floor(x), which is tricky because our FTOI truncates (rounds to
606
* zero).
607
*/
608
static struct qreg
609
ntq_ffloor(struct vc4_compile *c, struct qreg src)
610
{
611
struct qreg result = qir_ITOF(c, qir_FTOI(c, src));
612
613
/* This will be < 0 if we truncated and the truncation was of a value
614
* that was < 0 in the first place.
615
*/
616
qir_SF(c, qir_FSUB(c, src, result));
617
618
struct qinst *sub = qir_FSUB_dest(c, result,
619
result, qir_uniform_f(c, 1.0));
620
sub->cond = QPU_COND_NS;
621
622
return qir_MOV(c, result);
623
}
624
625
/**
626
* Computes ceil(x), which is tricky because our FTOI truncates (rounds to
627
* zero).
628
*/
629
static struct qreg
630
ntq_fceil(struct vc4_compile *c, struct qreg src)
631
{
632
struct qreg result = qir_ITOF(c, qir_FTOI(c, src));
633
634
/* This will be < 0 if we truncated and the truncation was of a value
635
* that was > 0 in the first place.
636
*/
637
qir_SF(c, qir_FSUB(c, result, src));
638
639
qir_FADD_dest(c, result,
640
result, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
641
642
return qir_MOV(c, result);
643
}
644
645
static struct qreg
646
ntq_shrink_sincos_input_range(struct vc4_compile *c, struct qreg x)
647
{
648
/* Since we're using a Taylor approximation, we want to have a small
649
* number of coefficients and take advantage of sin/cos repeating
650
* every 2pi. We keep our x as close to 0 as we can, since the series
651
* will be less accurate as |x| increases. (Also, be careful of
652
* shifting the input x value to be tricky with sin/cos relations,
653
* because getting accurate values for x==0 is very important for SDL
654
* rendering)
655
*/
656
struct qreg scaled_x =
657
qir_FMUL(c, x,
658
qir_uniform_f(c, 1.0f / (M_PI * 2.0f)));
659
/* Note: FTOI truncates toward 0. */
660
struct qreg x_frac = qir_FSUB(c, scaled_x,
661
qir_ITOF(c, qir_FTOI(c, scaled_x)));
662
/* Map [0.5, 1] to [-0.5, 0] */
663
qir_SF(c, qir_FSUB(c, x_frac, qir_uniform_f(c, 0.5)));
664
qir_FSUB_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NC;
665
/* Map [-1, -0.5] to [0, 0.5] */
666
qir_SF(c, qir_FADD(c, x_frac, qir_uniform_f(c, 0.5)));
667
qir_FADD_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
668
669
return x_frac;
670
}
671
672
static struct qreg
673
ntq_fsin(struct vc4_compile *c, struct qreg src)
674
{
675
float coeff[] = {
676
2.0 * M_PI,
677
-pow(2.0 * M_PI, 3) / (3 * 2 * 1),
678
pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1),
679
-pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1),
680
pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
681
};
682
683
struct qreg x = ntq_shrink_sincos_input_range(c, src);
684
struct qreg x2 = qir_FMUL(c, x, x);
685
struct qreg sum = qir_FMUL(c, x, qir_uniform_f(c, coeff[0]));
686
for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
687
x = qir_FMUL(c, x, x2);
688
sum = qir_FADD(c,
689
sum,
690
qir_FMUL(c,
691
x,
692
qir_uniform_f(c, coeff[i])));
693
}
694
return sum;
695
}
696
697
static struct qreg
698
ntq_fcos(struct vc4_compile *c, struct qreg src)
699
{
700
float coeff[] = {
701
1.0f,
702
-pow(2.0 * M_PI, 2) / (2 * 1),
703
pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1),
704
-pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1),
705
pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
706
-pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
707
};
708
709
struct qreg x_frac = ntq_shrink_sincos_input_range(c, src);
710
struct qreg sum = qir_uniform_f(c, coeff[0]);
711
struct qreg x2 = qir_FMUL(c, x_frac, x_frac);
712
struct qreg x = x2; /* Current x^2, x^4, or x^6 */
713
for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
714
if (i != 1)
715
x = qir_FMUL(c, x, x2);
716
717
sum = qir_FADD(c, qir_FMUL(c,
718
x,
719
qir_uniform_f(c, coeff[i])),
720
sum);
721
}
722
return sum;
723
}
724
725
static struct qreg
726
ntq_fsign(struct vc4_compile *c, struct qreg src)
727
{
728
struct qreg t = qir_get_temp(c);
729
730
qir_SF(c, src);
731
qir_MOV_dest(c, t, qir_uniform_f(c, 0.0));
732
qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC;
733
qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS;
734
return qir_MOV(c, t);
735
}
736
737
static void
738
emit_vertex_input(struct vc4_compile *c, int attr)
739
{
740
enum pipe_format format = c->vs_key->attr_formats[attr];
741
uint32_t attr_size = util_format_get_blocksize(format);
742
743
c->vattr_sizes[attr] = align(attr_size, 4);
744
for (int i = 0; i < align(attr_size, 4) / 4; i++) {
745
c->inputs[attr * 4 + i] =
746
qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i));
747
c->num_inputs++;
748
}
749
}
750
751
static void
752
emit_fragcoord_input(struct vc4_compile *c, int attr)
753
{
754
c->inputs[attr * 4 + 0] = qir_ITOF(c, qir_reg(QFILE_FRAG_X, 0));
755
c->inputs[attr * 4 + 1] = qir_ITOF(c, qir_reg(QFILE_FRAG_Y, 0));
756
c->inputs[attr * 4 + 2] =
757
qir_FMUL(c,
758
qir_ITOF(c, qir_FRAG_Z(c)),
759
qir_uniform_f(c, 1.0 / 0xffffff));
760
c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c));
761
}
762
763
static struct qreg
764
emit_fragment_varying(struct vc4_compile *c, gl_varying_slot slot,
765
uint8_t swizzle)
766
{
767
uint32_t i = c->num_input_slots++;
768
struct qreg vary = {
769
QFILE_VARY,
770
i
771
};
772
773
if (c->num_input_slots >= c->input_slots_array_size) {
774
c->input_slots_array_size =
775
MAX2(4, c->input_slots_array_size * 2);
776
777
c->input_slots = reralloc(c, c->input_slots,
778
struct vc4_varying_slot,
779
c->input_slots_array_size);
780
}
781
782
c->input_slots[i].slot = slot;
783
c->input_slots[i].swizzle = swizzle;
784
785
return qir_VARY_ADD_C(c, qir_FMUL(c, vary, qir_FRAG_W(c)));
786
}
787
788
static void
789
emit_fragment_input(struct vc4_compile *c, int attr, gl_varying_slot slot)
790
{
791
for (int i = 0; i < 4; i++) {
792
c->inputs[attr * 4 + i] =
793
emit_fragment_varying(c, slot, i);
794
c->num_inputs++;
795
}
796
}
797
798
static void
799
add_output(struct vc4_compile *c,
800
uint32_t decl_offset,
801
uint8_t slot,
802
uint8_t swizzle)
803
{
804
uint32_t old_array_size = c->outputs_array_size;
805
resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
806
decl_offset + 1);
807
808
if (old_array_size != c->outputs_array_size) {
809
c->output_slots = reralloc(c,
810
c->output_slots,
811
struct vc4_varying_slot,
812
c->outputs_array_size);
813
}
814
815
c->output_slots[decl_offset].slot = slot;
816
c->output_slots[decl_offset].swizzle = swizzle;
817
}
818
819
static bool
820
ntq_src_is_only_ssa_def_user(nir_src *src)
821
{
822
if (!src->is_ssa)
823
return false;
824
825
if (!list_is_empty(&src->ssa->if_uses))
826
return false;
827
828
return (src->ssa->uses.next == &src->use_link &&
829
src->ssa->uses.next->next == &src->ssa->uses);
830
}
831
832
/**
833
* In general, emits a nir_pack_unorm_4x8 as a series of MOVs with the pack
834
* bit set.
835
*
836
* However, as an optimization, it tries to find the instructions generating
837
* the sources to be packed and just emit the pack flag there, if possible.
838
*/
839
static void
840
ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
841
{
842
struct qreg result = qir_get_temp(c);
843
struct nir_alu_instr *vec4 = NULL;
844
845
/* If packing from a vec4 op (as expected), identify it so that we can
846
* peek back at what generated its sources.
847
*/
848
if (instr->src[0].src.is_ssa &&
849
instr->src[0].src.ssa->parent_instr->type == nir_instr_type_alu &&
850
nir_instr_as_alu(instr->src[0].src.ssa->parent_instr)->op ==
851
nir_op_vec4) {
852
vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
853
}
854
855
/* If the pack is replicating the same channel 4 times, use the 8888
856
* pack flag. This is common for blending using the alpha
857
* channel.
858
*/
859
if (instr->src[0].swizzle[0] == instr->src[0].swizzle[1] &&
860
instr->src[0].swizzle[0] == instr->src[0].swizzle[2] &&
861
instr->src[0].swizzle[0] == instr->src[0].swizzle[3]) {
862
struct qreg rep = ntq_get_src(c,
863
instr->src[0].src,
864
instr->src[0].swizzle[0]);
865
ntq_store_dest(c, &instr->dest.dest, 0, qir_PACK_8888_F(c, rep));
866
return;
867
}
868
869
for (int i = 0; i < 4; i++) {
870
int swiz = instr->src[0].swizzle[i];
871
struct qreg src;
872
if (vec4) {
873
src = ntq_get_src(c, vec4->src[swiz].src,
874
vec4->src[swiz].swizzle[0]);
875
} else {
876
src = ntq_get_src(c, instr->src[0].src, swiz);
877
}
878
879
if (vec4 &&
880
ntq_src_is_only_ssa_def_user(&vec4->src[swiz].src) &&
881
src.file == QFILE_TEMP &&
882
c->defs[src.index] &&
883
qir_is_mul(c->defs[src.index]) &&
884
!c->defs[src.index]->dst.pack) {
885
struct qinst *rewrite = c->defs[src.index];
886
c->defs[src.index] = NULL;
887
rewrite->dst = result;
888
rewrite->dst.pack = QPU_PACK_MUL_8A + i;
889
continue;
890
}
891
892
qir_PACK_8_F(c, result, src, i);
893
}
894
895
ntq_store_dest(c, &instr->dest.dest, 0, qir_MOV(c, result));
896
}
897
898
/** Handles sign-extended bitfield extracts for 16 bits. */
899
static struct qreg
900
ntq_emit_ibfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
901
struct qreg bits)
902
{
903
assert(bits.file == QFILE_UNIF &&
904
c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
905
c->uniform_data[bits.index] == 16);
906
907
assert(offset.file == QFILE_UNIF &&
908
c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
909
int offset_bit = c->uniform_data[offset.index];
910
assert(offset_bit % 16 == 0);
911
912
return qir_UNPACK_16_I(c, base, offset_bit / 16);
913
}
914
915
/** Handles unsigned bitfield extracts for 8 bits. */
916
static struct qreg
917
ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
918
struct qreg bits)
919
{
920
assert(bits.file == QFILE_UNIF &&
921
c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
922
c->uniform_data[bits.index] == 8);
923
924
assert(offset.file == QFILE_UNIF &&
925
c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
926
int offset_bit = c->uniform_data[offset.index];
927
assert(offset_bit % 8 == 0);
928
929
return qir_UNPACK_8_I(c, base, offset_bit / 8);
930
}
931
932
/**
933
* If compare_instr is a valid comparison instruction, emits the
934
* compare_instr's comparison and returns the sel_instr's return value based
935
* on the compare_instr's result.
936
*/
937
static bool
938
ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
939
nir_alu_instr *compare_instr,
940
nir_alu_instr *sel_instr)
941
{
942
enum qpu_cond cond;
943
944
switch (compare_instr->op) {
945
case nir_op_feq32:
946
case nir_op_ieq32:
947
case nir_op_seq:
948
cond = QPU_COND_ZS;
949
break;
950
case nir_op_fneu32:
951
case nir_op_ine32:
952
case nir_op_sne:
953
cond = QPU_COND_ZC;
954
break;
955
case nir_op_fge32:
956
case nir_op_ige32:
957
case nir_op_uge32:
958
case nir_op_sge:
959
cond = QPU_COND_NC;
960
break;
961
case nir_op_flt32:
962
case nir_op_ilt32:
963
case nir_op_slt:
964
cond = QPU_COND_NS;
965
break;
966
default:
967
return false;
968
}
969
970
struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
971
struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);
972
973
unsigned unsized_type =
974
nir_alu_type_get_base_type(nir_op_infos[compare_instr->op].input_types[0]);
975
if (unsized_type == nir_type_float)
976
qir_SF(c, qir_FSUB(c, src0, src1));
977
else
978
qir_SF(c, qir_SUB(c, src0, src1));
979
980
switch (sel_instr->op) {
981
case nir_op_seq:
982
case nir_op_sne:
983
case nir_op_sge:
984
case nir_op_slt:
985
*dest = qir_SEL(c, cond,
986
qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0));
987
break;
988
989
case nir_op_b32csel:
990
*dest = qir_SEL(c, cond,
991
ntq_get_alu_src(c, sel_instr, 1),
992
ntq_get_alu_src(c, sel_instr, 2));
993
break;
994
995
default:
996
*dest = qir_SEL(c, cond,
997
qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0));
998
break;
999
}
1000
1001
/* Make the temporary for nir_store_dest(). */
1002
*dest = qir_MOV(c, *dest);
1003
1004
return true;
1005
}
1006
1007
/**
1008
* Attempts to fold a comparison generating a boolean result into the
1009
* condition code for selecting between two values, instead of comparing the
1010
* boolean result against 0 to generate the condition code.
1011
*/
1012
static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr,
1013
struct qreg *src)
1014
{
1015
if (!instr->src[0].src.is_ssa)
1016
goto out;
1017
if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
1018
goto out;
1019
nir_alu_instr *compare =
1020
nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
1021
if (!compare)
1022
goto out;
1023
1024
struct qreg dest;
1025
if (ntq_emit_comparison(c, &dest, compare, instr))
1026
return dest;
1027
1028
out:
1029
qir_SF(c, src[0]);
1030
return qir_MOV(c, qir_SEL(c, QPU_COND_NS, src[1], src[2]));
1031
}
1032
1033
static struct qreg
1034
ntq_fddx(struct vc4_compile *c, struct qreg src)
1035
{
1036
/* Make sure that we have a bare temp to use for MUL rotation, so it
1037
* can be allocated to an accumulator.
1038
*/
1039
if (src.pack || src.file != QFILE_TEMP)
1040
src = qir_MOV(c, src);
1041
1042
struct qreg from_left = qir_ROT_MUL(c, src, 1);
1043
struct qreg from_right = qir_ROT_MUL(c, src, 15);
1044
1045
/* Distinguish left/right pixels of the quad. */
1046
qir_SF(c, qir_AND(c, qir_reg(QFILE_QPU_ELEMENT, 0),
1047
qir_uniform_ui(c, 1)));
1048
1049
return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
1050
qir_FSUB(c, from_right, src),
1051
qir_FSUB(c, src, from_left)));
1052
}
1053
1054
static struct qreg
1055
ntq_fddy(struct vc4_compile *c, struct qreg src)
1056
{
1057
if (src.pack || src.file != QFILE_TEMP)
1058
src = qir_MOV(c, src);
1059
1060
struct qreg from_bottom = qir_ROT_MUL(c, src, 2);
1061
struct qreg from_top = qir_ROT_MUL(c, src, 14);
1062
1063
/* Distinguish top/bottom pixels of the quad. */
1064
qir_SF(c, qir_AND(c,
1065
qir_reg(QFILE_QPU_ELEMENT, 0),
1066
qir_uniform_ui(c, 2)));
1067
1068
return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
1069
qir_FSUB(c, from_top, src),
1070
qir_FSUB(c, src, from_bottom)));
1071
}
1072
1073
static void
1074
ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
1075
{
1076
/* This should always be lowered to ALU operations for VC4. */
1077
assert(!instr->dest.saturate);
1078
1079
/* Vectors are special in that they have non-scalarized writemasks,
1080
* and just take the first swizzle channel for each argument in order
1081
* into each writemask channel.
1082
*/
1083
if (instr->op == nir_op_vec2 ||
1084
instr->op == nir_op_vec3 ||
1085
instr->op == nir_op_vec4) {
1086
struct qreg srcs[4];
1087
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1088
srcs[i] = ntq_get_src(c, instr->src[i].src,
1089
instr->src[i].swizzle[0]);
1090
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1091
ntq_store_dest(c, &instr->dest.dest, i,
1092
qir_MOV(c, srcs[i]));
1093
return;
1094
}
1095
1096
if (instr->op == nir_op_pack_unorm_4x8) {
1097
ntq_emit_pack_unorm_4x8(c, instr);
1098
return;
1099
}
1100
1101
if (instr->op == nir_op_unpack_unorm_4x8) {
1102
struct qreg src = ntq_get_src(c, instr->src[0].src,
1103
instr->src[0].swizzle[0]);
1104
for (int i = 0; i < 4; i++) {
1105
if (instr->dest.write_mask & (1 << i))
1106
ntq_store_dest(c, &instr->dest.dest, i,
1107
qir_UNPACK_8_F(c, src, i));
1108
}
1109
return;
1110
}
1111
1112
/* General case: We can just grab the one used channel per src. */
1113
struct qreg src[nir_op_infos[instr->op].num_inputs];
1114
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1115
src[i] = ntq_get_alu_src(c, instr, i);
1116
}
1117
1118
struct qreg result;
1119
1120
switch (instr->op) {
1121
case nir_op_mov:
1122
result = qir_MOV(c, src[0]);
1123
break;
1124
case nir_op_fmul:
1125
result = qir_FMUL(c, src[0], src[1]);
1126
break;
1127
case nir_op_fadd:
1128
result = qir_FADD(c, src[0], src[1]);
1129
break;
1130
case nir_op_fsub:
1131
result = qir_FSUB(c, src[0], src[1]);
1132
break;
1133
case nir_op_fmin:
1134
result = qir_FMIN(c, src[0], src[1]);
1135
break;
1136
case nir_op_fmax:
1137
result = qir_FMAX(c, src[0], src[1]);
1138
break;
1139
1140
case nir_op_f2i32:
1141
case nir_op_f2u32:
1142
result = qir_FTOI(c, src[0]);
1143
break;
1144
case nir_op_i2f32:
1145
case nir_op_u2f32:
1146
result = qir_ITOF(c, src[0]);
1147
break;
1148
case nir_op_b2f32:
1149
result = qir_AND(c, src[0], qir_uniform_f(c, 1.0));
1150
break;
1151
case nir_op_b2i32:
1152
result = qir_AND(c, src[0], qir_uniform_ui(c, 1));
1153
break;
1154
case nir_op_i2b32:
1155
case nir_op_f2b32:
1156
qir_SF(c, src[0]);
1157
result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC,
1158
qir_uniform_ui(c, ~0),
1159
qir_uniform_ui(c, 0)));
1160
break;
1161
1162
case nir_op_iadd:
1163
result = qir_ADD(c, src[0], src[1]);
1164
break;
1165
case nir_op_ushr:
1166
result = qir_SHR(c, src[0], src[1]);
1167
break;
1168
case nir_op_isub:
1169
result = qir_SUB(c, src[0], src[1]);
1170
break;
1171
case nir_op_ishr:
1172
result = qir_ASR(c, src[0], src[1]);
1173
break;
1174
case nir_op_ishl:
1175
result = qir_SHL(c, src[0], src[1]);
1176
break;
1177
case nir_op_imin:
1178
result = qir_MIN(c, src[0], src[1]);
1179
break;
1180
case nir_op_imax:
1181
result = qir_MAX(c, src[0], src[1]);
1182
break;
1183
case nir_op_iand:
1184
result = qir_AND(c, src[0], src[1]);
1185
break;
1186
case nir_op_ior:
1187
result = qir_OR(c, src[0], src[1]);
1188
break;
1189
case nir_op_ixor:
1190
result = qir_XOR(c, src[0], src[1]);
1191
break;
1192
case nir_op_inot:
1193
result = qir_NOT(c, src[0]);
1194
break;
1195
1196
case nir_op_imul:
1197
result = ntq_umul(c, src[0], src[1]);
1198
break;
1199
1200
case nir_op_seq:
1201
case nir_op_sne:
1202
case nir_op_sge:
1203
case nir_op_slt:
1204
case nir_op_feq32:
1205
case nir_op_fneu32:
1206
case nir_op_fge32:
1207
case nir_op_flt32:
1208
case nir_op_ieq32:
1209
case nir_op_ine32:
1210
case nir_op_ige32:
1211
case nir_op_uge32:
1212
case nir_op_ilt32:
1213
if (!ntq_emit_comparison(c, &result, instr, instr)) {
1214
fprintf(stderr, "Bad comparison instruction\n");
1215
}
1216
break;
1217
1218
case nir_op_b32csel:
1219
result = ntq_emit_bcsel(c, instr, src);
1220
break;
1221
case nir_op_fcsel:
1222
qir_SF(c, src[0]);
1223
result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, src[1], src[2]));
1224
break;
1225
1226
case nir_op_frcp:
1227
result = ntq_rcp(c, src[0]);
1228
break;
1229
case nir_op_frsq:
1230
result = ntq_rsq(c, src[0]);
1231
break;
1232
case nir_op_fexp2:
1233
result = qir_EXP2(c, src[0]);
1234
break;
1235
case nir_op_flog2:
1236
result = qir_LOG2(c, src[0]);
1237
break;
1238
1239
case nir_op_ftrunc:
1240
result = qir_ITOF(c, qir_FTOI(c, src[0]));
1241
break;
1242
case nir_op_fceil:
1243
result = ntq_fceil(c, src[0]);
1244
break;
1245
case nir_op_ffract:
1246
result = ntq_ffract(c, src[0]);
1247
break;
1248
case nir_op_ffloor:
1249
result = ntq_ffloor(c, src[0]);
1250
break;
1251
1252
case nir_op_fsin:
1253
result = ntq_fsin(c, src[0]);
1254
break;
1255
case nir_op_fcos:
1256
result = ntq_fcos(c, src[0]);
1257
break;
1258
1259
case nir_op_fsign:
1260
result = ntq_fsign(c, src[0]);
1261
break;
1262
1263
case nir_op_fabs:
1264
result = qir_FMAXABS(c, src[0], src[0]);
1265
break;
1266
case nir_op_iabs:
1267
result = qir_MAX(c, src[0],
1268
qir_SUB(c, qir_uniform_ui(c, 0), src[0]));
1269
break;
1270
1271
case nir_op_ibitfield_extract:
1272
result = ntq_emit_ibfe(c, src[0], src[1], src[2]);
1273
break;
1274
1275
case nir_op_ubitfield_extract:
1276
result = ntq_emit_ubfe(c, src[0], src[1], src[2]);
1277
break;
1278
1279
case nir_op_usadd_4x8_vc4:
1280
result = qir_V8ADDS(c, src[0], src[1]);
1281
break;
1282
1283
case nir_op_ussub_4x8_vc4:
1284
result = qir_V8SUBS(c, src[0], src[1]);
1285
break;
1286
1287
case nir_op_umin_4x8_vc4:
1288
result = qir_V8MIN(c, src[0], src[1]);
1289
break;
1290
1291
case nir_op_umax_4x8_vc4:
1292
result = qir_V8MAX(c, src[0], src[1]);
1293
break;
1294
1295
case nir_op_umul_unorm_4x8_vc4:
1296
result = qir_V8MULD(c, src[0], src[1]);
1297
break;
1298
1299
case nir_op_fddx:
1300
case nir_op_fddx_coarse:
1301
case nir_op_fddx_fine:
1302
result = ntq_fddx(c, src[0]);
1303
break;
1304
1305
case nir_op_fddy:
1306
case nir_op_fddy_coarse:
1307
case nir_op_fddy_fine:
1308
result = ntq_fddy(c, src[0]);
1309
break;
1310
1311
default:
1312
fprintf(stderr, "unknown NIR ALU inst: ");
1313
nir_print_instr(&instr->instr, stderr);
1314
fprintf(stderr, "\n");
1315
abort();
1316
}
1317
1318
/* We have a scalar result, so the instruction should only have a
1319
* single channel written to.
1320
*/
1321
assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
1322
ntq_store_dest(c, &instr->dest.dest,
1323
ffs(instr->dest.write_mask) - 1, result);
1324
}
1325
1326
static void
1327
emit_frag_end(struct vc4_compile *c)
1328
{
1329
struct qreg color;
1330
if (c->output_color_index != -1) {
1331
color = c->outputs[c->output_color_index];
1332
} else {
1333
color = qir_uniform_ui(c, 0);
1334
}
1335
1336
uint32_t discard_cond = QPU_COND_ALWAYS;
1337
if (c->s->info.fs.uses_discard) {
1338
qir_SF(c, c->discard);
1339
discard_cond = QPU_COND_ZS;
1340
}
1341
1342
if (c->fs_key->stencil_enabled) {
1343
qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
1344
qir_uniform(c, QUNIFORM_STENCIL, 0));
1345
if (c->fs_key->stencil_twoside) {
1346
qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
1347
qir_uniform(c, QUNIFORM_STENCIL, 1));
1348
}
1349
if (c->fs_key->stencil_full_writemasks) {
1350
qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
1351
qir_uniform(c, QUNIFORM_STENCIL, 2));
1352
}
1353
}
1354
1355
if (c->output_sample_mask_index != -1) {
1356
qir_MS_MASK(c, c->outputs[c->output_sample_mask_index]);
1357
}
1358
1359
if (c->fs_key->depth_enabled) {
1360
if (c->output_position_index != -1) {
1361
qir_FTOI_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),
1362
qir_FMUL(c,
1363
c->outputs[c->output_position_index],
1364
qir_uniform_f(c, 0xffffff)))->cond = discard_cond;
1365
} else {
1366
qir_MOV_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),
1367
qir_FRAG_Z(c))->cond = discard_cond;
1368
}
1369
}
1370
1371
if (!c->msaa_per_sample_output) {
1372
qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE, 0),
1373
color)->cond = discard_cond;
1374
} else {
1375
for (int i = 0; i < VC4_MAX_SAMPLES; i++) {
1376
qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE_MS, 0),
1377
c->sample_colors[i])->cond = discard_cond;
1378
}
1379
}
1380
}
1381
1382
static void
1383
emit_scaled_viewport_write(struct vc4_compile *c, struct qreg rcp_w)
1384
{
1385
struct qreg packed = qir_get_temp(c);
1386
1387
for (int i = 0; i < 2; i++) {
1388
struct qreg scale =
1389
qir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, 0);
1390
1391
struct qreg packed_chan = packed;
1392
packed_chan.pack = QPU_PACK_A_16A + i;
1393
1394
qir_FTOI_dest(c, packed_chan,
1395
qir_FMUL(c,
1396
qir_FMUL(c,
1397
c->outputs[c->output_position_index + i],
1398
scale),
1399
rcp_w));
1400
}
1401
1402
qir_VPM_WRITE(c, packed);
1403
}
1404
1405
static void
1406
emit_zs_write(struct vc4_compile *c, struct qreg rcp_w)
1407
{
1408
struct qreg zscale = qir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0);
1409
struct qreg zoffset = qir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0);
1410
1411
qir_VPM_WRITE(c, qir_FADD(c, qir_FMUL(c, qir_FMUL(c,
1412
c->outputs[c->output_position_index + 2],
1413
zscale),
1414
rcp_w),
1415
zoffset));
1416
}
1417
1418
static void
1419
emit_rcp_wc_write(struct vc4_compile *c, struct qreg rcp_w)
1420
{
1421
qir_VPM_WRITE(c, rcp_w);
1422
}
1423
1424
static void
1425
emit_point_size_write(struct vc4_compile *c)
1426
{
1427
struct qreg point_size;
1428
1429
if (c->output_point_size_index != -1)
1430
point_size = c->outputs[c->output_point_size_index];
1431
else
1432
point_size = qir_uniform_f(c, 1.0);
1433
1434
qir_VPM_WRITE(c, point_size);
1435
}
1436
1437
/**
1438
* Emits a VPM read of the stub vertex attribute set up by vc4_draw.c.
1439
*
1440
* The simulator insists that there be at least one vertex attribute, so
1441
* vc4_draw.c will emit one if it wouldn't have otherwise. The simulator also
1442
* insists that all vertex attributes loaded get read by the VS/CS, so we have
1443
* to consume it here.
1444
*/
1445
static void
1446
emit_stub_vpm_read(struct vc4_compile *c)
1447
{
1448
if (c->num_inputs)
1449
return;
1450
1451
c->vattr_sizes[0] = 4;
1452
(void)qir_MOV(c, qir_reg(QFILE_VPM, 0));
1453
c->num_inputs++;
1454
}
1455
1456
static void
1457
emit_vert_end(struct vc4_compile *c,
1458
struct vc4_varying_slot *fs_inputs,
1459
uint32_t num_fs_inputs)
1460
{
1461
struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]);
1462
1463
emit_stub_vpm_read(c);
1464
1465
emit_scaled_viewport_write(c, rcp_w);
1466
emit_zs_write(c, rcp_w);
1467
emit_rcp_wc_write(c, rcp_w);
1468
if (c->vs_key->per_vertex_point_size)
1469
emit_point_size_write(c);
1470
1471
for (int i = 0; i < num_fs_inputs; i++) {
1472
struct vc4_varying_slot *input = &fs_inputs[i];
1473
int j;
1474
1475
for (j = 0; j < c->num_outputs; j++) {
1476
struct vc4_varying_slot *output =
1477
&c->output_slots[j];
1478
1479
if (input->slot == output->slot &&
1480
input->swizzle == output->swizzle) {
1481
qir_VPM_WRITE(c, c->outputs[j]);
1482
break;
1483
}
1484
}
1485
/* Emit padding if we didn't find a declared VS output for
1486
* this FS input.
1487
*/
1488
if (j == c->num_outputs)
1489
qir_VPM_WRITE(c, qir_uniform_f(c, 0.0));
1490
}
1491
}
1492
1493
static void
1494
emit_coord_end(struct vc4_compile *c)
1495
{
1496
struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]);
1497
1498
emit_stub_vpm_read(c);
1499
1500
for (int i = 0; i < 4; i++)
1501
qir_VPM_WRITE(c, c->outputs[c->output_position_index + i]);
1502
1503
emit_scaled_viewport_write(c, rcp_w);
1504
emit_zs_write(c, rcp_w);
1505
emit_rcp_wc_write(c, rcp_w);
1506
if (c->vs_key->per_vertex_point_size)
1507
emit_point_size_write(c);
1508
}
1509
1510
static void
1511
vc4_optimize_nir(struct nir_shader *s)
1512
{
1513
bool progress;
1514
unsigned lower_flrp =
1515
(s->options->lower_flrp16 ? 16 : 0) |
1516
(s->options->lower_flrp32 ? 32 : 0) |
1517
(s->options->lower_flrp64 ? 64 : 0);
1518
1519
do {
1520
progress = false;
1521
1522
NIR_PASS_V(s, nir_lower_vars_to_ssa);
1523
NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
1524
NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
1525
NIR_PASS(progress, s, nir_copy_prop);
1526
NIR_PASS(progress, s, nir_opt_remove_phis);
1527
NIR_PASS(progress, s, nir_opt_dce);
1528
NIR_PASS(progress, s, nir_opt_dead_cf);
1529
NIR_PASS(progress, s, nir_opt_cse);
1530
NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
1531
NIR_PASS(progress, s, nir_opt_algebraic);
1532
NIR_PASS(progress, s, nir_opt_constant_folding);
1533
if (lower_flrp != 0) {
1534
bool lower_flrp_progress = false;
1535
1536
NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
1537
lower_flrp,
1538
false /* always_precise */);
1539
if (lower_flrp_progress) {
1540
NIR_PASS(progress, s, nir_opt_constant_folding);
1541
progress = true;
1542
}
1543
1544
/* Nothing should rematerialize any flrps, so we only
1545
* need to do this lowering once.
1546
*/
1547
lower_flrp = 0;
1548
}
1549
1550
NIR_PASS(progress, s, nir_opt_undef);
1551
NIR_PASS(progress, s, nir_opt_loop_unroll,
1552
nir_var_shader_in |
1553
nir_var_shader_out |
1554
nir_var_function_temp);
1555
} while (progress);
1556
}
1557
1558
static int
1559
driver_location_compare(const void *in_a, const void *in_b)
1560
{
1561
const nir_variable *const *a = in_a;
1562
const nir_variable *const *b = in_b;
1563
1564
return (*a)->data.driver_location - (*b)->data.driver_location;
1565
}
1566
1567
static void
1568
ntq_setup_inputs(struct vc4_compile *c)
1569
{
1570
unsigned num_entries = 0;
1571
nir_foreach_shader_in_variable(var, c->s)
1572
num_entries++;
1573
1574
nir_variable *vars[num_entries];
1575
1576
unsigned i = 0;
1577
nir_foreach_shader_in_variable(var, c->s)
1578
vars[i++] = var;
1579
1580
/* Sort the variables so that we emit the input setup in
1581
* driver_location order. This is required for VPM reads, whose data
1582
* is fetched into the VPM in driver_location (TGSI register index)
1583
* order.
1584
*/
1585
qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
1586
1587
for (unsigned i = 0; i < num_entries; i++) {
1588
nir_variable *var = vars[i];
1589
unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1590
unsigned loc = var->data.driver_location;
1591
1592
assert(array_len == 1);
1593
(void)array_len;
1594
resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
1595
(loc + 1) * 4);
1596
1597
if (c->stage == QSTAGE_FRAG) {
1598
if (var->data.location == VARYING_SLOT_POS) {
1599
emit_fragcoord_input(c, loc);
1600
} else if (util_varying_is_point_coord(var->data.location,
1601
c->fs_key->point_sprite_mask)) {
1602
c->inputs[loc * 4 + 0] = c->point_x;
1603
c->inputs[loc * 4 + 1] = c->point_y;
1604
} else {
1605
emit_fragment_input(c, loc, var->data.location);
1606
}
1607
} else {
1608
emit_vertex_input(c, loc);
1609
}
1610
}
1611
}
1612
1613
static void
1614
ntq_setup_outputs(struct vc4_compile *c)
1615
{
1616
nir_foreach_shader_out_variable(var, c->s) {
1617
unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1618
unsigned loc = var->data.driver_location * 4;
1619
1620
assert(array_len == 1);
1621
(void)array_len;
1622
1623
for (int i = 0; i < 4; i++)
1624
add_output(c, loc + i, var->data.location, i);
1625
1626
if (c->stage == QSTAGE_FRAG) {
1627
switch (var->data.location) {
1628
case FRAG_RESULT_COLOR:
1629
case FRAG_RESULT_DATA0:
1630
c->output_color_index = loc;
1631
break;
1632
case FRAG_RESULT_DEPTH:
1633
c->output_position_index = loc;
1634
break;
1635
case FRAG_RESULT_SAMPLE_MASK:
1636
c->output_sample_mask_index = loc;
1637
break;
1638
}
1639
} else {
1640
switch (var->data.location) {
1641
case VARYING_SLOT_POS:
1642
c->output_position_index = loc;
1643
break;
1644
case VARYING_SLOT_PSIZ:
1645
c->output_point_size_index = loc;
1646
break;
1647
}
1648
}
1649
}
1650
}
1651
1652
/**
1653
* Sets up the mapping from nir_register to struct qreg *.
1654
*
1655
* Each nir_register gets a struct qreg per 32-bit component being stored.
1656
*/
1657
static void
1658
ntq_setup_registers(struct vc4_compile *c, struct exec_list *list)
1659
{
1660
foreach_list_typed(nir_register, nir_reg, node, list) {
1661
unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
1662
struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
1663
array_len *
1664
nir_reg->num_components);
1665
1666
_mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
1667
1668
for (int i = 0; i < array_len * nir_reg->num_components; i++)
1669
qregs[i] = qir_get_temp(c);
1670
}
1671
}
1672
1673
static void
1674
ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr)
1675
{
1676
struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1677
for (int i = 0; i < instr->def.num_components; i++)
1678
qregs[i] = qir_uniform_ui(c, instr->value[i].u32);
1679
1680
_mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
1681
}
1682
1683
static void
1684
ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr)
1685
{
1686
struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1687
1688
/* QIR needs there to be *some* value, so pick 0 (same as for
1689
* ntq_setup_registers().
1690
*/
1691
for (int i = 0; i < instr->def.num_components; i++)
1692
qregs[i] = qir_uniform_ui(c, 0);
1693
}
1694
1695
static void
1696
ntq_emit_color_read(struct vc4_compile *c, nir_intrinsic_instr *instr)
1697
{
1698
assert(nir_src_as_uint(instr->src[0]) == 0);
1699
1700
/* Reads of the per-sample color need to be done in
1701
* order.
1702
*/
1703
int sample_index = (nir_intrinsic_base(instr) -
1704
VC4_NIR_TLB_COLOR_READ_INPUT);
1705
for (int i = 0; i <= sample_index; i++) {
1706
if (c->color_reads[i].file == QFILE_NULL) {
1707
c->color_reads[i] =
1708
qir_TLB_COLOR_READ(c);
1709
}
1710
}
1711
ntq_store_dest(c, &instr->dest, 0,
1712
qir_MOV(c, c->color_reads[sample_index]));
1713
}
1714
1715
static void
1716
ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr)
1717
{
1718
assert(instr->num_components == 1);
1719
assert(nir_src_is_const(instr->src[0]) &&
1720
"vc4 doesn't support indirect inputs");
1721
1722
if (c->stage == QSTAGE_FRAG &&
1723
nir_intrinsic_base(instr) >= VC4_NIR_TLB_COLOR_READ_INPUT) {
1724
ntq_emit_color_read(c, instr);
1725
return;
1726
}
1727
1728
uint32_t offset = nir_intrinsic_base(instr) +
1729
nir_src_as_uint(instr->src[0]);
1730
int comp = nir_intrinsic_component(instr);
1731
ntq_store_dest(c, &instr->dest, 0,
1732
qir_MOV(c, c->inputs[offset * 4 + comp]));
1733
}
1734
1735
static void
1736
ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
1737
{
1738
unsigned offset;
1739
1740
switch (instr->intrinsic) {
1741
case nir_intrinsic_load_uniform:
1742
assert(instr->num_components == 1);
1743
if (nir_src_is_const(instr->src[0])) {
1744
offset = nir_intrinsic_base(instr) +
1745
nir_src_as_uint(instr->src[0]);
1746
assert(offset % 4 == 0);
1747
/* We need dwords */
1748
offset = offset / 4;
1749
ntq_store_dest(c, &instr->dest, 0,
1750
qir_uniform(c, QUNIFORM_UNIFORM,
1751
offset));
1752
} else {
1753
ntq_store_dest(c, &instr->dest, 0,
1754
indirect_uniform_load(c, instr));
1755
}
1756
break;
1757
1758
case nir_intrinsic_load_ubo:
1759
assert(instr->num_components == 1);
1760
ntq_store_dest(c, &instr->dest, 0, vc4_ubo_load(c, instr));
1761
break;
1762
1763
case nir_intrinsic_load_user_clip_plane:
1764
for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
1765
ntq_store_dest(c, &instr->dest, i,
1766
qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
1767
nir_intrinsic_ucp_id(instr) *
1768
4 + i));
1769
}
1770
break;
1771
1772
case nir_intrinsic_load_blend_const_color_r_float:
1773
case nir_intrinsic_load_blend_const_color_g_float:
1774
case nir_intrinsic_load_blend_const_color_b_float:
1775
case nir_intrinsic_load_blend_const_color_a_float:
1776
ntq_store_dest(c, &instr->dest, 0,
1777
qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_X +
1778
(instr->intrinsic -
1779
nir_intrinsic_load_blend_const_color_r_float),
1780
0));
1781
break;
1782
1783
case nir_intrinsic_load_blend_const_color_rgba8888_unorm:
1784
ntq_store_dest(c, &instr->dest, 0,
1785
qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_RGBA,
1786
0));
1787
break;
1788
1789
case nir_intrinsic_load_blend_const_color_aaaa8888_unorm:
1790
ntq_store_dest(c, &instr->dest, 0,
1791
qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_AAAA,
1792
0));
1793
break;
1794
1795
case nir_intrinsic_load_sample_mask_in:
1796
ntq_store_dest(c, &instr->dest, 0,
1797
qir_uniform(c, QUNIFORM_SAMPLE_MASK, 0));
1798
break;
1799
1800
case nir_intrinsic_load_front_face:
1801
/* The register contains 0 (front) or 1 (back), and we need to
1802
* turn it into a NIR bool where true means front.
1803
*/
1804
ntq_store_dest(c, &instr->dest, 0,
1805
qir_ADD(c,
1806
qir_uniform_ui(c, -1),
1807
qir_reg(QFILE_FRAG_REV_FLAG, 0)));
1808
break;
1809
1810
case nir_intrinsic_load_input:
1811
ntq_emit_load_input(c, instr);
1812
break;
1813
1814
case nir_intrinsic_store_output:
1815
assert(nir_src_is_const(instr->src[1]) &&
1816
"vc4 doesn't support indirect outputs");
1817
offset = nir_intrinsic_base(instr) +
1818
nir_src_as_uint(instr->src[1]);
1819
1820
/* MSAA color outputs are the only case where we have an
1821
* output that's not lowered to being a store of a single 32
1822
* bit value.
1823
*/
1824
if (c->stage == QSTAGE_FRAG && instr->num_components == 4) {
1825
assert(offset == c->output_color_index);
1826
for (int i = 0; i < 4; i++) {
1827
c->sample_colors[i] =
1828
qir_MOV(c, ntq_get_src(c, instr->src[0],
1829
i));
1830
}
1831
} else {
1832
offset = offset * 4 + nir_intrinsic_component(instr);
1833
assert(instr->num_components == 1);
1834
c->outputs[offset] =
1835
qir_MOV(c, ntq_get_src(c, instr->src[0], 0));
1836
c->num_outputs = MAX2(c->num_outputs, offset + 1);
1837
}
1838
break;
1839
1840
case nir_intrinsic_discard:
1841
if (c->execute.file != QFILE_NULL) {
1842
qir_SF(c, c->execute);
1843
qir_MOV_cond(c, QPU_COND_ZS, c->discard,
1844
qir_uniform_ui(c, ~0));
1845
} else {
1846
qir_MOV_dest(c, c->discard, qir_uniform_ui(c, ~0));
1847
}
1848
break;
1849
1850
case nir_intrinsic_discard_if: {
1851
/* true (~0) if we're discarding */
1852
struct qreg cond = ntq_get_src(c, instr->src[0], 0);
1853
1854
if (c->execute.file != QFILE_NULL) {
1855
/* execute == 0 means the channel is active. Invert
1856
* the condition so that we can use zero as "executing
1857
* and discarding."
1858
*/
1859
qir_SF(c, qir_AND(c, c->execute, qir_NOT(c, cond)));
1860
qir_MOV_cond(c, QPU_COND_ZS, c->discard, cond);
1861
} else {
1862
qir_OR_dest(c, c->discard, c->discard,
1863
ntq_get_src(c, instr->src[0], 0));
1864
}
1865
1866
break;
1867
}
1868
1869
case nir_intrinsic_load_texture_rect_scaling: {
1870
assert(nir_src_is_const(instr->src[0]));
1871
int sampler = nir_src_as_int(instr->src[0]);
1872
1873
ntq_store_dest(c, &instr->dest, 0,
1874
qir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, sampler));
1875
ntq_store_dest(c, &instr->dest, 1,
1876
qir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, sampler));
1877
break;
1878
}
1879
1880
default:
1881
fprintf(stderr, "Unknown intrinsic: ");
1882
nir_print_instr(&instr->instr, stderr);
1883
fprintf(stderr, "\n");
1884
break;
1885
}
1886
}
1887
1888
/* Clears (activates) the execute flags for any channels whose jump target
1889
* matches this block.
1890
*/
1891
static void
1892
ntq_activate_execute_for_block(struct vc4_compile *c)
1893
{
1894
qir_SF(c, qir_SUB(c,
1895
c->execute,
1896
qir_uniform_ui(c, c->cur_block->index)));
1897
qir_MOV_cond(c, QPU_COND_ZS, c->execute, qir_uniform_ui(c, 0));
1898
}

static void
ntq_emit_if(struct vc4_compile *c, nir_if *if_stmt)
{
        if (!c->vc4->screen->has_control_flow) {
                fprintf(stderr,
                        "IF statement support requires updated kernel.\n");
                return;
        }

        nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
        bool empty_else_block =
                (nir_else_block == nir_if_last_else_block(if_stmt) &&
                 exec_list_is_empty(&nir_else_block->instr_list));

        struct qblock *then_block = qir_new_block(c);
        struct qblock *after_block = qir_new_block(c);
        struct qblock *else_block;
        if (empty_else_block)
                else_block = after_block;
        else
                else_block = qir_new_block(c);

        bool was_top_level = false;
        if (c->execute.file == QFILE_NULL) {
                c->execute = qir_MOV(c, qir_uniform_ui(c, 0));
                was_top_level = true;
        }

        /* Set ZS for executing (execute == 0) and jumping (if->condition ==
         * 0) channels, and then update execute flags for those to point to
         * the ELSE block.
         */
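        /* OR(execute, condition) is zero only for channels that are both
         * active (execute == 0) and have a false (zero) condition, i.e.
         * exactly the channels that must be redirected to the ELSE block.
         */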
        qir_SF(c, qir_OR(c,
                         c->execute,
                         ntq_get_src(c, if_stmt->condition, 0)));
        qir_MOV_cond(c, QPU_COND_ZS, c->execute,
                     qir_uniform_ui(c, else_block->index));

        /* Jump to ELSE if nothing is active for THEN, otherwise fall
         * through.
         */
        qir_SF(c, c->execute);
        qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZC);
        qir_link_blocks(c->cur_block, else_block);
        qir_link_blocks(c->cur_block, then_block);

        /* Process the THEN block. */
        qir_set_emit_block(c, then_block);
        ntq_emit_cf_list(c, &if_stmt->then_list);

        if (!empty_else_block) {
                /* Handle the end of the THEN block. First, all currently
                 * active channels update their execute flags to point to
                 * ENDIF
                 */
                qir_SF(c, c->execute);
                qir_MOV_cond(c, QPU_COND_ZS, c->execute,
                             qir_uniform_ui(c, after_block->index));

                /* If everything points at ENDIF, then jump there immediately. */
                qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, after_block->index)));
                qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS);
                qir_link_blocks(c->cur_block, after_block);
                qir_link_blocks(c->cur_block, else_block);

                qir_set_emit_block(c, else_block);
                ntq_activate_execute_for_block(c);
                ntq_emit_cf_list(c, &if_stmt->else_list);
        }

        qir_link_blocks(c->cur_block, after_block);

        qir_set_emit_block(c, after_block);
        if (was_top_level) {
                c->execute = c->undef;
                c->last_top_block = c->cur_block;
        } else {
                ntq_activate_execute_for_block(c);
        }
}

static void
ntq_emit_jump(struct vc4_compile *c, nir_jump_instr *jump)
{
        struct qblock *jump_block;
        switch (jump->type) {
        case nir_jump_break:
                jump_block = c->loop_break_block;
                break;
        case nir_jump_continue:
                jump_block = c->loop_cont_block;
                break;
        default:
                unreachable("Unsupported jump type\n");
        }

        qir_SF(c, c->execute);
        qir_MOV_cond(c, QPU_COND_ZS, c->execute,
                     qir_uniform_ui(c, jump_block->index));

        /* Jump to the destination block if everyone has taken the jump. */
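        /* Otherwise we fall through into a fresh block, with the jumping
         * channels masked off by their nonzero execute values.
         */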
        qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, jump_block->index)));
        qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS);
        struct qblock *new_block = qir_new_block(c);
        qir_link_blocks(c->cur_block, jump_block);
        qir_link_blocks(c->cur_block, new_block);
        qir_set_emit_block(c, new_block);
}

static void
ntq_emit_instr(struct vc4_compile *c, nir_instr *instr)
{
        switch (instr->type) {
        case nir_instr_type_alu:
                ntq_emit_alu(c, nir_instr_as_alu(instr));
                break;

        case nir_instr_type_intrinsic:
                ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
                break;

        case nir_instr_type_load_const:
                ntq_emit_load_const(c, nir_instr_as_load_const(instr));
                break;

        case nir_instr_type_ssa_undef:
                ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
                break;

        case nir_instr_type_tex:
                ntq_emit_tex(c, nir_instr_as_tex(instr));
                break;

        case nir_instr_type_jump:
                ntq_emit_jump(c, nir_instr_as_jump(instr));
                break;

        default:
                fprintf(stderr, "Unknown NIR instr type: ");
                nir_print_instr(instr, stderr);
                fprintf(stderr, "\n");
                abort();
        }
}

static void
ntq_emit_block(struct vc4_compile *c, nir_block *block)
{
        nir_foreach_instr(instr, block) {
                ntq_emit_instr(c, instr);
        }
}

static void ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list);

static void
ntq_emit_loop(struct vc4_compile *c, nir_loop *loop)
{
        if (!c->vc4->screen->has_control_flow) {
                fprintf(stderr,
                        "loop support requires updated kernel.\n");
                ntq_emit_cf_list(c, &loop->body);
                return;
        }

        bool was_top_level = false;
        if (c->execute.file == QFILE_NULL) {
                c->execute = qir_MOV(c, qir_uniform_ui(c, 0));
                was_top_level = true;
        }

        struct qblock *save_loop_cont_block = c->loop_cont_block;
        struct qblock *save_loop_break_block = c->loop_break_block;

        c->loop_cont_block = qir_new_block(c);
        c->loop_break_block = qir_new_block(c);

        qir_link_blocks(c->cur_block, c->loop_cont_block);
        qir_set_emit_block(c, c->loop_cont_block);
        ntq_activate_execute_for_block(c);

        ntq_emit_cf_list(c, &loop->body);

        /* If anything had explicitly continued, or is here at the end of the
         * loop, then we need to loop again. SF updates are masked by the
         * instruction's condition, so we can do the OR of the two conditions
         * within SF.
         */
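        /* The first SF sets Z for channels that reached the end of the body
         * (execute == 0); the conditional SUB then updates the flags only
         * for the remaining channels, setting Z where execute equals the
         * continue block's index, so BRANCH_ANY_ZS loops again if either
         * group is non-empty.
         */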
        qir_SF(c, c->execute);
        struct qinst *cont_check =
                qir_SUB_dest(c,
                             c->undef,
                             c->execute,
                             qir_uniform_ui(c, c->loop_cont_block->index));
        cont_check->cond = QPU_COND_ZC;
        cont_check->sf = true;

        qir_BRANCH(c, QPU_COND_BRANCH_ANY_ZS);
        qir_link_blocks(c->cur_block, c->loop_cont_block);
        qir_link_blocks(c->cur_block, c->loop_break_block);

        qir_set_emit_block(c, c->loop_break_block);
        if (was_top_level) {
                c->execute = c->undef;
                c->last_top_block = c->cur_block;
        } else {
                ntq_activate_execute_for_block(c);
        }

        c->loop_break_block = save_loop_break_block;
        c->loop_cont_block = save_loop_cont_block;
}

static void
ntq_emit_function(struct vc4_compile *c, nir_function_impl *func)
{
        fprintf(stderr, "FUNCTIONS not handled.\n");
        abort();
}

static void
ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list)
{
        foreach_list_typed(nir_cf_node, node, node, list) {
                switch (node->type) {
                case nir_cf_node_block:
                        ntq_emit_block(c, nir_cf_node_as_block(node));
                        break;

                case nir_cf_node_if:
                        ntq_emit_if(c, nir_cf_node_as_if(node));
                        break;

                case nir_cf_node_loop:
                        ntq_emit_loop(c, nir_cf_node_as_loop(node));
                        break;

                case nir_cf_node_function:
                        ntq_emit_function(c, nir_cf_node_as_function(node));
                        break;

                default:
                        fprintf(stderr, "Unknown NIR node type\n");
                        abort();
                }
        }
}

static void
ntq_emit_impl(struct vc4_compile *c, nir_function_impl *impl)
{
        ntq_setup_registers(c, &impl->registers);
        ntq_emit_cf_list(c, &impl->body);
}

static void
nir_to_qir(struct vc4_compile *c)
{
        if (c->stage == QSTAGE_FRAG && c->s->info.fs.uses_discard)
                c->discard = qir_MOV(c, qir_uniform_ui(c, 0));

        ntq_setup_inputs(c);
        ntq_setup_outputs(c);

        /* Find the main function and emit the body. */
        nir_foreach_function(function, c->s) {
                assert(strcmp(function->name, "main") == 0);
                assert(function->impl);
                ntq_emit_impl(c, function->impl);
        }
}
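
/* Compiler options handed to NIR: they request lowering of operations the
 * QPU can't do directly (fdiv, fpow, fsqrt, ffma, rotates, etc.) into
 * sequences of ops the backend does implement, and scalarization, since QIR
 * is a scalar IR.
 */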
static const nir_shader_compiler_options nir_options = {
        .lower_all_io_to_temps = true,
        .lower_extract_byte = true,
        .lower_extract_word = true,
        .lower_insert_byte = true,
        .lower_insert_word = true,
        .lower_fdiv = true,
        .lower_ffma16 = true,
        .lower_ffma32 = true,
        .lower_ffma64 = true,
        .lower_flrp32 = true,
        .lower_fmod = true,
        .lower_fpow = true,
        .lower_fsat = true,
        .lower_fsqrt = true,
        .lower_ldexp = true,
        .lower_fneg = true,
        .lower_ineg = true,
        .lower_rotate = true,
        .lower_to_scalar = true,
        .lower_umax = true,
        .lower_umin = true,
        .lower_isign = true,
        .has_fsub = true,
        .has_isub = true,
        .max_unroll_iterations = 32,
};

const void *
vc4_screen_get_compiler_options(struct pipe_screen *pscreen,
                                enum pipe_shader_ir ir,
                                enum pipe_shader_type shader)
{
        return &nir_options;
}

static int
count_nir_instrs(nir_shader *nir)
{
        int count = 0;
        nir_foreach_function(function, nir) {
                if (!function->impl)
                        continue;
                nir_foreach_block(block, function->impl) {
                        nir_foreach_instr(instr, block)
                                count++;
                }
        }
        return count;
}

static struct vc4_compile *
vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
               struct vc4_key *key, bool fs_threaded)
{
        struct vc4_compile *c = qir_compile_init();

        c->vc4 = vc4;
        c->stage = stage;
        c->shader_state = &key->shader_state->base;
        c->program_id = key->shader_state->program_id;
        c->variant_id =
                p_atomic_inc_return(&key->shader_state->compiled_variant_count);
        c->fs_threaded = fs_threaded;

        c->key = key;
        switch (stage) {
        case QSTAGE_FRAG:
                c->fs_key = (struct vc4_fs_key *)key;
                if (c->fs_key->is_points) {
                        c->point_x = emit_fragment_varying(c, ~0, 0);
                        c->point_y = emit_fragment_varying(c, ~0, 0);
                } else if (c->fs_key->is_lines) {
                        c->line_x = emit_fragment_varying(c, ~0, 0);
                }
                break;
        case QSTAGE_VERT:
                c->vs_key = (struct vc4_vs_key *)key;
                break;
        case QSTAGE_COORD:
                c->vs_key = (struct vc4_vs_key *)key;
                break;
        }

        c->s = nir_shader_clone(c, key->shader_state->base.ir.nir);

        if (stage == QSTAGE_FRAG) {
                NIR_PASS_V(c->s, vc4_nir_lower_blend, c);
        }

        struct nir_lower_tex_options tex_options = {
                .lower_txp = ~0,

                /* Apply swizzles to all samplers. */
                .swizzle_result = ~0,
        };

        /* Lower the format swizzle and ARB_texture_swizzle-style swizzle.
         * The format swizzling applies before sRGB decode, and
         * ARB_texture_swizzle is the last thing before returning the sample.
         */
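        /* Compose the two here: an ARB-style swizzle selecting R/G/B/A
         * (values 0-3) is remapped through the format's storage swizzle,
         * while constant zero/one selections (values > 3) pass through
         * unchanged.
         */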
        for (int i = 0; i < ARRAY_SIZE(key->tex); i++) {
                enum pipe_format format = c->key->tex[i].format;

                if (!format)
                        continue;

                const uint8_t *format_swizzle = vc4_get_format_swizzle(format);

                for (int j = 0; j < 4; j++) {
                        uint8_t arb_swiz = c->key->tex[i].swizzle[j];

                        if (arb_swiz <= 3) {
                                tex_options.swizzles[i][j] =
                                        format_swizzle[arb_swiz];
                        } else {
                                tex_options.swizzles[i][j] = arb_swiz;
                        }
                }

                if (util_format_is_srgb(format))
                        tex_options.lower_srgb |= (1 << i);
        }

        NIR_PASS_V(c->s, nir_lower_tex, &tex_options);

        if (c->key->ucp_enables) {
                if (stage == QSTAGE_FRAG) {
                        NIR_PASS_V(c->s, nir_lower_clip_fs,
                                   c->key->ucp_enables, false);
                } else {
                        NIR_PASS_V(c->s, nir_lower_clip_vs,
                                   c->key->ucp_enables, false, false, NULL);
                        NIR_PASS_V(c->s, nir_lower_io_to_scalar,
                                   nir_var_shader_out);
                }
        }

        /* FS input scalarizing must happen after nir_lower_two_sided_color,
         * which only handles a vec4 at a time. Similarly, VS output
         * scalarizing must happen after nir_lower_clip_vs.
         */
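        /* nir_lower_io_to_scalar splits vec4 I/O intrinsics into
         * per-component ones, which is why its ordering against the
         * vec4-based lowering passes above matters.
         */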
        if (c->stage == QSTAGE_FRAG)
                NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
        else
                NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);

        NIR_PASS_V(c->s, vc4_nir_lower_io, c);
        NIR_PASS_V(c->s, vc4_nir_lower_txf_ms, c);
        nir_lower_idiv_options idiv_options = {
                .imprecise_32bit_lowering = true,
                .allow_fp16 = true,
        };
        NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options);

        vc4_optimize_nir(c->s);

        /* Do late algebraic optimization to turn add(a, neg(b)) back into
         * subs, then the mandatory cleanup after algebraic. Note that it may
         * produce fnegs, and if so then we need to keep running to squash
         * fneg(fneg(a)).
         */
        bool more_late_algebraic = true;
        while (more_late_algebraic) {
                more_late_algebraic = false;
                NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
                NIR_PASS_V(c->s, nir_opt_constant_folding);
                NIR_PASS_V(c->s, nir_copy_prop);
                NIR_PASS_V(c->s, nir_opt_dce);
                NIR_PASS_V(c->s, nir_opt_cse);
        }

        NIR_PASS_V(c->s, nir_lower_bool_to_int32);

        NIR_PASS_V(c->s, nir_convert_from_ssa, true);

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d NIR instructions\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        count_nir_instrs(c->s));
        }

        if (vc4_debug & VC4_DEBUG_NIR) {
                fprintf(stderr, "%s prog %d/%d NIR:\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id);
                nir_print_shader(c->s, stderr);
        }

        nir_to_qir(c);

        switch (stage) {
        case QSTAGE_FRAG:
                /* FS threading requires that the thread execute
                 * QPU_SIG_LAST_THREAD_SWITCH exactly once before terminating
                 * (with no other THRSW afterwards, obviously). If we didn't
                 * fetch a texture at a top level block, this wouldn't be
                 * true.
                 */
                if (c->fs_threaded && !c->last_thrsw_at_top_level) {
                        c->failed = true;
                        return c;
                }

                emit_frag_end(c);
                break;
        case QSTAGE_VERT:
                emit_vert_end(c,
                              c->vs_key->fs_inputs->input_slots,
                              c->vs_key->fs_inputs->num_inputs);
                break;
        case QSTAGE_COORD:
                emit_coord_end(c);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QIR) {
                fprintf(stderr, "%s prog %d/%d pre-opt QIR:\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id);
                qir_dump(c);
                fprintf(stderr, "\n");
        }

        qir_optimize(c);
        qir_lower_uniforms(c);

        qir_schedule_instructions(c);
        qir_emit_uniform_stream_resets(c);

        if (vc4_debug & VC4_DEBUG_QIR) {
                fprintf(stderr, "%s prog %d/%d QIR:\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id);
                qir_dump(c);
                fprintf(stderr, "\n");
        }

        qir_reorder_uniforms(c);
        vc4_generate_code(vc4, c);

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d instructions\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        c->qpu_inst_count);
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d uniforms\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        c->num_uniforms);
        }

        ralloc_free(c->s);

        return c;
}

static void *
vc4_shader_state_create(struct pipe_context *pctx,
                        const struct pipe_shader_state *cso)
{
        struct vc4_context *vc4 = vc4_context(pctx);
        struct vc4_uncompiled_shader *so = CALLOC_STRUCT(vc4_uncompiled_shader);
        if (!so)
                return NULL;

        so->program_id = vc4->next_uncompiled_program_id++;

        nir_shader *s;

        if (cso->type == PIPE_SHADER_IR_NIR) {
                /* The backend takes ownership of the NIR shader on state
                 * creation.
                 */
                s = cso->ir.nir;
        } else {
                assert(cso->type == PIPE_SHADER_IR_TGSI);

                if (vc4_debug & VC4_DEBUG_TGSI) {
                        fprintf(stderr, "prog %d TGSI:\n",
                                so->program_id);
                        tgsi_dump(cso->tokens, 0);
                        fprintf(stderr, "\n");
                }
                s = tgsi_to_nir(cso->tokens, pctx->screen, false);
        }

        if (s->info.stage == MESA_SHADER_VERTEX)
                NIR_PASS_V(s, nir_lower_point_size, 1.0f, 0.0f);

        NIR_PASS_V(s, nir_lower_io,
                   nir_var_shader_in | nir_var_shader_out | nir_var_uniform,
                   type_size, (nir_lower_io_options)0);

        NIR_PASS_V(s, nir_lower_regs_to_ssa);
        NIR_PASS_V(s, nir_normalize_cubemap_coords);

        NIR_PASS_V(s, nir_lower_load_const_to_scalar);

        vc4_optimize_nir(s);

        NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);

        /* Garbage collect dead instructions */
        nir_sweep(s);

        so->base.type = PIPE_SHADER_IR_NIR;
        so->base.ir.nir = s;

        if (vc4_debug & VC4_DEBUG_NIR) {
                fprintf(stderr, "%s prog %d NIR:\n",
                        gl_shader_stage_name(s->info.stage),
                        so->program_id);
                nir_print_shader(s, stderr);
                fprintf(stderr, "\n");
        }

        return so;
}

static void
copy_uniform_state_to_shader(struct vc4_compiled_shader *shader,
                             struct vc4_compile *c)
{
        int count = c->num_uniforms;
        struct vc4_shader_uniform_info *uinfo = &shader->uniforms;

        uinfo->count = count;
        uinfo->data = ralloc_array(shader, uint32_t, count);
        memcpy(uinfo->data, c->uniform_data,
               count * sizeof(*uinfo->data));
        uinfo->contents = ralloc_array(shader, enum quniform_contents, count);
        memcpy(uinfo->contents, c->uniform_contents,
               count * sizeof(*uinfo->contents));
        uinfo->num_texture_samples = c->num_texture_samples;

        vc4_set_shader_uniform_dirty_flags(shader);
}

static void
vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c,
                             struct vc4_compiled_shader *shader)
{
        struct vc4_fs_inputs inputs;

        memset(&inputs, 0, sizeof(inputs));
        inputs.input_slots = ralloc_array(shader,
                                          struct vc4_varying_slot,
                                          c->num_input_slots);

        bool input_live[c->num_input_slots];

        memset(input_live, 0, sizeof(input_live));
        qir_for_each_inst_inorder(inst, c) {
                for (int i = 0; i < qir_get_nsrc(inst); i++) {
                        if (inst->src[i].file == QFILE_VARY)
                                input_live[inst->src[i].index] = true;
                }
        }

        for (int i = 0; i < c->num_input_slots; i++) {
                struct vc4_varying_slot *slot = &c->input_slots[i];

                if (!input_live[i])
                        continue;

                /* Skip non-VS-output inputs. */
                if (slot->slot == (uint8_t)~0)
                        continue;

                if (slot->slot == VARYING_SLOT_COL0 ||
                    slot->slot == VARYING_SLOT_COL1 ||
                    slot->slot == VARYING_SLOT_BFC0 ||
                    slot->slot == VARYING_SLOT_BFC1) {
                        shader->color_inputs |= (1 << inputs.num_inputs);
                }

                inputs.input_slots[inputs.num_inputs] = *slot;
                inputs.num_inputs++;
        }
        shader->num_inputs = inputs.num_inputs;

        /* Add our set of inputs to the set of all inputs seen. This way, we
         * can have a single pointer that identifies an FS inputs set,
         * allowing VS to avoid recompiling when the FS is recompiled (or a
         * new one is bound using separate shader objects) but the inputs
         * don't change.
         */
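        /* Identity is by content: fs_inputs_hash()/fs_inputs_compare() below
         * hash and compare the slot array, so FS variants with the same set
         * of live inputs share one vc4_fs_inputs pointer and the VS key only
         * has to store that pointer.
         */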
        struct set_entry *entry = _mesa_set_search(vc4->fs_inputs_set, &inputs);
        if (entry) {
                shader->fs_inputs = entry->key;
                ralloc_free(inputs.input_slots);
        } else {
                struct vc4_fs_inputs *alloc_inputs;

                alloc_inputs = rzalloc(vc4->fs_inputs_set, struct vc4_fs_inputs);
                memcpy(alloc_inputs, &inputs, sizeof(inputs));
                ralloc_steal(alloc_inputs, inputs.input_slots);
                _mesa_set_add(vc4->fs_inputs_set, alloc_inputs);

                shader->fs_inputs = alloc_inputs;
        }
}

static struct vc4_compiled_shader *
vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
                        struct vc4_key *key)
{
        struct hash_table *ht;
        uint32_t key_size;
        bool try_threading;

        if (stage == QSTAGE_FRAG) {
                ht = vc4->fs_cache;
                key_size = sizeof(struct vc4_fs_key);
                try_threading = vc4->screen->has_threaded_fs;
        } else {
                ht = vc4->vs_cache;
                key_size = sizeof(struct vc4_vs_key);
                try_threading = false;
        }

        struct vc4_compiled_shader *shader;
        struct hash_entry *entry = _mesa_hash_table_search(ht, key);
        if (entry)
                return entry->data;

        struct vc4_compile *c = vc4_shader_ntq(vc4, stage, key, try_threading);
        /* If the FS failed to compile threaded, fall back to single threaded. */
        if (try_threading && c->failed) {
                qir_compile_destroy(c);
                c = vc4_shader_ntq(vc4, stage, key, false);
        }

        shader = rzalloc(NULL, struct vc4_compiled_shader);

        shader->program_id = vc4->next_compiled_program_id++;
        if (stage == QSTAGE_FRAG) {
                vc4_setup_compiled_fs_inputs(vc4, c, shader);

                /* Note: the temporary clone in c->s has been freed. */
                nir_shader *orig_shader = key->shader_state->base.ir.nir;
                if (orig_shader->info.outputs_written & (1 << FRAG_RESULT_DEPTH))
                        shader->disable_early_z = true;
        } else {
                shader->num_inputs = c->num_inputs;

                shader->vattr_offsets[0] = 0;
                for (int i = 0; i < 8; i++) {
                        shader->vattr_offsets[i + 1] =
                                shader->vattr_offsets[i] + c->vattr_sizes[i];

                        if (c->vattr_sizes[i])
                                shader->vattrs_live |= (1 << i);
                }
        }

        shader->failed = c->failed;
        if (c->failed) {
                shader->failed = true;
        } else {
                copy_uniform_state_to_shader(shader, c);
                shader->bo = vc4_bo_alloc_shader(vc4->screen, c->qpu_insts,
                                                 c->qpu_inst_count *
                                                 sizeof(uint64_t));
        }

        shader->fs_threaded = c->fs_threaded;

        if ((vc4_debug & VC4_DEBUG_SHADERDB) && stage == QSTAGE_FRAG) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d FS threads\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        1 + shader->fs_threaded);
        }

        qir_compile_destroy(c);

        struct vc4_key *dup_key;
        dup_key = rzalloc_size(shader, key_size); /* TODO: don't use rzalloc */
        memcpy(dup_key, key, key_size);
        _mesa_hash_table_insert(ht, dup_key, shader);

        return shader;
}

static void
vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key,
                     struct vc4_texture_stateobj *texstate)
{
        for (int i = 0; i < texstate->num_textures; i++) {
                struct pipe_sampler_view *sampler = texstate->textures[i];
                struct vc4_sampler_view *vc4_sampler = vc4_sampler_view(sampler);
                struct pipe_sampler_state *sampler_state =
                        texstate->samplers[i];

                if (!sampler)
                        continue;

                key->tex[i].format = sampler->format;
                key->tex[i].swizzle[0] = sampler->swizzle_r;
                key->tex[i].swizzle[1] = sampler->swizzle_g;
                key->tex[i].swizzle[2] = sampler->swizzle_b;
                key->tex[i].swizzle[3] = sampler->swizzle_a;

                if (sampler->texture->nr_samples > 1) {
                        key->tex[i].msaa_width = sampler->texture->width0;
                        key->tex[i].msaa_height = sampler->texture->height0;
                } else if (sampler) {
                        key->tex[i].compare_mode = sampler_state->compare_mode;
                        key->tex[i].compare_func = sampler_state->compare_func;
                        key->tex[i].wrap_s = sampler_state->wrap_s;
                        key->tex[i].wrap_t = sampler_state->wrap_t;
                        key->tex[i].force_first_level =
                                vc4_sampler->force_first_level;
                }
        }

        key->ucp_enables = vc4->rasterizer->base.clip_plane_enable;
}

static void
vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode)
{
        struct vc4_job *job = vc4->job;
        struct vc4_fs_key local_key;
        struct vc4_fs_key *key = &local_key;

        if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE |
                            VC4_DIRTY_BLEND |
                            VC4_DIRTY_FRAMEBUFFER |
                            VC4_DIRTY_ZSA |
                            VC4_DIRTY_RASTERIZER |
                            VC4_DIRTY_SAMPLE_MASK |
                            VC4_DIRTY_FRAGTEX |
                            VC4_DIRTY_UNCOMPILED_FS |
                            VC4_DIRTY_UBO_1_SIZE))) {
                return;
        }

        memset(key, 0, sizeof(*key));
        vc4_setup_shared_key(vc4, &key->base, &vc4->fragtex);
        key->base.shader_state = vc4->prog.bind_fs;
        key->is_points = (prim_mode == PIPE_PRIM_POINTS);
        key->is_lines = (prim_mode >= PIPE_PRIM_LINES &&
                         prim_mode <= PIPE_PRIM_LINE_STRIP);
        key->blend = vc4->blend->rt[0];
        if (vc4->blend->logicop_enable) {
                key->logicop_func = vc4->blend->logicop_func;
        } else {
                key->logicop_func = PIPE_LOGICOP_COPY;
        }
        if (job->msaa) {
                key->msaa = vc4->rasterizer->base.multisample;
                key->sample_coverage = (vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1);
                key->sample_alpha_to_coverage = vc4->blend->alpha_to_coverage;
                key->sample_alpha_to_one = vc4->blend->alpha_to_one;
        }

        if (vc4->framebuffer.cbufs[0])
                key->color_format = vc4->framebuffer.cbufs[0]->format;

        key->stencil_enabled = vc4->zsa->stencil_uniforms[0] != 0;
        key->stencil_twoside = vc4->zsa->stencil_uniforms[1] != 0;
        key->stencil_full_writemasks = vc4->zsa->stencil_uniforms[2] != 0;
        key->depth_enabled = (vc4->zsa->base.depth_enabled ||
                              key->stencil_enabled);

        if (key->is_points) {
                key->point_sprite_mask =
                        vc4->rasterizer->base.sprite_coord_enable;
                key->point_coord_upper_left =
                        (vc4->rasterizer->base.sprite_coord_mode ==
                         PIPE_SPRITE_COORD_UPPER_LEFT);
        }

        key->ubo_1_size = vc4->constbuf[PIPE_SHADER_FRAGMENT].cb[1].buffer_size;

        struct vc4_compiled_shader *old_fs = vc4->prog.fs;
        vc4->prog.fs = vc4_get_compiled_shader(vc4, QSTAGE_FRAG, &key->base);
        if (vc4->prog.fs == old_fs)
                return;

        vc4->dirty |= VC4_DIRTY_COMPILED_FS;

        if (vc4->rasterizer->base.flatshade &&
            (!old_fs || vc4->prog.fs->color_inputs != old_fs->color_inputs)) {
                vc4->dirty |= VC4_DIRTY_FLAT_SHADE_FLAGS;
        }

        if (!old_fs || vc4->prog.fs->fs_inputs != old_fs->fs_inputs)
                vc4->dirty |= VC4_DIRTY_FS_INPUTS;
}

static void
vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode)
{
        struct vc4_vs_key local_key;
        struct vc4_vs_key *key = &local_key;

        if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE |
                            VC4_DIRTY_RASTERIZER |
                            VC4_DIRTY_VERTTEX |
                            VC4_DIRTY_VTXSTATE |
                            VC4_DIRTY_UNCOMPILED_VS |
                            VC4_DIRTY_FS_INPUTS))) {
                return;
        }

        memset(key, 0, sizeof(*key));
        vc4_setup_shared_key(vc4, &key->base, &vc4->verttex);
        key->base.shader_state = vc4->prog.bind_vs;
        key->fs_inputs = vc4->prog.fs->fs_inputs;

        for (int i = 0; i < ARRAY_SIZE(key->attr_formats); i++)
                key->attr_formats[i] = vc4->vtx->pipe[i].src_format;

        key->per_vertex_point_size =
                (prim_mode == PIPE_PRIM_POINTS &&
                 vc4->rasterizer->base.point_size_per_vertex);

        struct vc4_compiled_shader *vs =
                vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base);
        if (vs != vc4->prog.vs) {
                vc4->prog.vs = vs;
                vc4->dirty |= VC4_DIRTY_COMPILED_VS;
        }

        key->is_coord = true;
        /* Coord shaders don't care what the FS inputs are. */
        key->fs_inputs = NULL;
        struct vc4_compiled_shader *cs =
                vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base);
        if (cs != vc4->prog.cs) {
                vc4->prog.cs = cs;
                vc4->dirty |= VC4_DIRTY_COMPILED_CS;
        }
}

bool
vc4_update_compiled_shaders(struct vc4_context *vc4, uint8_t prim_mode)
{
        vc4_update_compiled_fs(vc4, prim_mode);
        vc4_update_compiled_vs(vc4, prim_mode);

        return !(vc4->prog.cs->failed ||
                 vc4->prog.vs->failed ||
                 vc4->prog.fs->failed);
}
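
/* The shader caches hash and memcmp() the whole key struct, including any
 * padding, so callers must fully zero their keys (see the memset() in
 * vc4_update_compiled_fs()/vs() above) before filling them in.
 */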
static uint32_t
fs_cache_hash(const void *key)
{
        return _mesa_hash_data(key, sizeof(struct vc4_fs_key));
}

static uint32_t
vs_cache_hash(const void *key)
{
        return _mesa_hash_data(key, sizeof(struct vc4_vs_key));
}

static bool
fs_cache_compare(const void *key1, const void *key2)
{
        return memcmp(key1, key2, sizeof(struct vc4_fs_key)) == 0;
}

static bool
vs_cache_compare(const void *key1, const void *key2)
{
        return memcmp(key1, key2, sizeof(struct vc4_vs_key)) == 0;
}

static uint32_t
fs_inputs_hash(const void *key)
{
        const struct vc4_fs_inputs *inputs = key;

        return _mesa_hash_data(inputs->input_slots,
                               sizeof(*inputs->input_slots) *
                               inputs->num_inputs);
}

static bool
fs_inputs_compare(const void *key1, const void *key2)
{
        const struct vc4_fs_inputs *inputs1 = key1;
        const struct vc4_fs_inputs *inputs2 = key2;

        return (inputs1->num_inputs == inputs2->num_inputs &&
                memcmp(inputs1->input_slots,
                       inputs2->input_slots,
                       sizeof(*inputs1->input_slots) *
                       inputs1->num_inputs) == 0);
}

static void
delete_from_cache_if_matches(struct hash_table *ht,
                             struct vc4_compiled_shader **last_compile,
                             struct hash_entry *entry,
                             struct vc4_uncompiled_shader *so)
{
        const struct vc4_key *key = entry->key;

        if (key->shader_state == so) {
                struct vc4_compiled_shader *shader = entry->data;
                _mesa_hash_table_remove(ht, entry);
                vc4_bo_unreference(&shader->bo);

                if (shader == *last_compile)
                        *last_compile = NULL;

                ralloc_free(shader);
        }
}
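
/* Cache entries are removed while iterating below; this relies on Mesa's
 * util hash table marking removed entries as deleted rather than rehashing,
 * so hash_table_foreach() remains valid across the removal.
 */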
static void
vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso)
{
        struct vc4_context *vc4 = vc4_context(pctx);
        struct vc4_uncompiled_shader *so = hwcso;

        hash_table_foreach(vc4->fs_cache, entry) {
                delete_from_cache_if_matches(vc4->fs_cache, &vc4->prog.fs,
                                             entry, so);
        }
        hash_table_foreach(vc4->vs_cache, entry) {
                delete_from_cache_if_matches(vc4->vs_cache, &vc4->prog.vs,
                                             entry, so);
        }

        ralloc_free(so->base.ir.nir);
        free(so);
}

static void
vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso)
{
        struct vc4_context *vc4 = vc4_context(pctx);
        vc4->prog.bind_fs = hwcso;
        vc4->dirty |= VC4_DIRTY_UNCOMPILED_FS;
}

static void
vc4_vp_state_bind(struct pipe_context *pctx, void *hwcso)
{
        struct vc4_context *vc4 = vc4_context(pctx);
        vc4->prog.bind_vs = hwcso;
        vc4->dirty |= VC4_DIRTY_UNCOMPILED_VS;
}

void
vc4_program_init(struct pipe_context *pctx)
{
        struct vc4_context *vc4 = vc4_context(pctx);

        pctx->create_vs_state = vc4_shader_state_create;
        pctx->delete_vs_state = vc4_shader_state_delete;

        pctx->create_fs_state = vc4_shader_state_create;
        pctx->delete_fs_state = vc4_shader_state_delete;

        pctx->bind_fs_state = vc4_fp_state_bind;
        pctx->bind_vs_state = vc4_vp_state_bind;

        vc4->fs_cache = _mesa_hash_table_create(pctx, fs_cache_hash,
                                                fs_cache_compare);
        vc4->vs_cache = _mesa_hash_table_create(pctx, vs_cache_hash,
                                                vs_cache_compare);
        vc4->fs_inputs_set = _mesa_set_create(pctx, fs_inputs_hash,
                                              fs_inputs_compare);
}

void
vc4_program_fini(struct pipe_context *pctx)
{
        struct vc4_context *vc4 = vc4_context(pctx);

        hash_table_foreach(vc4->fs_cache, entry) {
                struct vc4_compiled_shader *shader = entry->data;
                vc4_bo_unreference(&shader->bo);
                ralloc_free(shader);
                _mesa_hash_table_remove(vc4->fs_cache, entry);
        }

        hash_table_foreach(vc4->vs_cache, entry) {
                struct vc4_compiled_shader *shader = entry->data;
                vc4_bo_unreference(&shader->bo);
                ralloc_free(shader);
                _mesa_hash_table_remove(vc4->vs_cache, entry);
        }
}