GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/freedreno/a2xx/ir2_nir.c
/*
 * Copyright (C) 2018 Jonathan Marek <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <[email protected]>
 */

#include "ir2_private.h"

#include "fd2_program.h"
#include "freedreno_util.h"

static const nir_shader_compiler_options options = {
   .lower_fpow = true,
   .lower_flrp32 = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_fceil = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   /* .fdot_replicates = true, it is replicated, but it makes things worse */
   .lower_all_io_to_temps = true,
   .vertex_id_zero_based = true, /* it's not implemented anyway */
   .lower_bitops = true,
   .lower_rotate = true,
   .lower_vector_cmp = true,
   .lower_fdph = true,
   .has_fsub = true,
   .has_isub = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
};

const nir_shader_compiler_options *
ir2_get_compiler_options(void)
{
   return &options;
}

#define OPT(nir, pass, ...)                                                    \
   ({                                                                          \
      bool this_progress = false;                                              \
      NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);                       \
      this_progress;                                                           \
   })
#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
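
/* run the generic NIR optimization passes in a loop until none of them
 * reports progress
 */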
static void
ir2_optimize_loop(nir_shader *s)
{
   bool progress;
   do {
      progress = false;

      OPT_V(s, nir_lower_vars_to_ssa);
      progress |= OPT(s, nir_opt_copy_prop_vars);
      progress |= OPT(s, nir_copy_prop);
      progress |= OPT(s, nir_opt_dce);
      progress |= OPT(s, nir_opt_cse);
      /* progress |= OPT(s, nir_opt_gcm, true); */
      progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
      progress |= OPT(s, nir_opt_intrinsics);
      progress |= OPT(s, nir_opt_algebraic);
      progress |= OPT(s, nir_opt_constant_folding);
      progress |= OPT(s, nir_opt_dead_cf);
      if (OPT(s, nir_opt_trivial_continues)) {
         progress |= true;
         /* If nir_opt_trivial_continues makes progress, then we need to clean
          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
          * to make progress.
          */
         OPT(s, nir_copy_prop);
         OPT(s, nir_opt_dce);
      }
      progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
      progress |= OPT(s, nir_opt_if, false);
      progress |= OPT(s, nir_opt_remove_phis);
      progress |= OPT(s, nir_opt_undef);

   } while (progress);
}

/* the trig workarounds are the same as ir3's, but we don't want to include ir3 */
bool ir3_nir_apply_trig_workarounds(nir_shader *shader);
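
/* lower and optimize the NIR before translation; returns -1 if the shader
 * can't be handled (a fragment shader writing gl_FragDepth)
 */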
int
ir2_optimize_nir(nir_shader *s, bool lower)
{
   struct nir_lower_tex_options tex_options = {
      .lower_txp = ~0u,
      .lower_rect = 0,
   };

   if (FD_DBG(DISASM)) {
      debug_printf("----------------------\n");
      nir_print_shader(s, stdout);
      debug_printf("----------------------\n");
   }

   OPT_V(s, nir_lower_regs_to_ssa);
   OPT_V(s, nir_lower_vars_to_ssa);
   OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out,
         UINT32_MAX);

   if (lower) {
      OPT_V(s, ir3_nir_apply_trig_workarounds);
      OPT_V(s, nir_lower_tex, &tex_options);
   }

   ir2_optimize_loop(s);

   OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
   OPT_V(s, nir_opt_sink, nir_move_const_undef);

   /* TODO: we don't want to get shaders writing to depth for depth textures */
   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      nir_foreach_shader_out_variable (var, s) {
         if (var->data.location == FRAG_RESULT_DEPTH)
            return -1;
      }
   }

   return 0;
}
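
/* get a const src for value_f, reusing components of an already allocated
 * immediate when possible; new components are appended to the shader's
 * immediates
 */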
static struct ir2_src
load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
{
   struct fd2_shader_stateobj *so = ctx->so;
   unsigned imm_ncomp, swiz, idx, i, j;
   uint32_t *value = (uint32_t *)value_f;

   /* try to merge with existing immediate (TODO: try with neg) */
   for (idx = 0; idx < so->num_immediates; idx++) {
      swiz = 0;
      imm_ncomp = so->immediates[idx].ncomp;
      for (i = 0; i < ncomp; i++) {
         for (j = 0; j < imm_ncomp; j++) {
            if (value[i] == so->immediates[idx].val[j])
               break;
         }
         if (j == imm_ncomp) {
            if (j == 4)
               break;
            so->immediates[idx].val[imm_ncomp++] = value[i];
         }
         swiz |= swiz_set(j, i);
      }
      /* matched all components */
      if (i == ncomp)
         break;
   }

   /* need to allocate new immediate */
   if (idx == so->num_immediates) {
      swiz = 0;
      imm_ncomp = 0;
      for (i = 0; i < ncomp; i++) {
         for (j = 0; j < imm_ncomp; j++) {
            if (value[i] == ctx->so->immediates[idx].val[j])
               break;
         }
         if (j == imm_ncomp) {
            so->immediates[idx].val[imm_ncomp++] = value[i];
         }
         swiz |= swiz_set(j, i);
      }
      so->num_immediates++;
   }
   so->immediates[idx].ncomp = imm_ncomp;

   if (ncomp == 1)
      swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);

   return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
}

struct ir2_src
ir2_zero(struct ir2_context *ctx)
{
   return load_const(ctx, (float[]){0.0f}, 1);
}
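
/* register lifetime tracking: record the loop depth at which reg is first
 * seen and the block index after which it can be freed
 */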
static void
update_range(struct ir2_context *ctx, struct ir2_reg *reg)
{
   if (!reg->initialized) {
      reg->initialized = true;
      reg->loop_depth = ctx->loop_depth;
   }

   if (ctx->loop_depth > reg->loop_depth) {
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
   } else {
      reg->loop_depth = ctx->loop_depth;
      reg->block_idx_free = -1;
   }

   /* for regs we want to free at the end of the loop in any case
    * XXX don't do this for ssa
    */
   if (reg->loop_depth)
      reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
}
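
/* translate a nir_src into an ir2_src: constants become immediates, SSA
 * values are looked up through ssa_map, and NIR registers map to ctx->reg
 */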
static struct ir2_src
make_src(struct ir2_context *ctx, nir_src src)
{
   struct ir2_src res = {};
   struct ir2_reg *reg;

   nir_const_value *const_value = nir_src_as_const_value(src);

   if (const_value) {
      assert(src.is_ssa);
      float c[src.ssa->num_components];
      nir_const_value_to_array(c, const_value, src.ssa->num_components, f32);
      return load_const(ctx, c, src.ssa->num_components);
   }

   if (!src.is_ssa) {
      res.num = src.reg.reg->index;
      res.type = IR2_SRC_REG;
      reg = &ctx->reg[res.num];
   } else {
      assert(ctx->ssa_map[src.ssa->index] >= 0);
      res.num = ctx->ssa_map[src.ssa->index];
      res.type = IR2_SRC_SSA;
      reg = &ctx->instr[res.num].ssa;
   }

   update_range(ctx, reg);
   return res;
}

static void
set_index(struct ir2_context *ctx, nir_dest *dst, struct ir2_instr *instr)
{
   struct ir2_reg *reg = &instr->ssa;

   if (dst->is_ssa) {
      ctx->ssa_map[dst->ssa.index] = instr->idx;
   } else {
      assert(instr->is_ssa);
      reg = &ctx->reg[dst->reg.reg->index];

      instr->is_ssa = false;
      instr->reg = reg;
   }
   update_range(ctx, reg);
}

static struct ir2_instr *
ir2_instr_create(struct ir2_context *ctx, int type)
{
   struct ir2_instr *instr;

   instr = &ctx->instr[ctx->instr_count++];
   instr->idx = ctx->instr_count - 1;
   instr->type = type;
   instr->block_idx = ctx->block_idx;
   instr->pred = ctx->pred;
   instr->is_ssa = true;
   return instr;
}

static struct ir2_instr *
instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
{
   /* emit_alu will fixup instrs that don't map directly */
   static const struct ir2_opc {
      int8_t scalar, vector;
   } nir_ir2_opc[nir_num_opcodes + 1] = {
      [0 ... nir_num_opcodes - 1] = {-1, -1},

      [nir_op_mov] = {MAXs, MAXv},
      [nir_op_fneg] = {MAXs, MAXv},
      [nir_op_fabs] = {MAXs, MAXv},
      [nir_op_fsat] = {MAXs, MAXv},
      [nir_op_fsign] = {-1, CNDGTEv},
      [nir_op_fadd] = {ADDs, ADDv},
      [nir_op_fsub] = {ADDs, ADDv},
      [nir_op_fmul] = {MULs, MULv},
      [nir_op_ffma] = {-1, MULADDv},
      [nir_op_fmax] = {MAXs, MAXv},
      [nir_op_fmin] = {MINs, MINv},
      [nir_op_ffloor] = {FLOORs, FLOORv},
      [nir_op_ffract] = {FRACs, FRACv},
      [nir_op_ftrunc] = {TRUNCs, TRUNCv},
      [nir_op_fdot2] = {-1, DOT2ADDv},
      [nir_op_fdot3] = {-1, DOT3v},
      [nir_op_fdot4] = {-1, DOT4v},
      [nir_op_sge] = {-1, SETGTEv},
      [nir_op_slt] = {-1, SETGTv},
      [nir_op_sne] = {-1, SETNEv},
      [nir_op_seq] = {-1, SETEv},
      [nir_op_fcsel] = {-1, CNDEv},
      [nir_op_frsq] = {RECIPSQ_IEEE, -1},
      [nir_op_frcp] = {RECIP_IEEE, -1},
      [nir_op_flog2] = {LOG_IEEE, -1},
      [nir_op_fexp2] = {EXP_IEEE, -1},
      [nir_op_fsqrt] = {SQRT_IEEE, -1},
      [nir_op_fcos] = {COS, -1},
      [nir_op_fsin] = {SIN, -1},
      /* no fsat, fneg, fabs since source mods deal with those */

   /* so we can use this function with non-nir op */
#define ir2_op_cube nir_num_opcodes
      [ir2_op_cube] = {-1, CUBEv},
   };

   struct ir2_opc op = nir_ir2_opc[opcode];
   assert(op.vector >= 0 || op.scalar >= 0);

   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
   instr->alu.vector_opc = op.vector;
   instr->alu.scalar_opc = op.scalar;
   instr->alu.export = -1;
   instr->alu.write_mask = (1 << ncomp) - 1;
   instr->src_count =
      opcode == ir2_op_cube ? 2 : nir_op_infos[opcode].num_inputs;
   instr->ssa.ncomp = ncomp;
   return instr;
}

static struct ir2_instr *
instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode, uint8_t write_mask,
                     struct ir2_instr *share_reg)
{
   struct ir2_instr *instr;
   struct ir2_reg *reg;

   reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
   reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);

   instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
   instr->alu.write_mask = write_mask;
   instr->reg = reg;
   instr->is_ssa = false;
   return instr;
}

static struct ir2_instr *
instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)
{
   struct ir2_instr *instr;
   instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));
   set_index(ctx, dst, instr);
   return instr;
}

static struct ir2_instr *
ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,
                       instr_fetch_opc_t opc)
{
   struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
   instr->fetch.opc = opc;
   instr->src_count = 1;
   instr->ssa.ncomp = nir_dest_num_components(*dst);
   set_index(ctx, dst, instr);
   return instr;
}
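
/* like make_src, but inserts a mov so the result is never a const src */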
static struct ir2_src
make_src_noconst(struct ir2_context *ctx, nir_src src)
{
   struct ir2_instr *instr;

   if (nir_src_as_const_value(src)) {
      assert(src.is_ssa);
      instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components);
      instr->src[0] = make_src(ctx, src);
      return ir2_src(instr->idx, 0, IR2_SRC_SSA);
   }

   return make_src(ctx, src);
}
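
/* translate a NIR ALU instruction, applying fixups for the ops that don't
 * map directly to a2xx (see the switch at the end)
 */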
static void
emit_alu(struct ir2_context *ctx, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];
   nir_dest *dst = &alu->dest.dest;
   struct ir2_instr *instr;
   struct ir2_src tmp;
   unsigned ncomp;

   /* get the number of dst components */
   if (dst->is_ssa) {
      ncomp = dst->ssa.num_components;
   } else {
      ncomp = 0;
      for (int i = 0; i < 4; i++)
         ncomp += !!(alu->dest.write_mask & 1 << i);
   }

   instr = instr_create_alu(ctx, alu->op, ncomp);
   set_index(ctx, dst, instr);
   instr->alu.saturate = alu->dest.saturate;
   instr->alu.write_mask = alu->dest.write_mask;

   for (int i = 0; i < info->num_inputs; i++) {
      nir_alu_src *src = &alu->src[i];

      /* compress swizzle with writemask when applicable */
      unsigned swiz = 0, j = 0;
      for (int i = 0; i < 4; i++) {
         if (!(alu->dest.write_mask & 1 << i) && !info->output_size)
            continue;
         swiz |= swiz_set(src->swizzle[i], j++);
      }

      instr->src[i] = make_src(ctx, src->src);
      instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
      instr->src[i].negate = src->negate;
      instr->src[i].abs = src->abs;
   }

   /* workarounds for NIR ops that don't map directly to a2xx ops */
   switch (alu->op) {
   case nir_op_fneg:
      instr->src[0].negate = 1;
      break;
   case nir_op_fabs:
      instr->src[0].abs = 1;
      break;
   case nir_op_fsat:
      instr->alu.saturate = 1;
      break;
   case nir_op_slt:
      tmp = instr->src[0];
      instr->src[0] = instr->src[1];
      instr->src[1] = tmp;
      break;
   case nir_op_fcsel:
      tmp = instr->src[1];
      instr->src[1] = instr->src[2];
      instr->src[2] = tmp;
      break;
   case nir_op_fsub:
      instr->src[1].negate = !instr->src[1].negate;
      break;
   case nir_op_fdot2:
      instr->src_count = 3;
      instr->src[2] = ir2_zero(ctx);
      break;
   case nir_op_fsign: {
      /* we need an extra instruction to deal with the zero case */
      struct ir2_instr *tmp;

      /* tmp = x == 0 ? 0 : 1 */
      tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
      tmp->src[0] = instr->src[0];
      tmp->src[1] = ir2_zero(ctx);
      tmp->src[2] = load_const(ctx, (float[]){1.0f}, 1);

      /* result = x >= 0 ? tmp : -tmp */
      instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[2] = instr->src[1];
      instr->src[2].negate = true;
      instr->src_count = 3;
   } break;
   default:
      break;
   }
}
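
/* vertex shader inputs become vertex fetch instructions; fragment shader
 * inputs are movs from the input registers, with extra ALU to reconstruct
 * gl_FragCoord (VARYING_SLOT_POS)
 */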
static void
load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx)
{
   struct ir2_instr *instr;
   int slot = -1;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      instr = ir2_instr_create_fetch(ctx, dst, 0);
      instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
      instr->fetch.vtx.const_idx = 20 + (idx / 3);
      instr->fetch.vtx.const_idx_sel = idx % 3;
      return;
   }

   /* get slot from idx */
   nir_foreach_shader_in_variable (var, ctx->nir) {
      if (var->data.driver_location == idx) {
         slot = var->data.location;
         break;
      }
   }
   assert(slot >= 0);

   switch (slot) {
   case VARYING_SLOT_POS:
      /* need to extract xy with abs and add tile offset on a20x
       * zw from fragcoord input (w inverted in fragment shader)
       * TODO: only components that are required by fragment shader
       */
      instr = instr_create_alu_reg(
         ctx, ctx->so->is_a20x ? nir_op_fadd : nir_op_mov, 3, NULL);
      instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
      instr->src[0].abs = true;
      /* on a20x, C64 contains the tile offset */
      instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);

      instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);

      instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
      instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);

      unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
      instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
      instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      break;
   default:
      instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
      break;
   }
}

static unsigned
output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
   int slot = -1;
   unsigned idx = nir_intrinsic_base(intr);
   nir_foreach_shader_out_variable (var, ctx->nir) {
      if (var->data.driver_location == idx) {
         slot = var->data.location;
         break;
      }
   }
   assert(slot != -1);
   return slot;
}
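
/* emit the export mov for an output: position (export 62) and point size
 * (export 63) for vertex shaders, varyings matched against the fragment
 * shader inputs, and color for fragment shaders
 */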
static void
store_output(struct ir2_context *ctx, nir_src src, unsigned slot,
             unsigned ncomp)
{
   struct ir2_instr *instr;
   unsigned idx = 0;

   if (ctx->so->type == MESA_SHADER_VERTEX) {
      switch (slot) {
      case VARYING_SLOT_POS:
         ctx->position = make_src(ctx, src);
         idx = 62;
         break;
      case VARYING_SLOT_PSIZ:
         ctx->so->writes_psize = true;
         idx = 63;
         break;
      default:
         /* find matching slot from fragment shader input */
         for (idx = 0; idx < ctx->f->inputs_count; idx++)
            if (ctx->f->inputs[idx].slot == slot)
               break;
         if (idx == ctx->f->inputs_count)
            return;
      }
   } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
      /* only color output is implemented */
      return;
   }

   instr = instr_create_alu(ctx, nir_op_mov, ncomp);
   instr->src[0] = make_src(ctx, src);
   instr->alu.export = idx;
}

static void
emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir2_instr *instr;
   ASSERTED nir_const_value *const_offset;
   unsigned idx;

   switch (intr->intrinsic) {
   case nir_intrinsic_load_input:
      load_input(ctx, &intr->dest, nir_intrinsic_base(intr));
      break;
   case nir_intrinsic_store_output:
      store_output(ctx, intr->src[0], output_slot(ctx, intr),
                   intr->num_components);
      break;
   case nir_intrinsic_load_uniform:
      const_offset = nir_src_as_const_value(intr->src[0]);
      assert(const_offset); /* TODO can be false in ES2? */
      idx = nir_intrinsic_base(intr);
      idx += (uint32_t)const_offset[0].f32;
      instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
      instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
      break;
   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->alu.vector_opc = VECTOR_NONE;
      if (intr->intrinsic == nir_intrinsic_discard_if) {
         instr->alu.scalar_opc = KILLNEs;
         instr->src[0] = make_src(ctx, intr->src[0]);
      } else {
         instr->alu.scalar_opc = KILLEs;
         instr->src[0] = ir2_zero(ctx);
      }
      instr->alu.export = -1;
      instr->src_count = 1;
      ctx->so->has_kill = true;
      break;
   case nir_intrinsic_load_front_face:
      /* gl_FrontFacing is in the sign of param.x
       * rcp required because otherwise we can't differentiate -0.0 and +0.0
       */
      ctx->so->need_param = true;

      struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
      tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);

      instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);
      instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
      instr->src[1] = ir2_zero(ctx);
      break;
   case nir_intrinsic_load_point_coord:
      /* param.zw (note: abs might be needed like fragcoord in param.xy?) */
      ctx->so->need_param = true;

      instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
      instr->src[0] =
         ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
      break;
   default:
      compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
      break;
   }
}

static void
emit_tex(struct ir2_context *ctx, nir_tex_instr *tex)
{
   bool is_rect = false, is_cube = false;
   struct ir2_instr *instr;
   nir_src *coord, *lod_bias;

   coord = lod_bias = NULL;

   for (unsigned i = 0; i < tex->num_srcs; i++) {
      switch (tex->src[i].src_type) {
      case nir_tex_src_coord:
         coord = &tex->src[i].src;
         break;
      case nir_tex_src_bias:
      case nir_tex_src_lod:
         assert(!lod_bias);
         lod_bias = &tex->src[i].src;
         break;
      default:
         compile_error(ctx, "Unhandled NIR tex src type: %d\n",
                       tex->src[i].src_type);
         return;
      }
   }

   switch (tex->op) {
   case nir_texop_tex:
   case nir_texop_txb:
   case nir_texop_txl:
      break;
   default:
      compile_error(ctx, "unimplemented texop %d\n", tex->op);
      return;
   }

   switch (tex->sampler_dim) {
   case GLSL_SAMPLER_DIM_2D:
      break;
   case GLSL_SAMPLER_DIM_RECT:
      is_rect = true;
      break;
   case GLSL_SAMPLER_DIM_CUBE:
      is_cube = true;
      break;
   default:
      compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
      return;
   }

   struct ir2_src src_coord = make_src_noconst(ctx, *coord);

   /* for cube maps
    * tmp = cube(coord)
    * tmp.xy = tmp.xy / |tmp.z| + 1.5
    * coord = tmp.xyw
    */
   if (is_cube) {
      struct ir2_instr *rcp, *coord_xy;
      unsigned reg_idx;

      instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
      instr->src[0] = src_coord;
      instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
      instr->src[1] = src_coord;
      instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;

      reg_idx = instr->reg - ctx->reg; /* hacky */

      rcp = instr_create_alu(ctx, nir_op_frcp, 1);
      rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
      rcp->src[0].abs = true;

      coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
      coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
      coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      coord_xy->src[2] = load_const(ctx, (float[]){1.5f}, 1);

      src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
      /* TODO: lod/bias transformed by src_coord.z ? */
   }

   instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);
   instr->src[0] = src_coord;
   instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_YXW : 0;
   instr->fetch.tex.is_cube = is_cube;
   instr->fetch.tex.is_rect = is_rect;
   instr->fetch.tex.samp_id = tex->sampler_index;

   /* for lod/bias, we insert an extra src for the backend to deal with */
   if (lod_bias) {
      instr->src[1] = make_src_noconst(ctx, *lod_bias);
      /* backend will use 2-3 components so apply swizzle */
      swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
      instr->src_count = 2;
   }
}

static void
setup_input(struct ir2_context *ctx, nir_variable *in)
{
   struct fd2_shader_stateobj *so = ctx->so;
   ASSERTED unsigned array_len = MAX2(glsl_get_length(in->type), 1);
   unsigned n = in->data.driver_location;
   unsigned slot = in->data.location;

   assert(array_len == 1);

   /* handle later */
   if (ctx->so->type == MESA_SHADER_VERTEX)
      return;

   if (ctx->so->type != MESA_SHADER_FRAGMENT)
      compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);

   n = ctx->f->inputs_count++;

   /* half of fragcoord from param reg, half from a varying */
   if (slot == VARYING_SLOT_POS) {
      ctx->f->fragcoord = n;
      so->need_param = true;
   }

   ctx->f->inputs[n].slot = slot;
   ctx->f->inputs[n].ncomp = glsl_get_components(in->type);

   /* in->data.interpolation?
    * opengl ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
    */
}

static void
emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr *undef)
{
   /* TODO we don't want to emit anything for undefs */

   struct ir2_instr *instr;

   instr = instr_create_alu_dest(
      ctx, nir_op_mov, &(nir_dest){.ssa = undef->def, .is_ssa = true});
   instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
}

static void
emit_instr(struct ir2_context *ctx, nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      emit_alu(ctx, nir_instr_as_alu(instr));
      break;
   case nir_instr_type_deref:
      /* ignored, handled as part of the intrinsic they are src to */
      break;
   case nir_instr_type_intrinsic:
      emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
      break;
   case nir_instr_type_load_const:
      /* dealt with when using nir_src */
      break;
   case nir_instr_type_tex:
      emit_tex(ctx, nir_instr_as_tex(instr));
      break;
   case nir_instr_type_jump:
      ctx->block_has_jump[ctx->block_idx] = true;
      break;
   case nir_instr_type_ssa_undef:
      emit_undef(ctx, nir_instr_as_ssa_undef(instr));
      break;
   default:
      break;
   }
}

/* fragcoord.zw and a20x hw binning outputs */
static void
extra_position_exports(struct ir2_context *ctx, bool binning)
{
   struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;

   if (ctx->f->fragcoord < 0 && !binning)
      return;

   instr = instr_create_alu(ctx, nir_op_fmax, 1);
   instr->src[0] = ctx->position;
   instr->src[0].swizzle = IR2_SWIZZLE_W;
   instr->src[1] = ir2_zero(ctx);

   rcp = instr_create_alu(ctx, nir_op_frcp, 1);
   rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);

   sc = instr_create_alu(ctx, nir_op_fmul, 4);
   sc->src[0] = ctx->position;
   sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);

   wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
   wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
   wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
   wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);

   /* fragcoord z/w */
   if (ctx->f->fragcoord >= 0 && !binning) {
      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
      instr->alu.export = ctx->f->fragcoord;

      instr = instr_create_alu(ctx, nir_op_mov, 1);
      instr->src[0] = ctx->position;
      instr->src[0].swizzle = IR2_SWIZZLE_W;
      instr->alu.export = ctx->f->fragcoord;
      instr->alu.write_mask = 2;
   }

   if (!binning)
      return;

   off = instr_create_alu(ctx, nir_op_fadd, 1);
   off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
   off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);

   /* 8 max set in freedreno_screen.. unneeded instrs patched out */
   for (int i = 0; i < 8; i++) {
      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
      instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
      instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
      instr->alu.export = 32;

      instr = instr_create_alu(ctx, nir_op_ffma, 4);
      instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
      instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
      instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
      instr->alu.export = 33;
   }
}

static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);
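
/* emit all instructions in a NIR block; returns true if a CF jump to the
 * successor block was emitted
 */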
static bool
emit_block(struct ir2_context *ctx, nir_block *block)
{
   struct ir2_instr *instr;
   nir_block *succs = block->successors[0];

   ctx->block_idx = block->index;

   nir_foreach_instr (instr, block)
      emit_instr(ctx, instr);

   if (!succs || !succs->index)
      return false;

   /* ideally we would always jump and let the backend clean up,
    * but we don't, so there are two cases where a jump is needed:
    *  loops (succs index lower)
    *  jumps (jump instruction seen in block)
    */
   if (succs->index > block->index && !ctx->block_has_jump[block->index])
      return false;

   assert(block->successors[1] == NULL);

   instr = ir2_instr_create(ctx, IR2_CF);
   instr->cf.block_idx = succs->index;
   /* XXX can't jump to a block with different predicate */
   return true;
}
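
/* if/else uses the predicate register: PRED_SETNEs (or PRED_SETNE_PUSHv
 * when nested) sets it from the condition, PRED_SET_INVs inverts it for the
 * else branch, and PRED_SET_POPs restores the outer predicate afterwards
 */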
static void
emit_if(struct ir2_context *ctx, nir_if *nif)
{
   unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
   struct ir2_instr *instr;

   /* XXX: blob seems to always use the same register for the condition */

   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = make_src(ctx, nif->condition);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = SCALAR_NONE;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0;

   /* if nested, use PRED_SETNE_PUSHv */
   if (pred) {
      instr->alu.vector_opc = PRED_SETNE_PUSHv;
      instr->src[1] = instr->src[0];
      instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
      instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
      instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
      instr->src_count = 2;
   } else {
      instr->alu.scalar_opc = PRED_SETNEs;
   }

   ctx->pred_idx = instr->idx;
   ctx->pred = 3;

   emit_cf_list(ctx, &nif->then_list);

   /* TODO: if there is no else branch we don't need this,
    * and if the else branch is simple, we can just flip ctx->pred instead
    */
   instr = ir2_instr_create(ctx, IR2_ALU);
   instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
   instr->src_count = 1;
   instr->ssa.ncomp = 1;
   instr->alu.vector_opc = VECTOR_NONE;
   instr->alu.scalar_opc = PRED_SET_INVs;
   instr->alu.export = -1;
   instr->alu.write_mask = 1;
   instr->pred = 0;
   ctx->pred_idx = instr->idx;

   emit_cf_list(ctx, &nif->else_list);

   /* restore predicate for nested predicates */
   if (pred) {
      instr = ir2_instr_create(ctx, IR2_ALU);
      instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
      instr->src_count = 1;
      instr->ssa.ncomp = 1;
      instr->alu.vector_opc = VECTOR_NONE;
      instr->alu.scalar_opc = PRED_SET_POPs;
      instr->alu.export = -1;
      instr->alu.write_mask = 1;
      instr->pred = 0;
      ctx->pred_idx = instr->idx;
   }

   /* restore ctx->pred */
   ctx->pred = pred;
}

/* get the highest block idx in the loop, so we know when
 * we can free registers that are allocated outside the loop
 */
static unsigned
loop_last_block(struct exec_list *list)
{
   nir_cf_node *node =
      exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
   switch (node->type) {
   case nir_cf_node_block:
      return nir_cf_node_as_block(node)->index;
   case nir_cf_node_if:
      assert(0); /* XXX could this ever happen? */
      return 0;
   case nir_cf_node_loop:
      return loop_last_block(&nir_cf_node_as_loop(node)->body);
   default:
      compile_error(ctx, "Not supported\n");
      return 0;
   }
}

static void
emit_loop(struct ir2_context *ctx, nir_loop *nloop)
{
   ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
   emit_cf_list(ctx, &nloop->body);
   ctx->loop_depth--;
}

static bool
emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
{
   bool ret = false;
   foreach_list_typed (nir_cf_node, node, node, list) {
      ret = false;
      switch (node->type) {
      case nir_cf_node_block:
         ret = emit_block(ctx, nir_cf_node_as_block(node));
         break;
      case nir_cf_node_if:
         emit_if(ctx, nir_cf_node_as_if(node));
         break;
      case nir_cf_node_loop:
         emit_loop(ctx, nir_cf_node_as_loop(node));
         break;
      case nir_cf_node_function:
         compile_error(ctx, "Not supported\n");
         break;
      }
   }
   return ret;
}

static void
cleanup_binning(struct ir2_context *ctx)
{
   assert(ctx->so->type == MESA_SHADER_VERTEX);

   /* kill non-position outputs for binning variant */
   nir_foreach_block (block, nir_shader_get_entrypoint(ctx->nir)) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         if (output_slot(ctx, intr) != VARYING_SLOT_POS)
            nir_instr_remove(instr);
      }
   }

   ir2_optimize_nir(ctx->nir, false);
}
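
/* filter for nir_lower_alu_to_scalar: only the transcendental ops need to
 * be scalarized, since they only exist as scalar instructions on a2xx
 */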
static bool
ir2_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   switch (alu->op) {
   case nir_op_frsq:
   case nir_op_frcp:
   case nir_op_flog2:
   case nir_op_fexp2:
   case nir_op_fsqrt:
   case nir_op_fcos:
   case nir_op_fsin:
      return true;
   default:
      break;
   }

   return false;
}
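
/* translate the NIR shader into ir2; the binning variant first strips all
 * non-position outputs
 */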
void
ir2_nir_compile(struct ir2_context *ctx, bool binning)
{
   struct fd2_shader_stateobj *so = ctx->so;

   memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));

   ctx->nir = nir_shader_clone(NULL, so->nir);

   if (binning)
      cleanup_binning(ctx);

   OPT_V(ctx->nir, nir_copy_prop);
   OPT_V(ctx->nir, nir_opt_dce);
   OPT_V(ctx->nir, nir_opt_move, nir_move_comparisons);

   OPT_V(ctx->nir, nir_lower_int_to_float);
   OPT_V(ctx->nir, nir_lower_bool_to_float);
   while (OPT(ctx->nir, nir_opt_algebraic))
      ;
   OPT_V(ctx->nir, nir_opt_algebraic_late);
   OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods);

   OPT_V(ctx->nir, nir_lower_alu_to_scalar, ir2_alu_to_scalar_filter_cb, NULL);

   OPT_V(ctx->nir, nir_lower_locals_to_regs);

   OPT_V(ctx->nir, nir_convert_from_ssa, true);

   OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
   OPT_V(ctx->nir, nir_lower_vec_to_movs, NULL, NULL);

   OPT_V(ctx->nir, nir_opt_dce);

   nir_sweep(ctx->nir);

   if (FD_DBG(DISASM)) {
      debug_printf("----------------------\n");
      nir_print_shader(ctx->nir, stdout);
      debug_printf("----------------------\n");
   }

   /* fd2_shader_stateobj init */
   if (so->type == MESA_SHADER_FRAGMENT) {
      ctx->f->fragcoord = -1;
      ctx->f->inputs_count = 0;
      memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
   }

   /* Setup inputs: */
   nir_foreach_shader_in_variable (in, ctx->nir)
      setup_input(ctx, in);

   if (so->type == MESA_SHADER_FRAGMENT) {
      unsigned idx;
      for (idx = 0; idx < ctx->f->inputs_count; idx++) {
         ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
         update_range(ctx, &ctx->input[idx]);
      }
      /* assume we have param input and kill it later if not */
      ctx->input[idx].ncomp = 4;
      update_range(ctx, &ctx->input[idx]);
   } else {
      ctx->input[0].ncomp = 1;
      ctx->input[2].ncomp = 1;
      update_range(ctx, &ctx->input[0]);
      update_range(ctx, &ctx->input[2]);
   }

   /* And emit the body: */
   nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);

   nir_foreach_register (reg, &fxn->registers) {
      ctx->reg[reg->index].ncomp = reg->num_components;
      ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);
   }

   nir_metadata_require(fxn, nir_metadata_block_index);
   emit_cf_list(ctx, &fxn->body);
   /* TODO emit_block(ctx, fxn->end_block); */

   if (so->type == MESA_SHADER_VERTEX)
      extra_position_exports(ctx, binning);

   ralloc_free(ctx->nir);

   /* kill unused param input */
   if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
      ctx->input[ctx->f->inputs_count].initialized = false;
}