GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/freedreno/ir3/ir3_shader.c
/*
 * Copyright (C) 2014 Rob Clark <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#include "util/format/u_format.h"
#include "util/u_atomic.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_string.h"

#include "drm/freedreno_drmif.h"

#include "ir3_assembler.h"
#include "ir3_compiler.h"
#include "ir3_nir.h"
#include "ir3_parser.h"
#include "ir3_shader.h"

#include "isa/isa.h"

#include "disasm.h"

int
ir3_glsl_type_size(const struct glsl_type *type, bool bindless)
{
   return glsl_count_attribute_slots(type, false);
}

/* For the vertex shader, the inputs are loaded into registers before the
 * shader is executed, so max_regs from the shader instructions might not
 * properly reflect the # of registers actually used, especially in the case
 * of passthrough varyings.
 *
 * Likewise, for the fragment shader, we can have some regs which are passed
 * input values but never touched by the resulting shader (ie. as a result
 * of dead code elimination, or simply because we don't know how to turn
 * the reg off).
 */
static void
fixup_regfootprint(struct ir3_shader_variant *v)
{
   unsigned i;

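   /* an ir3 regid packs (register << 2) | component (so, e.g., regid(2, 3)
    * is r2.w).  "regid >> 2" therefore recovers the full-register index;
    * with merged registers two half regs alias one full reg (8 half
    * components per full reg), hence the ">> 3" when sizing halves below:
    */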
   for (i = 0; i < v->inputs_count; i++) {
      /* skip frag inputs fetched via bary.f, since their regs are not
       * written by the gpu before the shader starts (and in fact the
       * regids might not even be valid)
       */
      if (v->inputs[i].bary)
         continue;

      /* ignore high regs that are global to all threads in a warp
       * (they exist by default) (a5xx+)
       */
      if (v->inputs[i].regid >= regid(48, 0))
         continue;

      if (v->inputs[i].compmask) {
         unsigned n = util_last_bit(v->inputs[i].compmask) - 1;
         int32_t regid = v->inputs[i].regid + n;
         if (v->inputs[i].half) {
            if (!v->mergedregs) {
               v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
            } else {
               v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
            }
         } else {
            v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
         }
      }
   }

   for (i = 0; i < v->outputs_count; i++) {
      /* for example, VS shaders with tess don't have normal varying outs: */
      if (!VALIDREG(v->outputs[i].regid))
         continue;
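      /* outputs are assumed to use all four components, so the footprint
       * extends to the .w component of the output reg:
       */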
      int32_t regid = v->outputs[i].regid + 3;
      if (v->outputs[i].half) {
         if (!v->mergedregs) {
            v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
         } else {
            v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
         }
      } else {
         v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
      }
   }

   for (i = 0; i < v->num_sampler_prefetch; i++) {
      unsigned n = util_last_bit(v->sampler_prefetch[i].wrmask) - 1;
      int32_t regid = v->sampler_prefetch[i].dst + n;
      if (v->sampler_prefetch[i].half_precision) {
         if (!v->mergedregs) {
            v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
         } else {
            v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
         }
      } else {
         v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
      }
   }
}

/* wrapper for ir3_assemble() which does some info fixup based on
 * shader state.  Non-static since used by ir3_cmdline too.
 */
void *
ir3_shader_assemble(struct ir3_shader_variant *v)
{
   const struct ir3_compiler *compiler = v->shader->compiler;
   struct ir3_info *info = &v->info;
   uint32_t *bin;

   ir3_collect_info(v);

   if (v->constant_data_size) {
      /* Make sure that where we're about to place the constant_data is safe
       * to indirectly upload from.
       */
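      /* (info->size is in bytes; const_upload_unit is presumably counted in
       * vec4s, so the "* 16" converts it to bytes.)
       */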
      info->constant_data_offset =
         align(info->size, v->shader->compiler->const_upload_unit * 16);
      info->size = info->constant_data_offset + v->constant_data_size;
   }

   /* Pad out the size so that when turnip uploads the shaders in
    * sequence, the starting offset of the next one is properly aligned.
    */
   info->size = align(info->size, compiler->instr_align * sizeof(instr_t));

   bin = isa_assemble(v);
   if (!bin)
      return NULL;

   /* Append the immediates after the end of the program.  This lets us emit
    * the immediates as an indirect load, while avoiding creating another BO.
    */
   if (v->constant_data_size)
      memcpy(&bin[info->constant_data_offset / 4], v->constant_data,
             v->constant_data_size);
   ralloc_free(v->constant_data);
   v->constant_data = NULL;

   /* NOTE: if relative addressing is used, we set constlen in
    * the compiler (to a worst-case value) since we don't know in
    * the assembler what the max addr reg value can be:
    */
   v->constlen = MAX2(v->constlen, info->max_const + 1);

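   /* if the consts actually used reach into the driver-param range of
    * const space, the driver params need to be uploaded as well:
    */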
   if (v->constlen > ir3_const_state(v)->offsets.driver_param)
      v->need_driver_params = true;

   /* On a4xx and newer, constlen must be a multiple of 16 dwords even though
    * uploads are in units of 4 dwords.  Since constlen is counted in vec4
    * units, aligning it to 4 here gives that 16-dword multiple.  Round it up
    * to make calculations regarding the shared constlen simpler.
    */
   if (compiler->gpu_id >= 400)
      v->constlen = align(v->constlen, 4);

   /* Use the per-wave layout by default on a6xx for compute shaders.  It
    * should result in better performance when loads/stores are to a uniform
    * index.
    */
   v->pvtmem_per_wave = compiler->gpu_id >= 600 && !info->multi_dword_ldp_stp &&
                        v->type == MESA_SHADER_COMPUTE;

   fixup_regfootprint(v);

   return bin;
}

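/* Allows replacing the compiled binary with a hand-edited .asm file, looked
 * up by the variant's sha1 under ir3_shader_override_path (set via the
 * IR3_SHADER_OVERRIDE_PATH environment variable):
 */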
static bool
try_override_shader_variant(struct ir3_shader_variant *v,
                            const char *identifier)
{
   assert(ir3_shader_override_path);

   char *name =
      ralloc_asprintf(NULL, "%s/%s.asm", ir3_shader_override_path, identifier);

   FILE *f = fopen(name, "r");

   if (!f) {
      ralloc_free(name);
      return false;
   }

   struct ir3_kernel_info info;
   info.numwg = INVALID_REG;
   v->ir = ir3_parse(v, &info, f);

   fclose(f);

   if (!v->ir) {
      fprintf(stderr, "Failed to parse %s\n", name);
      exit(1);
   }

   v->bin = ir3_shader_assemble(v);
   if (!v->bin) {
      fprintf(stderr, "Failed to assemble %s\n", name);
      exit(1);
   }

   ralloc_free(name);
   return true;
}

static void
assemble_variant(struct ir3_shader_variant *v)
{
   v->bin = ir3_shader_assemble(v);

   bool dbg_enabled = shader_debug_enabled(v->shader->type);
   if (dbg_enabled || ir3_shader_override_path || v->disasm_info.write_disasm) {
      unsigned char sha1[21];
      char sha1buf[41];

      _mesa_sha1_compute(v->bin, v->info.size, sha1);
      _mesa_sha1_format(sha1buf, sha1);

      bool shader_overridden =
         ir3_shader_override_path && try_override_shader_variant(v, sha1buf);

      if (v->disasm_info.write_disasm) {
         char *stream_data = NULL;
         size_t stream_size = 0;
         FILE *stream = open_memstream(&stream_data, &stream_size);

         fprintf(stream,
                 "Native code%s for unnamed %s shader %s with sha1 %s:\n",
                 shader_overridden ? " (overridden)" : "", ir3_shader_stage(v),
                 v->shader->nir->info.name, sha1buf);
         ir3_shader_disasm(v, v->bin, stream);

         fclose(stream);

         v->disasm_info.disasm = ralloc_size(v->shader, stream_size + 1);
         memcpy(v->disasm_info.disasm, stream_data, stream_size);
         v->disasm_info.disasm[stream_size] = 0;
         free(stream_data);
      }

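      /* also dump the disassembly to the debug log (and do so whenever the
       * shader was overridden, so the substitution is visible):
       */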
      if (dbg_enabled || shader_overridden) {
         char *stream_data = NULL;
         size_t stream_size = 0;
         FILE *stream = open_memstream(&stream_data, &stream_size);

         fprintf(stream,
                 "Native code%s for unnamed %s shader %s with sha1 %s:\n",
                 shader_overridden ? " (overridden)" : "", ir3_shader_stage(v),
                 v->shader->nir->info.name, sha1buf);
         if (v->shader->type == MESA_SHADER_FRAGMENT)
            fprintf(stream, "SIMD0\n");
         ir3_shader_disasm(v, v->bin, stream);
         fclose(stream);

         mesa_log_multiline(MESA_LOG_INFO, stream_data);
         free(stream_data);
      }
   }

   /* no need to keep the ir around beyond this point: */
   ir3_destroy(v->ir);
   v->ir = NULL;
}

static bool
compile_variant(struct ir3_shader_variant *v)
{
   int ret = ir3_compile_shader_nir(v->shader->compiler, v);
   if (ret) {
      mesa_loge("compile failed! (%s:%s)", v->shader->nir->info.name,
                v->shader->nir->info.label);
      return false;
   }

   assemble_variant(v);
   if (!v->bin) {
      mesa_loge("assemble failed! (%s:%s)", v->shader->nir->info.name,
                v->shader->nir->info.label);
      return false;
   }

   return true;
}

/*
 * For creating normal shader variants, 'nonbinning' is NULL.  For
 * creating the binning-pass shader, it is a link to the corresponding
 * normal (non-binning) variant.
 */
static struct ir3_shader_variant *
alloc_variant(struct ir3_shader *shader, const struct ir3_shader_key *key,
              struct ir3_shader_variant *nonbinning)
{
   void *mem_ctx = shader;
   /* hang the binning variant off its non-binning counterpart instead
    * of the shader, to simplify the error cleanup paths
    */
   if (nonbinning)
      mem_ctx = nonbinning;
   struct ir3_shader_variant *v = rzalloc_size(mem_ctx, sizeof(*v));

   if (!v)
      return NULL;

   v->id = ++shader->variant_count;
   v->shader = shader;
   v->binning_pass = !!nonbinning;
   v->nonbinning = nonbinning;
   v->key = *key;
   v->type = shader->type;
   v->mergedregs = shader->compiler->gpu_id >= 600;

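   /* binning variants share the const_state of their non-binning
    * counterpart (see ir3_const_state()), so it is only allocated for
    * the non-binning variant:
    */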
   if (!v->binning_pass)
      v->const_state = rzalloc_size(v, sizeof(*v->const_state));

   return v;
}

static bool
needs_binning_variant(struct ir3_shader_variant *v)
{
   if ((v->type == MESA_SHADER_VERTEX) && ir3_has_binning_vs(&v->key))
      return true;
   return false;
}

static struct ir3_shader_variant *
create_variant(struct ir3_shader *shader, const struct ir3_shader_key *key,
               bool write_disasm)
{
   struct ir3_shader_variant *v = alloc_variant(shader, key, NULL);

   if (!v)
      goto fail;

   v->disasm_info.write_disasm = write_disasm;

   if (needs_binning_variant(v)) {
      v->binning = alloc_variant(shader, key, v);
      if (!v->binning)
         goto fail;
      v->binning->disasm_info.write_disasm = write_disasm;
   }

   if (ir3_disk_cache_retrieve(shader->compiler, v))
      return v;

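   /* nir finalization is deferred until the first variant is actually
    * compiled, and runs at most once per shader:
    */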
   if (!shader->nir_finalized) {
      ir3_nir_post_finalize(shader->compiler, shader->nir);

      if (ir3_shader_debug & IR3_DBG_DISASM) {
         mesa_logi("dump nir%d: type=%d", shader->id, shader->type);
         nir_log_shaderi(shader->nir);
      }

      if (v->disasm_info.write_disasm) {
         v->disasm_info.nir = nir_shader_as_str(shader->nir, shader);
      }

      shader->nir_finalized = true;
   }

   if (!compile_variant(v))
      goto fail;

   if (needs_binning_variant(v) && !compile_variant(v->binning))
      goto fail;

   ir3_disk_cache_store(shader->compiler, v);

   return v;

fail:
   ralloc_free(v);
   return NULL;
}

static inline struct ir3_shader_variant *
shader_variant(struct ir3_shader *shader, const struct ir3_shader_key *key)
{
   struct ir3_shader_variant *v;

   for (v = shader->variants; v; v = v->next)
      if (ir3_shader_key_equal(key, &v->key))
         return v;

   return NULL;
}

struct ir3_shader_variant *
ir3_shader_get_variant(struct ir3_shader *shader,
                       const struct ir3_shader_key *key, bool binning_pass,
                       bool write_disasm, bool *created)
{
   mtx_lock(&shader->variants_lock);
   struct ir3_shader_variant *v = shader_variant(shader, key);

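   /* note: *created is only written (to true) when a new variant had to
    * be compiled:
    */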
   if (!v) {
      /* compile new variant if it doesn't exist already: */
      v = create_variant(shader, key, write_disasm);
      if (v) {
         v->next = shader->variants;
         shader->variants = v;
         *created = true;
      }
   }

   if (v && binning_pass) {
      v = v->binning;
      assert(v);
   }

   mtx_unlock(&shader->variants_lock);

   return v;
}

void
ir3_shader_destroy(struct ir3_shader *shader)
{
   ralloc_free(shader->nir);
   mtx_destroy(&shader->variants_lock);
   ralloc_free(shader);
}

/**
 * Creates a bitmask of the bits of the shader key that are used by this
 * particular shader.  Used by the gallium driver to skip state-dependent
 * recompiles when possible.
 */
static void
ir3_setup_used_key(struct ir3_shader *shader)
{
   nir_shader *nir = shader->nir;
   struct shader_info *info = &nir->info;
   struct ir3_shader_key *key = &shader->key_mask;

   /* This key flag is just used to make for a cheaper ir3_shader_key_equal
    * check in the common case.
    */
   key->has_per_samp = true;

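   /* any stage may get recompiled with the "safe" constlen if
    * ir3_trim_constlen() finds the shared const space over-subscribed, so
    * this key bit is always considered used:
    */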
   key->safe_constlen = true;

   /* When clip/cull distances are natively supported, we only use
    * ucp_enables to determine whether to lower legacy clip planes to
    * gl_ClipDistance.
    */
   if (info->stage != MESA_SHADER_FRAGMENT || !shader->compiler->has_clip_cull)
      key->ucp_enables = 0xff;

   if (info->stage == MESA_SHADER_FRAGMENT) {
      key->fastc_srgb = ~0;
      key->fsamples = ~0;

      if (info->inputs_read & VARYING_BITS_COLOR) {
         key->rasterflat = true;
      }

      if (info->inputs_read & VARYING_BIT_LAYER) {
         key->layer_zero = true;
      }

      if (info->inputs_read & VARYING_BIT_VIEWPORT) {
         key->view_zero = true;
      }

      /* Only used for deciding on behavior of
       * nir_intrinsic_load_barycentric_sample, or the centroid demotion
       * on older HW.
       */
      key->msaa = info->fs.uses_sample_qualifier ||
                  (shader->compiler->gpu_id < 600 &&
                   (BITSET_TEST(info->system_values_read,
                                SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID) ||
                    BITSET_TEST(info->system_values_read,
                                SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID)));
   } else {
      key->tessellation = ~0;
      key->has_gs = true;

      if (info->stage == MESA_SHADER_VERTEX) {
         key->vastc_srgb = ~0;
         key->vsamples = ~0;
      }
   }
}

/* Given an array of constlens, decrease some of them so that the sum stays
 * within "combined_limit" while trying to fairly share the reduction.  Returns
 * a bitfield of which stages should be trimmed.
 */
static uint32_t
trim_constlens(unsigned *constlens, unsigned first_stage, unsigned last_stage,
               unsigned combined_limit, unsigned safe_limit)
{
   unsigned cur_total = 0;
   for (unsigned i = first_stage; i <= last_stage; i++) {
      cur_total += constlens[i];
   }

   uint32_t trimmed = 0;

   while (cur_total > combined_limit) {
      /* find the stage with the largest remaining constlen.  The running
       * max is reset each iteration, so a stage that was already trimmed
       * to safe_limit can't be re-selected via a stale max value:
       */
      unsigned max_stage = 0;
      unsigned max_const = 0;
      for (unsigned i = first_stage; i <= last_stage; i++) {
         if (constlens[i] >= max_const) {
            max_stage = i;
            max_const = constlens[i];
         }
      }

      assert(max_const > safe_limit);
      trimmed |= 1 << max_stage;
      cur_total = cur_total - max_const + safe_limit;
      constlens[max_stage] = safe_limit;
   }

   return trimmed;
}

/* Figures out which stages in the pipeline should use the "safe" constlen, in
 * order to satisfy all shared constlen limits.
 */
uint32_t
ir3_trim_constlen(struct ir3_shader_variant **variants,
                  const struct ir3_compiler *compiler)
{
   unsigned constlens[MESA_SHADER_STAGES] = {};

   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
      if (variants[i])
         constlens[i] = variants[i]->constlen;
   }

   uint32_t trimmed = 0;
   STATIC_ASSERT(MESA_SHADER_STAGES <= 8 * sizeof(trimmed));

   /* There are two shared limits to take into account, the geometry limit on
    * a6xx and the total limit.  The frag limit on a6xx only matters for a
    * single stage, so it's always satisfied with the first variant.
    */
   if (compiler->gpu_id >= 600) {
      trimmed |=
         trim_constlens(constlens, MESA_SHADER_VERTEX, MESA_SHADER_GEOMETRY,
                        compiler->max_const_geom, compiler->max_const_safe);
   }
   trimmed |=
      trim_constlens(constlens, MESA_SHADER_VERTEX, MESA_SHADER_FRAGMENT,
                     compiler->max_const_pipeline, compiler->max_const_safe);

   return trimmed;
}

struct ir3_shader *
ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir,
                    unsigned reserved_user_consts,
                    struct ir3_stream_output_info *stream_output)
{
   struct ir3_shader *shader = rzalloc_size(NULL, sizeof(*shader));

   mtx_init(&shader->variants_lock, mtx_plain);
   shader->compiler = compiler;
   shader->id = p_atomic_inc_return(&shader->compiler->shader_count);
   shader->type = nir->info.stage;
   if (stream_output)
      memcpy(&shader->stream_output, stream_output,
             sizeof(shader->stream_output));
   shader->num_reserved_user_consts = reserved_user_consts;
   shader->nir = nir;

   ir3_disk_cache_init_shader_key(compiler, shader);

   ir3_setup_used_key(shader);

   return shader;
}

static void
dump_reg(FILE *out, const char *name, uint32_t r)
{
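   /* regid(63, 0) (ie. r63.x) is the sentinel for an unused/invalid reg: */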
   if (r != regid(63, 0)) {
      const char *reg_type = (r & HALF_REG_ID) ? "hr" : "r";
      fprintf(out, "; %s: %s%d.%c\n", name, reg_type, (r & ~HALF_REG_ID) >> 2,
              "xyzw"[r & 0x3]);
   }
}

static void
dump_output(FILE *out, struct ir3_shader_variant *so, unsigned slot,
            const char *name)
{
   uint32_t regid;
   regid = ir3_find_output_regid(so, slot);
   dump_reg(out, name, regid);
}

static const char *
input_name(struct ir3_shader_variant *so, int i)
{
   if (so->inputs[i].sysval) {
      return gl_system_value_name(so->inputs[i].slot);
   } else if (so->type == MESA_SHADER_VERTEX) {
      return gl_vert_attrib_name(so->inputs[i].slot);
   } else {
      return gl_varying_slot_name_for_stage(so->inputs[i].slot, so->type);
   }
}

static const char *
output_name(struct ir3_shader_variant *so, int i)
{
   if (so->type == MESA_SHADER_FRAGMENT) {
      return gl_frag_result_name(so->outputs[i].slot);
   } else {
      switch (so->outputs[i].slot) {
      case VARYING_SLOT_GS_HEADER_IR3:
         return "GS_HEADER";
      case VARYING_SLOT_GS_VERTEX_FLAGS_IR3:
         return "GS_VERTEX_FLAGS";
      case VARYING_SLOT_TCS_HEADER_IR3:
         return "TCS_HEADER";
      default:
         return gl_varying_slot_name_for_stage(so->outputs[i].slot, so->type);
      }
   }
}

void
ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)
{
   struct ir3 *ir = so->ir;
   struct ir3_register *reg;
   const char *type = ir3_shader_stage(so);
   uint8_t regid;
   unsigned i;

   foreach_input_n (instr, i, ir) {
      reg = instr->dsts[0];
      regid = reg->num;
      fprintf(out, "@in(%sr%d.%c)\tin%d",
              (reg->flags & IR3_REG_HALF) ? "h" : "", (regid >> 2),
              "xyzw"[regid & 0x3], i);

      if (reg->wrmask > 0x1)
         fprintf(out, " (wrmask=0x%x)", reg->wrmask);
      fprintf(out, "\n");
   }

   /* print pre-dispatch texture fetches: */
   for (i = 0; i < so->num_sampler_prefetch; i++) {
      const struct ir3_sampler_prefetch *fetch = &so->sampler_prefetch[i];
      fprintf(out,
              "@tex(%sr%d.%c)\tsrc=%u, samp=%u, tex=%u, wrmask=0x%x, cmd=%u\n",
              fetch->half_precision ? "h" : "", fetch->dst >> 2,
              "xyzw"[fetch->dst & 0x3], fetch->src, fetch->samp_id,
              fetch->tex_id, fetch->wrmask, fetch->cmd);
   }

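   /* print the immediates, which are uploaded as vec4s starting at const
    * offset offsets.immediate:
    */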
   const struct ir3_const_state *const_state = ir3_const_state(so);
   for (i = 0; i < DIV_ROUND_UP(const_state->immediates_count, 4); i++) {
      fprintf(out, "@const(c%d.x)\t", const_state->offsets.immediate + i);
      fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
              const_state->immediates[i * 4 + 0],
              const_state->immediates[i * 4 + 1],
              const_state->immediates[i * 4 + 2],
              const_state->immediates[i * 4 + 3]);
   }

   isa_decode(bin, so->info.sizedwords * 4, out,
              &(struct isa_decode_options){
                 .gpu_id = ir->compiler->gpu_id,
                 .show_errors = true,
                 .branch_labels = true,
              });

   fprintf(out, "; %s: outputs:", type);
   for (i = 0; i < so->outputs_count; i++) {
      uint8_t regid = so->outputs[i].regid;
      const char *reg_type = so->outputs[i].half ? "hr" : "r";
      fprintf(out, " %s%d.%c (%s)", reg_type, (regid >> 2), "xyzw"[regid & 0x3],
              output_name(so, i));
   }
   fprintf(out, "\n");

   fprintf(out, "; %s: inputs:", type);
   for (i = 0; i < so->inputs_count; i++) {
      uint8_t regid = so->inputs[i].regid;
      fprintf(out, " r%d.%c (%s slot=%d cm=%x,il=%u,b=%u)", (regid >> 2),
              "xyzw"[regid & 0x3], input_name(so, i), so->inputs[i].slot,
              so->inputs[i].compmask, so->inputs[i].inloc, so->inputs[i].bary);
   }
   fprintf(out, "\n");

   /* print generic shader info: */
   fprintf(
      out,
      "; %s prog %d/%d: %u instr, %u nops, %u non-nops, %u mov, %u cov, %u dwords\n",
      type, so->shader->id, so->id, so->info.instrs_count, so->info.nops_count,
      so->info.instrs_count - so->info.nops_count, so->info.mov_count,
      so->info.cov_count, so->info.sizedwords);

   fprintf(out,
           "; %s prog %d/%d: %u last-baryf, %d half, %d full, %u constlen\n",
           type, so->shader->id, so->id, so->info.last_baryf,
           so->info.max_half_reg + 1, so->info.max_reg + 1, so->constlen);

   fprintf(
      out,
      "; %s prog %d/%d: %u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7\n",
      type, so->shader->id, so->id, so->info.instrs_per_cat[0],
      so->info.instrs_per_cat[1], so->info.instrs_per_cat[2],
      so->info.instrs_per_cat[3], so->info.instrs_per_cat[4],
      so->info.instrs_per_cat[5], so->info.instrs_per_cat[6],
      so->info.instrs_per_cat[7]);

   fprintf(
      out,
      "; %s prog %d/%d: %u sstall, %u (ss), %u (sy), %d max_sun, %d loops\n",
      type, so->shader->id, so->id, so->info.sstall, so->info.ss, so->info.sy,
      so->max_sun, so->loops);

   /* print shader type specific info: */
   switch (so->type) {
   case MESA_SHADER_VERTEX:
      dump_output(out, so, VARYING_SLOT_POS, "pos");
      dump_output(out, so, VARYING_SLOT_PSIZ, "psize");
      break;
   case MESA_SHADER_FRAGMENT:
      dump_reg(out, "pos (ij_pixel)",
               ir3_find_sysval_regid(so, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL));
      dump_reg(
         out, "pos (ij_centroid)",
         ir3_find_sysval_regid(so, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID));
      dump_reg(out, "pos (ij_size)",
               ir3_find_sysval_regid(so, SYSTEM_VALUE_BARYCENTRIC_PERSP_SIZE));
      dump_output(out, so, FRAG_RESULT_DEPTH, "posz");
      if (so->color0_mrt) {
         dump_output(out, so, FRAG_RESULT_COLOR, "color");
      } else {
         dump_output(out, so, FRAG_RESULT_DATA0, "data0");
         dump_output(out, so, FRAG_RESULT_DATA1, "data1");
         dump_output(out, so, FRAG_RESULT_DATA2, "data2");
         dump_output(out, so, FRAG_RESULT_DATA3, "data3");
         dump_output(out, so, FRAG_RESULT_DATA4, "data4");
         dump_output(out, so, FRAG_RESULT_DATA5, "data5");
         dump_output(out, so, FRAG_RESULT_DATA6, "data6");
         dump_output(out, so, FRAG_RESULT_DATA7, "data7");
      }
      dump_reg(out, "fragcoord",
               ir3_find_sysval_regid(so, SYSTEM_VALUE_FRAG_COORD));
      dump_reg(out, "fragface",
               ir3_find_sysval_regid(so, SYSTEM_VALUE_FRONT_FACE));
      break;
   default:
      /* TODO */
      break;
   }

   fprintf(out, "\n");
}

uint64_t
ir3_shader_outputs(const struct ir3_shader *so)
{
   return so->nir->info.outputs_written;
}

/* Add any missing varyings needed for stream-out.  Otherwise, varyings not
 * used by the fragment shader would be stripped out.
 */
void
ir3_link_stream_out(struct ir3_shader_linkage *l,
                    const struct ir3_shader_variant *v)
{
   const struct ir3_stream_output_info *strmout = &v->shader->stream_output;

   /*
    * First, any stream-out varyings not already in the linkage map (ie. also
    * consumed by the frag shader) need to be added:
    */
   for (unsigned i = 0; i < strmout->num_outputs; i++) {
      const struct ir3_stream_output *out = &strmout->output[i];
      unsigned k = out->register_index;
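      /* mask covering all streamed-out components, counted from .x (ie.
       * components below start_component are included too):
       */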
      unsigned compmask =
         (1 << (out->num_components + out->start_component)) - 1;
      unsigned idx, nextloc = 0;

      /* psize/pos need to be the last entries in the linkage map, and will
       * get added after ir3_link_stream_out, so skip over them:
       */
      if ((v->outputs[k].slot == VARYING_SLOT_PSIZ) ||
          (v->outputs[k].slot == VARYING_SLOT_POS))
         continue;

      for (idx = 0; idx < l->cnt; idx++) {
         if (l->var[idx].regid == v->outputs[k].regid)
            break;
         nextloc = MAX2(nextloc, l->var[idx].loc + 4);
      }

      /* add if not already in linkage map: */
      if (idx == l->cnt)
         ir3_link_add(l, v->outputs[k].regid, compmask, nextloc);

      /* expand the component-mask if needed, ie. streaming out all components
       * but the frag shader doesn't consume all of them:
       */
      if (compmask & ~l->var[idx].compmask) {
         l->var[idx].compmask |= compmask;
         l->max_loc = MAX2(
            l->max_loc, l->var[idx].loc + util_last_bit(l->var[idx].compmask));
      }
   }
}