CoCalc -- ir3_gallium.c

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
⁴⁵⁷⁴ views
1
/*
2
 * Copyright (C) 2014 Rob Clark <[email protected]>
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining a
5
 * copy of this software and associated documentation files (the "Software"),
6
 * to deal in the Software without restriction, including without limitation
7
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
 * and/or sell copies of the Software, and to permit persons to whom the
9
 * Software is furnished to do so, subject to the following conditions:
10
 *
11
 * The above copyright notice and this permission notice (including the next
12
 * paragraph) shall be included in all copies or substantial portions of the
13
 * Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
 * SOFTWARE.
22
 *
23
 * Authors:
24
 *    Rob Clark <[email protected]>
25
 */
26

27
#include "pipe/p_screen.h"
28
#include "pipe/p_state.h"
29
#include "tgsi/tgsi_dump.h"
30
#include "tgsi/tgsi_parse.h"
31
#include "util/format/u_format.h"
32
#include "util/u_inlines.h"
33
#include "util/u_memory.h"
34
#include "util/u_string.h"
35

36
#include "nir/tgsi_to_nir.h"
37

38
#include "freedreno_context.h"
39
#include "freedreno_util.h"
40

41
#include "ir3/ir3_cache.h"
42
#include "ir3/ir3_compiler.h"
43
#include "ir3/ir3_gallium.h"
44
#include "ir3/ir3_nir.h"
45
#include "ir3/ir3_shader.h"
46

47
/**
48
 * The hardware cso for shader state
49
 *
50
 * Initially just a container for the ir3_shader, but this is where we'll
51
 * plumb in async compile.
52
 */
53
struct ir3_shader_state {
54
   struct ir3_shader *shader;
55

56
   /* Fence signalled when async compile is completed: */
57
   struct util_queue_fence ready;
58
};
59

60
/**
61
 * Should initial variants be compiled synchronously?
62
 *
63
 * The only case where pipe_debug_message() is used in the initial-variants
64
 * path is with FD_MESA_DEBUG=shaderdb.  So if either debug is disabled (ie.
65
 * debug.debug_message==NULL), or shaderdb stats are not enabled, we can
66
 * compile the initial shader variant asynchronously.
67
 */
68
static bool
69
initial_variants_synchronous(struct fd_context *ctx)
70
{
71
   return unlikely(ctx->debug.debug_message) || FD_DBG(SHADERDB) ||
72
          FD_DBG(SERIALC);
73
}
74

75
static void
76
dump_shader_info(struct ir3_shader_variant *v,
77
                 struct pipe_debug_callback *debug)
78
{
79
   if (!FD_DBG(SHADERDB))
80
      return;
81

82
   pipe_debug_message(
83
      debug, SHADER_INFO,
84
      "%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, "
85
      "%u dwords, %u last-baryf, %u half, %u full, %u constlen, "
86
      "%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, "
87
      "%u sstall, %u (ss), %u (sy), %d waves, %d max_sun, %d loops\n",
88
      ir3_shader_stage(v), v->info.instrs_count, v->info.nops_count,
89
      v->info.instrs_count - v->info.nops_count, v->info.mov_count,
90
      v->info.cov_count, v->info.sizedwords, v->info.last_baryf,
91
      v->info.max_half_reg + 1, v->info.max_reg + 1, v->constlen,
92
      v->info.instrs_per_cat[0], v->info.instrs_per_cat[1],
93
      v->info.instrs_per_cat[2], v->info.instrs_per_cat[3],
94
      v->info.instrs_per_cat[4], v->info.instrs_per_cat[5],
95
      v->info.instrs_per_cat[6], v->info.instrs_per_cat[7], v->info.sstall,
96
      v->info.ss, v->info.sy, v->info.max_waves, v->max_sun, v->loops);
97
}
98

99
static void
100
upload_shader_variant(struct ir3_shader_variant *v)
101
{
102
   struct shader_info *info = &v->shader->nir->info;
103
   struct ir3_compiler *compiler = v->shader->compiler;
104

105
   assert(!v->bo);
106

107
   v->bo =
108
      fd_bo_new(compiler->dev, v->info.size, 0,
109
                "%s:%s", ir3_shader_stage(v), info->name);
110

111
   /* Always include shaders in kernel crash dumps. */
112
   fd_bo_mark_for_dump(v->bo);
113

114
   memcpy(fd_bo_map(v->bo), v->bin, v->info.size);
115
}
116

117
struct ir3_shader_variant *
118
ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key,
119
                   bool binning_pass, struct pipe_debug_callback *debug)
120
{
121
   struct ir3_shader_variant *v;
122
   bool created = false;
123

124
   /* Some shader key values may not be used by a given ir3_shader (for
125
    * example, fragment shader saturates in the vertex shader), so clean out
126
    * those flags to avoid recompiling.
127
    */
128
   ir3_key_clear_unused(&key, shader);
129

130
   v = ir3_shader_get_variant(shader, &key, binning_pass, false, &created);
131

132
   if (created) {
133
      if (shader->initial_variants_done) {
134
         pipe_debug_message(debug, SHADER_INFO,
135
                            "%s shader: recompiling at draw time: global "
136
                            "0x%08x, vfsamples %x/%x, astc %x/%x\n",
137
                            ir3_shader_stage(v), key.global, key.vsamples,
138
                            key.fsamples, key.vastc_srgb, key.fastc_srgb);
139
      }
140

141
      dump_shader_info(v, debug);
142
      upload_shader_variant(v);
143

144
      if (v->binning) {
145
         upload_shader_variant(v->binning);
146
         dump_shader_info(v->binning, debug);
147
      }
148
   }
149

150
   return v;
151
}
152

153
static void
154
copy_stream_out(struct ir3_stream_output_info *i,
155
                const struct pipe_stream_output_info *p)
156
{
157
   STATIC_ASSERT(ARRAY_SIZE(i->stride) == ARRAY_SIZE(p->stride));
158
   STATIC_ASSERT(ARRAY_SIZE(i->output) == ARRAY_SIZE(p->output));
159

160
   i->num_outputs = p->num_outputs;
161
   for (int n = 0; n < ARRAY_SIZE(i->stride); n++)
162
      i->stride[n] = p->stride[n];
163

164
   for (int n = 0; n < ARRAY_SIZE(i->output); n++) {
165
      i->output[n].register_index = p->output[n].register_index;
166
      i->output[n].start_component = p->output[n].start_component;
167
      i->output[n].num_components = p->output[n].num_components;
168
      i->output[n].output_buffer = p->output[n].output_buffer;
169
      i->output[n].dst_offset = p->output[n].dst_offset;
170
      i->output[n].stream = p->output[n].stream;
171
   }
172
}
173

174
static void
175
create_initial_variants(struct ir3_shader_state *hwcso,
176
                        struct pipe_debug_callback *debug)
177
{
178
   struct ir3_shader *shader = hwcso->shader;
179
   struct ir3_compiler *compiler = shader->compiler;
180
   nir_shader *nir = shader->nir;
181

182
   /* Compile standard variants immediately to try to avoid draw-time stalls
183
    * to run the compiler.
184
    */
185
   struct ir3_shader_key key = {
186
      .tessellation = IR3_TESS_NONE,
187
      .ucp_enables = MASK(nir->info.clip_distance_array_size),
188
      .msaa = true,
189
   };
190

191
   switch (nir->info.stage) {
192
   case MESA_SHADER_TESS_EVAL:
193
      key.tessellation = ir3_tess_mode(nir->info.tess.primitive_mode);
194
      break;
195

196
   case MESA_SHADER_TESS_CTRL:
197
      /* The primitive_mode field, while it exists for TCS, is not
198
       * populated (since separable shaders between TCS/TES are legal,
199
       * so TCS wouldn't have access to TES's declaration).  Make a
200
       * guess so that we shader-db something plausible for TCS.
201
       */
202
      if (nir->info.outputs_written & VARYING_BIT_TESS_LEVEL_INNER)
203
         key.tessellation = IR3_TESS_TRIANGLES;
204
      else
205
         key.tessellation = IR3_TESS_ISOLINES;
206
      break;
207

208
   case MESA_SHADER_GEOMETRY:
209
      key.has_gs = true;
210
      break;
211

212
   default:
213
      break;
214
   }
215

216
   key.safe_constlen = false;
217
   struct ir3_shader_variant *v = ir3_shader_variant(shader, key, false, debug);
218
   if (!v)
219
      return;
220

221
   if (v->constlen > compiler->max_const_safe) {
222
      key.safe_constlen = true;
223
      ir3_shader_variant(shader, key, false, debug);
224
   }
225

226
   /* For vertex shaders, also compile initial binning pass shader: */
227
   if (nir->info.stage == MESA_SHADER_VERTEX) {
228
      key.safe_constlen = false;
229
      v = ir3_shader_variant(shader, key, true, debug);
230
      if (!v)
231
         return;
232

233
      if (v->constlen > compiler->max_const_safe) {
234
         key.safe_constlen = true;
235
         ir3_shader_variant(shader, key, true, debug);
236
      }
237
   }
238

239
   shader->initial_variants_done = true;
240
}
241

242
static void
243
create_initial_variants_async(void *job, void *gdata, int thread_index)
244
{
245
   struct ir3_shader_state *hwcso = job;
246
   struct pipe_debug_callback debug = {};
247

248
   create_initial_variants(hwcso, &debug);
249
}
250

251
static void
252
create_initial_compute_variants_async(void *job, void *gdata, int thread_index)
253
{
254
   struct ir3_shader_state *hwcso = job;
255
   struct ir3_shader *shader = hwcso->shader;
256
   struct pipe_debug_callback debug = {};
257
   static struct ir3_shader_key key; /* static is implicitly zeroed */
258

259
   ir3_shader_variant(shader, key, false, &debug);
260
   shader->initial_variants_done = true;
261
}
262

263
/* a bit annoying that compute-shader and normal shader state objects
264
 * aren't a bit more aligned.
265
 */
266
void *
267
ir3_shader_compute_state_create(struct pipe_context *pctx,
268
                                const struct pipe_compute_state *cso)
269
{
270
   struct fd_context *ctx = fd_context(pctx);
271

272
   /* req_input_mem will only be non-zero for cl kernels (ie. clover).
273
    * This isn't a perfect test because I guess it is possible (but
274
    * uncommon) for none for the kernel parameters to be a global,
275
    * but ctx->set_global_bindings() can't fail, so this is the next
276
    * best place to fail if we need a newer version of kernel driver:
277
    */
278
   if ((cso->req_input_mem > 0) &&
279
       fd_device_version(ctx->dev) < FD_VERSION_BO_IOVA) {
280
      return NULL;
281
   }
282

283
   struct ir3_compiler *compiler = ctx->screen->compiler;
284
   nir_shader *nir;
285

286
   if (cso->ir_type == PIPE_SHADER_IR_NIR) {
287
      /* we take ownership of the reference: */
288
      nir = (nir_shader *)cso->prog;
289
   } else {
290
      debug_assert(cso->ir_type == PIPE_SHADER_IR_TGSI);
291
      if (ir3_shader_debug & IR3_DBG_DISASM) {
292
         tgsi_dump(cso->prog, 0);
293
      }
294
      nir = tgsi_to_nir(cso->prog, pctx->screen, false);
295
   }
296

297
   struct ir3_shader *shader = ir3_shader_from_nir(compiler, nir, 0, NULL);
298
   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));
299

300
   util_queue_fence_init(&hwcso->ready);
301
   hwcso->shader = shader;
302

303
   /* Immediately compile a standard variant.  We have so few variants in our
304
    * shaders, that doing so almost eliminates draw-time recompiles.  (This
305
    * is also how we get data from shader-db's ./run)
306
    */
307

308
   if (initial_variants_synchronous(ctx)) {
309
      static struct ir3_shader_key key; /* static is implicitly zeroed */
310
      ir3_shader_variant(shader, key, false, &ctx->debug);
311
      shader->initial_variants_done = true;
312
   } else {
313
      struct fd_screen *screen = ctx->screen;
314
      util_queue_add_job(&screen->compile_queue, hwcso, &hwcso->ready,
315
                         create_initial_compute_variants_async, NULL, 0);
316
   }
317

318
   return hwcso;
319
}
320

321
void *
322
ir3_shader_state_create(struct pipe_context *pctx,
323
                        const struct pipe_shader_state *cso)
324
{
325
   struct fd_context *ctx = fd_context(pctx);
326
   struct ir3_compiler *compiler = ctx->screen->compiler;
327
   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));
328

329
   /*
330
    * Convert to nir (if necessary):
331
    */
332

333
   nir_shader *nir;
334
   if (cso->type == PIPE_SHADER_IR_NIR) {
335
      /* we take ownership of the reference: */
336
      nir = cso->ir.nir;
337
   } else {
338
      debug_assert(cso->type == PIPE_SHADER_IR_TGSI);
339
      if (ir3_shader_debug & IR3_DBG_DISASM) {
340
         tgsi_dump(cso->tokens, 0);
341
      }
342
      nir = tgsi_to_nir(cso->tokens, pctx->screen, false);
343
   }
344

345
   /*
346
    * Create ir3_shader:
347
    *
348
    * This part is cheap, it doesn't compile initial variants
349
    */
350

351
   struct ir3_stream_output_info stream_output = {};
352
   copy_stream_out(&stream_output, &cso->stream_output);
353

354
   hwcso->shader = ir3_shader_from_nir(compiler, nir, 0, &stream_output);
355

356
   /*
357
    * Create initial variants to avoid draw-time stalls.  This is
358
    * normally done asynchronously, unless debug is enabled (which
359
    * will be the case for shader-db)
360
    */
361

362
   util_queue_fence_init(&hwcso->ready);
363

364
   if (initial_variants_synchronous(ctx)) {
365
      create_initial_variants(hwcso, &ctx->debug);
366
   } else {
367
      util_queue_add_job(&ctx->screen->compile_queue, hwcso, &hwcso->ready,
368
                         create_initial_variants_async, NULL, 0);
369
   }
370

371
   return hwcso;
372
}
373

374
void
375
ir3_shader_state_delete(struct pipe_context *pctx, void *_hwcso)
376
{
377
   struct fd_context *ctx = fd_context(pctx);
378
   struct fd_screen *screen = ctx->screen;
379
   struct ir3_shader_state *hwcso = _hwcso;
380
   struct ir3_shader *so = hwcso->shader;
381

382
   ir3_cache_invalidate(ctx->shader_cache, hwcso);
383

384
   /* util_queue_drop_job() guarantees that either:
385
    *  1) job did not execute
386
    *  2) job completed
387
    *
388
    * In either case the fence is signaled
389
    */
390
   util_queue_drop_job(&screen->compile_queue, &hwcso->ready);
391

392
   /* free the uploaded shaders, since this is handled outside of the
393
    * shared ir3 code (ie. not used by turnip):
394
    */
395
   for (struct ir3_shader_variant *v = so->variants; v; v = v->next) {
396
      fd_bo_del(v->bo);
397
      v->bo = NULL;
398

399
      if (v->binning && v->binning->bo) {
400
         fd_bo_del(v->binning->bo);
401
         v->binning->bo = NULL;
402
      }
403
   }
404

405
   ir3_shader_destroy(so);
406
   util_queue_fence_destroy(&hwcso->ready);
407
   free(hwcso);
408
}
409

410
struct ir3_shader *
411
ir3_get_shader(struct ir3_shader_state *hwcso)
412
{
413
   if (!hwcso)
414
      return NULL;
415

416
   struct ir3_shader *shader = hwcso->shader;
417
   perf_time (1000, "waited for %s:%s:%s variants",
418
              _mesa_shader_stage_to_abbrev(shader->type),
419
              shader->nir->info.name,
420
              shader->nir->info.label) {
421
      /* wait for initial variants to compile: */
422
      util_queue_fence_wait(&hwcso->ready);
423
   }
424

425
   return shader;
426
}
427

428
struct shader_info *
429
ir3_get_shader_info(struct ir3_shader_state *hwcso)
430
{
431
   if (!hwcso)
432
      return NULL;
433
   return &hwcso->shader->nir->info;
434
}
435

436
/* fixup dirty shader state in case some "unrelated" (from the state-
437
 * tracker's perspective) state change causes us to switch to a
438
 * different variant.
439
 */
440
void
441
ir3_fixup_shader_state(struct pipe_context *pctx, struct ir3_shader_key *key)
442
{
443
   struct fd_context *ctx = fd_context(pctx);
444

445
   if (!ir3_shader_key_equal(ctx->last.key, key)) {
446
      if (ir3_shader_key_changes_fs(ctx->last.key, key)) {
447
         fd_context_dirty_shader(ctx, PIPE_SHADER_FRAGMENT,
448
                                 FD_DIRTY_SHADER_PROG);
449
      }
450

451
      if (ir3_shader_key_changes_vs(ctx->last.key, key)) {
452
         fd_context_dirty_shader(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_PROG);
453
      }
454

455
      /* NOTE: currently only a6xx has gs/tess, but needs no
456
       * gs/tess specific lowering.
457
       */
458

459
      *ctx->last.key = *key;
460
   }
461
}
462

463
static void
464
ir3_screen_finalize_nir(struct pipe_screen *pscreen, void *nir, bool optimize)
465
{
466
   struct fd_screen *screen = fd_screen(pscreen);
467

468
   ir3_nir_lower_io_to_temporaries(nir);
469
   ir3_finalize_nir(screen->compiler, nir);
470
}
471

472
static void
473
ir3_set_max_shader_compiler_threads(struct pipe_screen *pscreen,
474
                                    unsigned max_threads)
475
{
476
   struct fd_screen *screen = fd_screen(pscreen);
477

478
   /* This function doesn't allow a greater number of threads than
479
    * the queue had at its creation.
480
    */
481
   util_queue_adjust_num_threads(&screen->compile_queue, max_threads);
482
}
483

484
static bool
485
ir3_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen,
486
                                            void *shader,
487
                                            enum pipe_shader_type shader_type)
488
{
489
   struct ir3_shader_state *hwcso = (struct ir3_shader_state *)shader;
490

491
   return util_queue_fence_is_signalled(&hwcso->ready);
492
}
493

494
void
495
ir3_prog_init(struct pipe_context *pctx)
496
{
497
   pctx->create_vs_state = ir3_shader_state_create;
498
   pctx->delete_vs_state = ir3_shader_state_delete;
499

500
   pctx->create_tcs_state = ir3_shader_state_create;
501
   pctx->delete_tcs_state = ir3_shader_state_delete;
502

503
   pctx->create_tes_state = ir3_shader_state_create;
504
   pctx->delete_tes_state = ir3_shader_state_delete;
505

506
   pctx->create_gs_state = ir3_shader_state_create;
507
   pctx->delete_gs_state = ir3_shader_state_delete;
508

509
   pctx->create_fs_state = ir3_shader_state_create;
510
   pctx->delete_fs_state = ir3_shader_state_delete;
511
}
512

513
void
514
ir3_screen_init(struct pipe_screen *pscreen)
515
{
516
   struct fd_screen *screen = fd_screen(pscreen);
517

518
   screen->compiler = ir3_compiler_create(screen->dev, screen->gpu_id, false);
519

520
   /* TODO do we want to limit things to # of fast cores, or just limit
521
    * based on total # of both big and little cores.  The little cores
522
    * tend to be in-order and probably much slower for compiling than
523
    * big cores.  OTOH if they are sitting idle, maybe it is useful to
524
    * use them?
525
    */
526
   unsigned num_threads = sysconf(_SC_NPROCESSORS_ONLN) - 1;
527

528
   util_queue_init(&screen->compile_queue, "ir3q", 64, num_threads,
529
                   UTIL_QUEUE_INIT_RESIZE_IF_FULL |
530
                      UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL);
531

532
   pscreen->finalize_nir = ir3_screen_finalize_nir;
533
   pscreen->set_max_shader_compiler_threads =
534
      ir3_set_max_shader_compiler_threads;
535
   pscreen->is_parallel_shader_compilation_finished =
536
      ir3_is_parallel_shader_compilation_finished;
537
}
538

539
void
540
ir3_screen_fini(struct pipe_screen *pscreen)
541
{
542
   struct fd_screen *screen = fd_screen(pscreen);
543

544
   util_queue_destroy(&screen->compile_queue);
545
   ir3_compiler_destroy(screen->compiler);
546
   screen->compiler = NULL;
547
}
548

549
void
550
ir3_update_max_tf_vtx(struct fd_context *ctx,
551
                      const struct ir3_shader_variant *v)
552
{
553
   struct fd_streamout_stateobj *so = &ctx->streamout;
554
   struct ir3_stream_output_info *info = &v->shader->stream_output;
555
   uint32_t maxvtxcnt = 0x7fffffff;
556

557
   if (v->shader->stream_output.num_outputs == 0)
558
      ctx->streamout.max_tf_vtx = 0;
559
   if (so->num_targets == 0)
560
      ctx->streamout.max_tf_vtx = 0;
561

562
   /* offset to write to is:
563
    *
564
    *   total_vtxcnt = vtxcnt + offsets[i]
565
    *   offset = total_vtxcnt * stride[i]
566
    *
567
    *   offset =   vtxcnt * stride[i]       ; calculated in shader
568
    *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
569
    *
570
    * assuming for each vtx, each target buffer will have data written
571
    * up to 'offset + stride[i]', that leaves maxvtxcnt as:
572
    *
573
    *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
574
    *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
575
    *
576
    * but shader is actually doing a less-than (rather than less-than-
577
    * equal) check, so we can drop the -stride[i].
578
    *
579
    * TODO is assumption about `offset + stride[i]` legit?
580
    */
581
   for (unsigned i = 0; i < so->num_targets; i++) {
582
      struct pipe_stream_output_target *target = so->targets[i];
583
      unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */
584
      if (target) {
585
         uint32_t max = target->buffer_size / stride;
586
         maxvtxcnt = MIN2(maxvtxcnt, max);
587
      }
588
   }
589

590
   ctx->streamout.max_tf_vtx = maxvtxcnt;
591
}
592

593
Product

Resources

Company