Path: blob/21.2-virgl/src/amd/vulkan/radv_shader.c
/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "radv_shader.h"
#include "nir/nir.h"
#include "nir/nir_builder.h"
#include "spirv/nir_spirv.h"
#include "util/memstream.h"
#include "util/mesa-sha1.h"
#include "util/u_atomic.h"
#include "radv_debug.h"
#include "radv_private.h"
#include "radv_shader_args.h"
#include "radv_shader_helper.h"

#include "util/debug.h"
#include "ac_binary.h"
#include "ac_exp_param.h"
#include "ac_llvm_util.h"
#include "ac_nir.h"
#include "ac_rtld.h"
#include "aco_interface.h"
#include "sid.h"
#include "vk_format.h"

static const struct nir_shader_compiler_options nir_options = {
   .vertex_id_zero_based = true,
   .lower_scmp = true,
   .lower_flrp16 = true,
   .lower_flrp32 = true,
   .lower_flrp64 = true,
   .lower_device_index_to_zero = true,
   .lower_fdiv = true,
   .lower_fmod = true,
   .lower_ineg = true,
   .lower_bitfield_insert_to_bitfield_select = true,
   .lower_bitfield_extract = true,
   .lower_pack_snorm_2x16 = true,
   .lower_pack_snorm_4x8 = true,
   .lower_pack_unorm_2x16 = true,
   .lower_pack_unorm_4x8 = true,
   .lower_pack_half_2x16 = true,
   .lower_pack_64_2x32 = true,
   .lower_pack_64_4x16 = true,
   .lower_pack_32_2x16 = true,
   .lower_unpack_snorm_2x16 = true,
   .lower_unpack_snorm_4x8 = true,
   .lower_unpack_unorm_2x16 = true,
   .lower_unpack_unorm_4x8 = true,
   .lower_unpack_half_2x16 = true,
   .lower_ffma16 = true,
   .lower_ffma32 = true,
   .lower_ffma64 = true,
   .lower_fpow = true,
   .lower_mul_2x32_64 = true,
   .lower_rotate = true,
   .has_fsub = true,
   .has_isub = true,
   .use_scoped_barrier = true,
   .max_unroll_iterations = 32,
   .max_unroll_iterations_aggressive = 128,
   .use_interpolated_input_intrinsics = true,
   .vectorize_vec2_16bit = true,
   /* nir_lower_int64() isn't actually called for the LLVM backend, but
    * this helps the loop unrolling heuristics. */
   .lower_int64_options = nir_lower_imul64 | nir_lower_imul_high64 | nir_lower_imul_2x32_64 |
                          nir_lower_divmod64 | nir_lower_minmax64 | nir_lower_iabs64,
   .lower_doubles_options = nir_lower_drcp | nir_lower_dsqrt | nir_lower_drsq | nir_lower_ddiv,
   .divergence_analysis_options = nir_divergence_view_index_uniform,
};
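
/* These options are consumed both by spirv_to_nir() below and by the NIR
 * passes themselves; e.g. the lower_flrp* bits set here are what trigger the
 * nir_lower_flrp call in radv_optimize_nir(). */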

bool
radv_can_dump_shader(struct radv_device *device, struct vk_shader_module *module,
                     bool meta_shader)
{
   if (!(device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS))
      return false;
   if (module)
      return !module->nir || (device->instance->debug_flags & RADV_DEBUG_DUMP_META_SHADERS);

   return meta_shader;
}

bool
radv_can_dump_shader_stats(struct radv_device *device, struct vk_shader_module *module)
{
   /* Only dump non-meta shader stats. */
   return device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS && module && !module->nir;
}

void
radv_optimize_nir(const struct radv_device *device, struct nir_shader *shader,
                  bool optimize_conservatively, bool allow_copies)
{
   bool progress;
   unsigned lower_flrp = (shader->options->lower_flrp16 ? 16 : 0) |
                         (shader->options->lower_flrp32 ? 32 : 0) |
                         (shader->options->lower_flrp64 ? 64 : 0);

   do {
      progress = false;

      NIR_PASS(progress, shader, nir_split_array_vars, nir_var_function_temp);
      NIR_PASS(progress, shader, nir_shrink_vec_array_vars, nir_var_function_temp);

      NIR_PASS_V(shader, nir_lower_vars_to_ssa);

      if (allow_copies) {
         /* Only run this pass in the first call to
          * radv_optimize_nir. Later calls assume that we've
          * lowered away any copy_deref instructions and we
          * don't want to introduce any more.
          */
         NIR_PASS(progress, shader, nir_opt_find_array_copies);
      }

      NIR_PASS(progress, shader, nir_opt_copy_prop_vars);
      NIR_PASS(progress, shader, nir_opt_dead_write_vars);
      NIR_PASS(progress, shader, nir_remove_dead_variables,
               nir_var_function_temp | nir_var_shader_in | nir_var_shader_out, NULL);

      NIR_PASS_V(shader, nir_lower_alu_to_scalar, NULL, NULL);
      NIR_PASS_V(shader, nir_lower_phis_to_scalar, true);

      NIR_PASS(progress, shader, nir_copy_prop);
      NIR_PASS(progress, shader, nir_opt_remove_phis);
      NIR_PASS(progress, shader, nir_opt_dce);
      if (nir_opt_trivial_continues(shader)) {
         progress = true;
         NIR_PASS(progress, shader, nir_copy_prop);
         NIR_PASS(progress, shader, nir_opt_remove_phis);
         NIR_PASS(progress, shader, nir_opt_dce);
      }
      NIR_PASS(progress, shader, nir_opt_if, true);
      NIR_PASS(progress, shader, nir_opt_dead_cf);
      NIR_PASS(progress, shader, nir_opt_cse);
      NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true, true);
      NIR_PASS(progress, shader, nir_opt_constant_folding);
      NIR_PASS(progress, shader, nir_opt_algebraic);

      if (lower_flrp != 0) {
         bool lower_flrp_progress = false;
         NIR_PASS(lower_flrp_progress, shader, nir_lower_flrp, lower_flrp,
                  false /* always_precise */);
         if (lower_flrp_progress) {
            NIR_PASS(progress, shader, nir_opt_constant_folding);
            progress = true;
         }

         /* Nothing should rematerialize any flrps, so we only
          * need to do this lowering once.
          */
         lower_flrp = 0;
      }

      NIR_PASS(progress, shader, nir_opt_undef);
      NIR_PASS(progress, shader, nir_opt_shrink_vectors,
               !device->instance->disable_shrink_image_store);
      if (shader->options->max_unroll_iterations) {
         NIR_PASS(progress, shader, nir_opt_loop_unroll, 0);
      }
   } while (progress && !optimize_conservatively);

   NIR_PASS(progress, shader, nir_opt_conditional_discard);
   NIR_PASS(progress, shader, nir_opt_move, nir_move_load_ubo);
}
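
/* Illustrative sketch (not driver code): NIR_PASS sets its first argument to
 * true whenever the pass reports progress, so a minimal fixpoint loop in the
 * style of the one above looks like:
 *
 *    bool progress;
 *    do {
 *       progress = false;
 *       NIR_PASS(progress, shader, nir_opt_dce);
 *    } while (progress);
 */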

void
radv_optimize_nir_algebraic(nir_shader *nir, bool opt_offsets)
{
   bool more_algebraic = true;
   while (more_algebraic) {
      more_algebraic = false;
      NIR_PASS_V(nir, nir_copy_prop);
      NIR_PASS_V(nir, nir_opt_dce);
      NIR_PASS_V(nir, nir_opt_constant_folding);
      NIR_PASS_V(nir, nir_opt_cse);
      NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
   }

   if (opt_offsets)
      NIR_PASS_V(nir, nir_opt_offsets);

   /* Do late algebraic optimization to turn add(a,
    * neg(b)) back into subs, then the mandatory cleanup
    * after algebraic. Note that it may produce fnegs,
    * and if so then we need to keep running to squash
    * fneg(fneg(a)).
    */
   bool more_late_algebraic = true;
   while (more_late_algebraic) {
      more_late_algebraic = false;
      NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
      NIR_PASS_V(nir, nir_opt_constant_folding);
      NIR_PASS_V(nir, nir_copy_prop);
      NIR_PASS_V(nir, nir_opt_dce);
      NIR_PASS_V(nir, nir_opt_cse);
   }
}

static void
shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
   assert(glsl_type_is_vector_or_scalar(type));

   uint32_t comp_size = glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
   unsigned length = glsl_get_vector_elements(type);
   *size = comp_size * length, *align = comp_size;
}

struct radv_shader_debug_data {
   struct radv_device *device;
   const struct vk_shader_module *module;
};

static void
radv_spirv_nir_debug(void *private_data, enum nir_spirv_debug_level level, size_t spirv_offset,
                     const char *message)
{
   struct radv_shader_debug_data *debug_data = private_data;
   struct radv_instance *instance = debug_data->device->instance;

   static const VkDebugReportFlagsEXT vk_flags[] = {
      [NIR_SPIRV_DEBUG_LEVEL_INFO] = VK_DEBUG_REPORT_INFORMATION_BIT_EXT,
      [NIR_SPIRV_DEBUG_LEVEL_WARNING] = VK_DEBUG_REPORT_WARNING_BIT_EXT,
      [NIR_SPIRV_DEBUG_LEVEL_ERROR] = VK_DEBUG_REPORT_ERROR_BIT_EXT,
   };
   char buffer[256];

   snprintf(buffer, sizeof(buffer), "SPIR-V offset %lu: %s", (unsigned long)spirv_offset, message);

   vk_debug_report(&instance->vk, vk_flags[level], &debug_data->module->base, 0, 0, "radv", buffer);
}

static void
radv_compiler_debug(void *private_data, enum radv_compiler_debug_level level, const char *message)
{
   struct radv_shader_debug_data *debug_data = private_data;
   struct radv_instance *instance = debug_data->device->instance;

   static const VkDebugReportFlagsEXT vk_flags[] = {
      [RADV_COMPILER_DEBUG_LEVEL_PERFWARN] = VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT,
      [RADV_COMPILER_DEBUG_LEVEL_ERROR] = VK_DEBUG_REPORT_ERROR_BIT_EXT,
   };

   /* VK_DEBUG_REPORT_DEBUG_BIT_EXT specifies diagnostic information
    * from the implementation and layers.
    */
   vk_debug_report(&instance->vk, vk_flags[level] | VK_DEBUG_REPORT_DEBUG_BIT_EXT,
                   &debug_data->module->base, 0, 0, "radv", message);
}

static nir_ssa_def *
convert_pointer_to_64(nir_builder *b, const struct radv_physical_device *pdev, nir_ssa_def *ptr)
{
   nir_ssa_def *comp[] = {ptr, nir_imm_int(b, pdev->rad_info.address32_hi)};
   return nir_pack_64_2x32(b, nir_vec(b, comp, 2));
}
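
/* Lowers the Vulkan descriptor intrinsics into RADV's representation:
 * resource indices become a (set_ptr, binding_ptr, stride) vec3, reindexing
 * advances binding_ptr by index * stride, and acceleration structure
 * descriptors are fetched through a 64-bit global address built with
 * convert_pointer_to_64() above. */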

static bool
lower_intrinsics(nir_shader *nir, const struct radv_pipeline_key *key,
                 const struct radv_pipeline_layout *layout, const struct radv_physical_device *pdev)
{
   nir_function_impl *entry = nir_shader_get_entrypoint(nir);
   bool progress = false;
   nir_builder b;

   nir_builder_init(&b, entry);

   nir_foreach_block (block, entry) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         b.cursor = nir_before_instr(&intrin->instr);

         nir_ssa_def *def = NULL;
         switch (intrin->intrinsic) {
         case nir_intrinsic_load_vulkan_descriptor:
            if (nir_intrinsic_desc_type(intrin) == VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR) {
               nir_ssa_def *addr =
                  convert_pointer_to_64(&b, pdev,
                                        nir_iadd(&b, nir_channels(&b, intrin->src[0].ssa, 1),
                                                 nir_channels(&b, intrin->src[0].ssa, 2)));

               def = nir_build_load_global(&b, 1, 64, addr, .access = ACCESS_NON_WRITEABLE,
                                           .align_mul = 8, .align_offset = 0);
            } else {
               def = nir_vec3(&b, nir_channel(&b, intrin->src[0].ssa, 0),
                              nir_channel(&b, intrin->src[0].ssa, 1), nir_imm_int(&b, 0));
            }
            break;
         case nir_intrinsic_vulkan_resource_index: {
            unsigned desc_set = nir_intrinsic_desc_set(intrin);
            unsigned binding = nir_intrinsic_binding(intrin);
            struct radv_descriptor_set_layout *desc_layout = layout->set[desc_set].layout;

            nir_ssa_def *new_res = nir_vulkan_resource_index(
               &b, 3, 32, intrin->src[0].ssa, .desc_set = desc_set, .binding = binding,
               .desc_type = nir_intrinsic_desc_type(intrin));
            nir_ssa_def *set_ptr = nir_channel(&b, new_res, 0);
            nir_ssa_def *binding_ptr = nir_channel(&b, new_res, 1);

            nir_ssa_def *stride;
            if (desc_layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
                desc_layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
               stride = nir_imm_int(&b, 16);
            } else {
               stride = nir_imm_int(&b, desc_layout->binding[binding].size);
            }
            def = nir_vec3(&b, set_ptr, binding_ptr, stride);
            break;
         }
         case nir_intrinsic_vulkan_resource_reindex: {
            nir_ssa_def *set_ptr = nir_channel(&b, intrin->src[0].ssa, 0);
            nir_ssa_def *binding_ptr = nir_channel(&b, intrin->src[0].ssa, 1);
            nir_ssa_def *stride = nir_channel(&b, intrin->src[0].ssa, 2);
            binding_ptr = nir_iadd(&b, binding_ptr, nir_imul(&b, intrin->src[1].ssa, stride));
            def = nir_vec3(&b, set_ptr, binding_ptr, stride);
            break;
         }
         case nir_intrinsic_is_sparse_texels_resident:
            def = nir_ieq_imm(&b, intrin->src[0].ssa, 0);
            break;
         case nir_intrinsic_sparse_residency_code_and:
            def = nir_ior(&b, intrin->src[0].ssa, intrin->src[1].ssa);
            break;
         case nir_intrinsic_load_view_index:
            if (key->has_multiview_view_index)
               continue;
            def = nir_imm_zero(&b, 1, 32);
            break;
         default:
            continue;
         }

         nir_ssa_def_rewrite_uses(&intrin->dest.ssa, def);

         nir_instr_remove(instr);
         progress = true;
      }
   }

   return progress;
}

static bool
radv_lower_primitive_shading_rate(nir_shader *nir)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   bool progress = false;

   nir_builder b;
   nir_builder_init(&b, impl);

   /* Iterate in reverse order since there should be only one deref store to PRIMITIVE_SHADING_RATE
    * after lower_io_to_temporaries for vertex shaders.
    */
   nir_foreach_block_reverse(block, impl) {
      nir_foreach_instr_reverse(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_deref)
            continue;

         nir_variable *var = nir_intrinsic_get_var(intr, 0);
         if (var->data.mode != nir_var_shader_out ||
             var->data.location != VARYING_SLOT_PRIMITIVE_SHADING_RATE)
            continue;

         b.cursor = nir_before_instr(instr);

         nir_ssa_def *val = nir_ssa_for_src(&b, intr->src[1], 1);

         /* x_rate = (shadingRate & (Horizontal2Pixels | Horizontal4Pixels)) ? 0x1 : 0x0; */
         nir_ssa_def *x_rate = nir_iand(&b, val, nir_imm_int(&b, 12));
         x_rate = nir_b2i32(&b, nir_ine(&b, x_rate, nir_imm_int(&b, 0)));

         /* y_rate = (shadingRate & (Vertical2Pixels | Vertical4Pixels)) ? 0x1 : 0x0; */
         nir_ssa_def *y_rate = nir_iand(&b, val, nir_imm_int(&b, 3));
         y_rate = nir_b2i32(&b, nir_ine(&b, y_rate, nir_imm_int(&b, 0)));

         /* Bits [2:3] = VRS rate X
          * Bits [4:5] = VRS rate Y
          * HW shading rate = (xRate << 2) | (yRate << 4)
          */
         nir_ssa_def *out = nir_ior(&b, nir_ishl(&b, x_rate, nir_imm_int(&b, 2)),
                                    nir_ishl(&b, y_rate, nir_imm_int(&b, 4)));

         nir_instr_rewrite_src(&intr->instr, &intr->src[1], nir_src_for_ssa(out));

         progress = true;
         if (nir->info.stage == MESA_SHADER_VERTEX)
            return progress;
      }
   }

   return progress;
}
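
/* Worked example for the packing above: a 2x2 rate arrives from the API as
 * Horizontal2Pixels | Vertical2Pixels = 0x4 | 0x1, so x_rate = y_rate = 1 and
 * the HW value is (1 << 2) | (1 << 4) = 0x14. */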

nir_shader *
radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module *module,
                           const char *entrypoint_name, gl_shader_stage stage,
                           const VkSpecializationInfo *spec_info, const VkPipelineCreateFlags flags,
                           const struct radv_pipeline_layout *layout,
                           const struct radv_pipeline_key *key)
{
   unsigned subgroup_size = 64, ballot_bit_size = 64;
   if (key->compute_subgroup_size) {
      /* Only compute shaders currently support requiring a
       * specific subgroup size.
       */
      assert(stage == MESA_SHADER_COMPUTE);
      subgroup_size = key->compute_subgroup_size;
      ballot_bit_size = key->compute_subgroup_size;
   }

   nir_shader *nir;

   if (module->nir) {
      /* Some things such as our meta clear/blit code will give us a NIR
       * shader directly. In that case, we ignore the SPIR-V entirely
       * and just use the NIR shader. */
      nir = module->nir;
      nir->options = &nir_options;
      nir_validate_shader(nir, "in internal shader");

      assert(exec_list_length(&nir->functions) == 1);
   } else {
      uint32_t *spirv = (uint32_t *)module->data;
      assert(module->size % 4 == 0);

      if (device->instance->debug_flags & RADV_DEBUG_DUMP_SPIRV)
         radv_print_spirv(module->data, module->size, stderr);

      uint32_t num_spec_entries = 0;
      struct nir_spirv_specialization *spec_entries = NULL;
      if (spec_info && spec_info->mapEntryCount > 0) {
         num_spec_entries = spec_info->mapEntryCount;
         spec_entries = calloc(num_spec_entries, sizeof(*spec_entries));
         for (uint32_t i = 0; i < num_spec_entries; i++) {
            VkSpecializationMapEntry entry = spec_info->pMapEntries[i];
            const void *data = (uint8_t *)spec_info->pData + entry.offset;
            assert((uint8_t *)data + entry.size <=
                   (uint8_t *)spec_info->pData + spec_info->dataSize);

            spec_entries[i].id = spec_info->pMapEntries[i].constantID;
            switch (entry.size) {
            case 8:
               memcpy(&spec_entries[i].value.u64, data, sizeof(uint64_t));
               break;
            case 4:
               memcpy(&spec_entries[i].value.u32, data, sizeof(uint32_t));
               break;
            case 2:
               memcpy(&spec_entries[i].value.u16, data, sizeof(uint16_t));
               break;
            case 1:
               memcpy(&spec_entries[i].value.u8, data, sizeof(uint8_t));
               break;
            case 0:
               /* The Vulkan spec says:
                *
                * "For a constantID specialization constant declared in a shader, size must match
                * the byte size of the constantID. If the specialization constant is of type
                * boolean, size must be the byte size of VkBool32."
                *
                * Therefore, since only scalars can be decorated as specialization constants, we can
                * assume that if it doesn't have a size of 1, 2, 4, or 8, any use in a shader would
                * be invalid usage. The spec further says:
                *
                * "If a constantID value is not a specialization constant ID used in the shader,
                * that map entry does not affect the behavior of the pipeline."
                *
                * so we should ignore any invalid specialization constants rather than crash or
                * error out when we see one.
                */
               break;
            default:
               assert(!"Invalid spec constant size");
               break;
            }
         }
      }
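
      /* For reference, a hypothetical API-side specialization that the loop
       * above would parse (names and values are illustrative only):
       *
       *    const uint32_t value = 64;
       *    const VkSpecializationMapEntry entry = {
       *       .constantID = 0, .offset = 0, .size = sizeof(value),
       *    };
       *    const VkSpecializationInfo spec_info = {
       *       .mapEntryCount = 1,
       *       .pMapEntries = &entry,
       *       .dataSize = sizeof(value),
       *       .pData = &value,
       *    };
       *
       * This would land in spec_entries[0].value.u32 via the size == 4 case. */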

      struct radv_shader_debug_data spirv_debug_data = {
         .device = device,
         .module = module,
      };
      const struct spirv_to_nir_options spirv_options = {
         .caps =
            {
               .amd_fragment_mask = true,
               .amd_gcn_shader = true,
               .amd_image_gather_bias_lod = true,
               .amd_image_read_write_lod = true,
               .amd_shader_ballot = true,
               .amd_shader_explicit_vertex_parameter = true,
               .amd_trinary_minmax = true,
               .demote_to_helper_invocation = true,
               .derivative_group = true,
               .descriptor_array_dynamic_indexing = true,
               .descriptor_array_non_uniform_indexing = true,
               .descriptor_indexing = true,
               .device_group = true,
               .draw_parameters = true,
               .float_controls = true,
               .float16 = device->physical_device->rad_info.has_packed_math_16bit,
               .float32_atomic_add = true,
               .float64 = true,
               .geometry_streams = true,
               .image_atomic_int64 = true,
               .image_ms_array = true,
               .image_read_without_format = true,
               .image_write_without_format = true,
               .int8 = true,
               .int16 = true,
               .int64 = true,
               .int64_atomics = true,
               .min_lod = true,
               .multiview = true,
               .physical_storage_buffer_address = true,
               .post_depth_coverage = true,
               .runtime_descriptor_array = true,
               .shader_clock = true,
               .shader_viewport_index_layer = true,
               .sparse_residency = true,
               .stencil_export = true,
               .storage_8bit = true,
               .storage_16bit = true,
               .storage_image_ms = true,
               .subgroup_arithmetic = true,
               .subgroup_ballot = true,
               .subgroup_basic = true,
               .subgroup_quad = true,
               .subgroup_shuffle = true,
               .subgroup_uniform_control_flow = true,
               .subgroup_vote = true,
               .tessellation = true,
               .transform_feedback = true,
               .variable_pointers = true,
               .vk_memory_model = true,
               .vk_memory_model_device_scope = true,
               .fragment_shading_rate = device->physical_device->rad_info.chip_class >= GFX10_3,
               .workgroup_memory_explicit_layout = true,
            },
         .ubo_addr_format = nir_address_format_vec2_index_32bit_offset,
         .ssbo_addr_format = nir_address_format_vec2_index_32bit_offset,
         .phys_ssbo_addr_format = nir_address_format_64bit_global,
         .push_const_addr_format = nir_address_format_logical,
         .shared_addr_format = nir_address_format_32bit_offset,
         .frag_coord_is_sysval = true,
         .use_deref_buffer_array_length = true,
         .debug =
            {
               .func = radv_spirv_nir_debug,
               .private_data = &spirv_debug_data,
            },
      };
      nir = spirv_to_nir(spirv, module->size / 4, spec_entries, num_spec_entries, stage,
                         entrypoint_name, &spirv_options, &nir_options);
      assert(nir->info.stage == stage);
      nir_validate_shader(nir, "after spirv_to_nir");

      free(spec_entries);
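
      /* Note that the vec2_index_32bit_offset formats requested above have to
       * match the nir_lower_explicit_io(..., nir_var_mem_ubo | nir_var_mem_ssbo, ...)
       * call later in this function, which consumes the same representation. */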

      /* We have to lower away local constant initializers right before we
       * inline functions. That way they get properly initialized at the top
       * of the function and not at the top of its caller.
       */
      NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
      NIR_PASS_V(nir, nir_lower_returns);
      NIR_PASS_V(nir, nir_inline_functions);
      NIR_PASS_V(nir, nir_copy_prop);
      NIR_PASS_V(nir, nir_opt_deref);

      /* Pick off the single entrypoint that we want */
      foreach_list_typed_safe(nir_function, func, node, &nir->functions)
      {
         if (func->is_entrypoint)
            func->name = ralloc_strdup(func, "main");
         else
            exec_node_remove(&func->node);
      }
      assert(exec_list_length(&nir->functions) == 1);

      /* Make sure we lower constant initializers on output variables so that
       * nir_remove_dead_variables below sees the corresponding stores
       */
      NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_shader_out);

      /* Now that we've deleted all but the main function, we can go ahead and
       * lower the rest of the constant initializers.
       */
      NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);

      /* Split member structs. We do this before lower_io_to_temporaries so that
       * it doesn't lower system values to temporaries by accident.
       */
      NIR_PASS_V(nir, nir_split_var_copies);
      NIR_PASS_V(nir, nir_split_per_member_structs);

      if (nir->info.stage == MESA_SHADER_FRAGMENT)
         NIR_PASS_V(nir, nir_lower_io_to_vector, nir_var_shader_out);
      if (nir->info.stage == MESA_SHADER_FRAGMENT)
         NIR_PASS_V(nir, nir_lower_input_attachments,
                    &(nir_input_attachment_options){
                       .use_fragcoord_sysval = true,
                       .use_layer_id_sysval = false,
                    });

      NIR_PASS_V(nir, nir_remove_dead_variables,
                 nir_var_shader_in | nir_var_shader_out | nir_var_system_value | nir_var_mem_shared,
                 NULL);

      /* Variables can make nir_propagate_invariant more conservative
       * than it needs to be.
       */
      NIR_PASS_V(nir, nir_lower_global_vars_to_local);
      NIR_PASS_V(nir, nir_lower_vars_to_ssa);

      NIR_PASS_V(nir, nir_propagate_invariant,
                 device->instance->debug_flags & RADV_DEBUG_INVARIANT_GEOM);

      NIR_PASS_V(nir, nir_lower_system_values);
      NIR_PASS_V(nir, nir_lower_compute_system_values, NULL);

      NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays);

      NIR_PASS_V(nir, nir_lower_discard_or_demote,
                 device->instance->debug_flags & RADV_DEBUG_DISCARD_TO_DEMOTE);

      nir_lower_doubles_options lower_doubles = nir->options->lower_doubles_options;

      if (device->physical_device->rad_info.chip_class == GFX6) {
         /* GFX6 doesn't support v_floor_f64 and the precision
          * of v_fract_f64 which is used to implement 64-bit
          * floor is less than what Vulkan requires.
          */
         lower_doubles |= nir_lower_dfloor;
      }

      NIR_PASS_V(nir, nir_lower_doubles, NULL, lower_doubles);
   }

   /* Vulkan uses the separate-shader linking model */
   nir->info.separate_shader = true;
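
   /* From this point on, everything runs for both paths: shaders that came in
    * as SPIR-V above and internal meta shaders that were handed to us as NIR
    * directly. */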

   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      unsigned nir_gs_flags = nir_lower_gs_intrinsics_per_stream;

      if (device->physical_device->use_ngg && !radv_use_llvm_for_stage(device, stage)) {
         /* ACO needs NIR to do some of the hard lifting */
         nir_gs_flags |= nir_lower_gs_intrinsics_count_primitives |
                         nir_lower_gs_intrinsics_count_vertices_per_primitive |
                         nir_lower_gs_intrinsics_overwrite_incomplete;
      }

      nir_lower_gs_intrinsics(nir, nir_gs_flags);
   }

   static const nir_lower_tex_options tex_options = {
      .lower_txp = ~0,
      .lower_tg4_offsets = true,
   };

   nir_lower_tex(nir, &tex_options);

   nir_lower_vars_to_ssa(nir);

   if (nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_GEOMETRY ||
       nir->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), true, true);
   } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
      NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), true, false);
   }

   nir_split_var_copies(nir);

   nir_lower_global_vars_to_local(nir);
   nir_remove_dead_variables(nir, nir_var_function_temp, NULL);
   bool gfx7minus = device->physical_device->rad_info.chip_class <= GFX7;
   nir_lower_subgroups(nir, &(struct nir_lower_subgroups_options){
                               .subgroup_size = subgroup_size,
                               .ballot_bit_size = ballot_bit_size,
                               .ballot_components = 1,
                               .lower_to_scalar = 1,
                               .lower_subgroup_masks = 1,
                               .lower_shuffle = 1,
                               .lower_shuffle_to_32bit = 1,
                               .lower_vote_eq = 1,
                               .lower_quad_broadcast_dynamic = 1,
                               .lower_quad_broadcast_dynamic_to_const = gfx7minus,
                               .lower_shuffle_to_swizzle_amd = 1,
                               .lower_elect = radv_use_llvm_for_stage(device, stage),
                            });

   nir_lower_load_const_to_scalar(nir);

   if (!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT))
      radv_optimize_nir(device, nir, false, true);

   /* call radv_nir_lower_ycbcr_textures() late as there might still be
    * tex with undef texture/sampler before first optimization */
   NIR_PASS_V(nir, radv_nir_lower_ycbcr_textures, layout);

   /* We call nir_lower_var_copies() after the first radv_optimize_nir()
    * to remove any copies introduced by nir_opt_find_array_copies().
    */
   nir_lower_var_copies(nir);

   const nir_opt_access_options opt_access_options = {
      .is_vulkan = true,
      .infer_non_readable = true,
   };
   NIR_PASS_V(nir, nir_opt_access, &opt_access_options);

   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_push_const, nir_address_format_32bit_offset);

   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo | nir_var_mem_ssbo,
              nir_address_format_vec2_index_32bit_offset);

   NIR_PASS_V(nir, lower_intrinsics, key, layout, device->physical_device);

   /* Lower deref operations for compute shared memory. */
   if (nir->info.stage == MESA_SHADER_COMPUTE) {
      if (!nir->info.shared_memory_explicit_layout) {
         NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared, shared_var_info);
      }
      NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset);

      if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
         const unsigned chunk_size = 16; /* max single store size */
         const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
         NIR_PASS_V(nir, nir_zero_initialize_shared_memory, shared_size, chunk_size);
      }
   }
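
   /* Example of the zero-init sizing above: shared_size = 36 rounds up to 48,
    * which the pass can then clear with three 16-byte stores. */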

   nir_lower_explicit_io(nir, nir_var_mem_global, nir_address_format_64bit_global);

   /* Lower large variables that are always constant with load_constant
    * intrinsics, which get turned into PC-relative loads from a data
    * section next to the shader.
    */
   NIR_PASS_V(nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);

   /* Lower primitive shading rate to match HW requirements. */
   if ((nir->info.stage == MESA_SHADER_VERTEX ||
        nir->info.stage == MESA_SHADER_GEOMETRY) &&
       nir->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
      NIR_PASS_V(nir, radv_lower_primitive_shading_rate);
   }

   /* Indirect lowering must be called after the radv_optimize_nir() loop
    * has been called at least once. Otherwise indirect lowering can
    * bloat the instruction count of the loop and cause it to be
    * considered too large for unrolling.
    */
   if (ac_nir_lower_indirect_derefs(nir, device->physical_device->rad_info.chip_class) &&
       !(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT) &&
       nir->info.stage != MESA_SHADER_COMPUTE) {
      /* Optimize the lowered code before the linking optimizations. */
      radv_optimize_nir(device, nir, false, false);
   }

   return nir;
}

static int
type_size_vec4(const struct glsl_type *type, bool bindless)
{
   return glsl_count_attribute_slots(type, false);
}
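
/* glsl_count_attribute_slots() counts in vec4-sized slots: a float or vec4
 * takes 1 slot, a mat4 takes 4, and so on. */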

static nir_variable *
find_layer_in_var(nir_shader *nir)
{
   nir_variable *var = nir_find_variable_with_location(nir, nir_var_shader_in, VARYING_SLOT_LAYER);
   if (var != NULL)
      return var;

   var = nir_variable_create(nir, nir_var_shader_in, glsl_int_type(), "layer id");
   var->data.location = VARYING_SLOT_LAYER;
   var->data.interpolation = INTERP_MODE_FLAT;
   return var;
}

/* We use layered rendering to implement multiview, which means we need to map
 * view_index to gl_Layer. The code generates a load from the layer_id sysval,
 * but since we don't have a way to get at this information from the fragment
 * shader, we also need to lower this to the gl_Layer varying. This pass
 * lowers both to a varying load from the LAYER slot, before lowering io, so
 * that nir_assign_var_locations() will give the LAYER varying the correct
 * driver_location.
 */

static bool
lower_view_index(nir_shader *nir)
{
   bool progress = false;
   nir_function_impl *entry = nir_shader_get_entrypoint(nir);
   nir_builder b;
   nir_builder_init(&b, entry);

   nir_variable *layer = NULL;
   nir_foreach_block (block, entry) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
         if (load->intrinsic != nir_intrinsic_load_view_index)
            continue;

         if (!layer)
            layer = find_layer_in_var(nir);

         b.cursor = nir_before_instr(instr);
         nir_ssa_def *def = nir_load_var(&b, layer);
         nir_ssa_def_rewrite_uses(&load->dest.ssa, def);

         nir_instr_remove(instr);
         progress = true;
      }
   }

   return progress;
}

void
radv_lower_io(struct radv_device *device, nir_shader *nir)
{
   if (nir->info.stage == MESA_SHADER_COMPUTE)
      return;

   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS_V(nir, lower_view_index);
      nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, MESA_SHADER_FRAGMENT);
   }

   /* The RADV/LLVM backend expects 64-bit IO to be lowered. */
   nir_lower_io_options options =
      radv_use_llvm_for_stage(device, nir->info.stage) ? nir_lower_io_lower_64bit_to_32 : 0;

   NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, type_size_vec4, options);

   /* This pass needs actual constants */
   nir_opt_constant_folding(nir);

   NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | nir_var_shader_out);
}

bool
radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
                     struct radv_shader_info *info, const struct radv_pipeline_key *pl_key)
{
   if (nir->info.stage == MESA_SHADER_VERTEX) {
      if (info->vs.as_ls) {
         ac_nir_lower_ls_outputs_to_mem(nir, info->vs.tcs_in_out_eq,
                                        info->vs.tcs_temp_only_input_mask,
                                        info->vs.num_linked_outputs);
         return true;
      } else if (info->vs.as_es) {
         ac_nir_lower_es_outputs_to_mem(nir, device->physical_device->rad_info.chip_class,
                                        info->vs.num_linked_outputs);
         return true;
      }
   } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
      ac_nir_lower_hs_inputs_to_mem(nir, info->vs.tcs_in_out_eq, info->tcs.num_linked_inputs);
      ac_nir_lower_hs_outputs_to_mem(
         nir, device->physical_device->rad_info.chip_class, info->tcs.tes_reads_tess_factors,
         info->tcs.tes_inputs_read, info->tcs.tes_patch_inputs_read, info->tcs.num_linked_inputs,
         info->tcs.num_linked_outputs, info->tcs.num_linked_patch_outputs, true);
      ac_nir_lower_tess_to_const(nir, pl_key->tess_input_vertices, info->num_tess_patches,
                                 ac_nir_lower_patch_vtx_in | ac_nir_lower_num_patches);

      return true;
   } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
      ac_nir_lower_tes_inputs_to_mem(nir, info->tes.num_linked_inputs,
                                     info->tes.num_linked_patch_inputs);
      ac_nir_lower_tess_to_const(nir, nir->info.tess.tcs_vertices_out, info->num_tess_patches,
                                 ac_nir_lower_patch_vtx_in | ac_nir_lower_num_patches);

      if (info->tes.as_es) {
         ac_nir_lower_es_outputs_to_mem(nir, device->physical_device->rad_info.chip_class,
                                        info->tes.num_linked_outputs);
      }

      return true;
   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      ac_nir_lower_gs_inputs_to_mem(nir, device->physical_device->rad_info.chip_class,
                                    info->gs.num_linked_inputs);
      return true;
   }

   return false;
}
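
/* Roughly, the lowering above maps: VS outputs to LDS (as LS) or the ESGS
 * ring (as ES), TCS inputs/outputs to LDS plus the tess factor and offchip
 * buffers, TES inputs to the offchip buffer (plus ESGS ring outputs when it
 * acts as ES), and GS inputs to the ESGS ring. */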

bool
radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
                      uint64_t ps_inputs_read)
{
   /* Culling doesn't make sense for meta shaders. */
   if (!!nir->info.name)
      return false;

   /* We don't support culling with multiple viewports yet. */
   if (nir->info.outputs_written & (VARYING_BIT_VIEWPORT | VARYING_BIT_VIEWPORT_MASK))
      return false;

   /* TODO: enable by default on GFX10.3 when we're confident about performance. */
   bool culling_enabled = device->instance->perftest_flags & RADV_PERFTEST_NGGC;

   if (!culling_enabled)
      return false;

   /* Shader based culling efficiency can depend on PS throughput.
    * Estimate an upper limit for PS input param count based on GPU info.
    */
   unsigned max_ps_params;
   unsigned max_render_backends = device->physical_device->rad_info.max_render_backends;
   unsigned max_se = device->physical_device->rad_info.max_se;

   if (max_render_backends < 2)
      return false; /* Don't use NGG culling on 1 RB chips. */
   else if (max_render_backends / max_se == 4)
      max_ps_params = 6; /* Sienna Cichlid and other GFX10.3 dGPUs. */
   else
      max_ps_params = 4; /* Navi 1x. */

   /* TODO: consider other heuristics here, such as PS execution time */

   return util_bitcount64(ps_inputs_read & ~VARYING_BIT_POS) <= max_ps_params;
}
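
/* Example of the heuristic above: a GFX10.3 dGPU with max_render_backends /
 * max_se == 4 allows NGG culling for shaders whose PS reads at most 6 input
 * params (POS excluded); Navi 1x parts allow at most 4. */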

void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
                    struct radv_shader_info *info,
                    const struct radv_pipeline_key *pl_key,
                    struct radv_shader_variant_key *key,
                    bool consider_culling)
{
   /* TODO: support the LLVM backend with the NIR lowering */
   assert(!radv_use_llvm_for_stage(device, nir->info.stage));

   assert(nir->info.stage == MESA_SHADER_VERTEX ||
          nir->info.stage == MESA_SHADER_TESS_EVAL ||
          nir->info.stage == MESA_SHADER_GEOMETRY);

   ac_nir_ngg_config out_conf = {0};
   const struct gfx10_ngg_info *ngg_info = &info->ngg_info;
   unsigned num_gs_invocations =
      (nir->info.stage != MESA_SHADER_GEOMETRY || ngg_info->max_vert_out_per_gs_instance)
         ? 1
         : info->gs.invocations;
   unsigned num_vertices_per_prim = 3;

   /* Get the number of vertices per input primitive */
   if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
      if (nir->info.tess.point_mode)
         num_vertices_per_prim = 1;
      else if (nir->info.tess.primitive_mode == GL_ISOLINES)
         num_vertices_per_prim = 2;

      /* Manually mark the primitive ID used, so the shader can repack it. */
      if (key->vs_common_out.export_prim_id)
         BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);

   } else if (nir->info.stage == MESA_SHADER_VERTEX) {
      /* Need to add 1, because: V_028A6C_POINTLIST=0, V_028A6C_LINESTRIP=1, V_028A6C_TRISTRIP=2, etc. */
      num_vertices_per_prim = key->vs.outprim + 1;

      /* Manually mark the instance ID used, so the shader can repack it. */
      if (key->vs.instance_rate_inputs)
         BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);

   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      num_vertices_per_prim = nir->info.gs.vertices_in;
   } else {
      unreachable("NGG needs to be VS, TES or GS.");
   }

   /* Invocations that process an input vertex */
   unsigned max_vtx_in = MIN2(256, ngg_info->enable_vertex_grouping
                                      ? ngg_info->hw_max_esverts
                                      : num_vertices_per_prim * ngg_info->max_gsprims);
   /* Invocations that export an output vertex */
   unsigned max_vtx_out = ngg_info->max_out_verts;
   /* Invocations that process an input primitive */
   unsigned max_prm_in = ngg_info->max_gsprims * num_gs_invocations;
   /* Invocations that produce an output primitive */
   unsigned max_prm_out = ngg_info->max_gsprims * num_gs_invocations * ngg_info->prim_amp_factor;

   unsigned max_workgroup_size = MAX4(max_vtx_in, max_vtx_out, max_prm_in, max_prm_out);

   /* Maximum HW limit for NGG workgroups */
   max_workgroup_size = MIN2(256, max_workgroup_size);

   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL) {
      assert(key->vs_common_out.as_ngg);

      if (consider_culling)
         radv_optimize_nir_algebraic(nir, false);

      out_conf =
         ac_nir_lower_ngg_nogs(
            nir,
            max_vtx_in,
            num_vertices_per_prim,
            max_workgroup_size,
            info->wave_size,
            consider_culling,
            key->vs_common_out.as_ngg_passthrough,
            key->vs_common_out.export_prim_id,
            key->vs.provoking_vtx_last);

      info->has_ngg_culling = out_conf.can_cull;
      info->has_ngg_early_prim_export = out_conf.early_prim_export;
      info->num_lds_blocks_when_not_culling =
         DIV_ROUND_UP(out_conf.lds_bytes_if_culling_off,
                      device->physical_device->rad_info.lds_encode_granularity);
      info->is_ngg_passthrough = out_conf.passthrough;
      key->vs_common_out.as_ngg_passthrough = out_conf.passthrough;
   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      assert(info->is_ngg);
      ac_nir_lower_ngg_gs(
         nir, info->wave_size, max_workgroup_size,
         info->ngg_info.esgs_ring_size,
         info->gs.gsvs_vertex_size,
         info->ngg_info.ngg_emit_size * 4u,
         key->vs.provoking_vtx_last);
   } else {
      unreachable("invalid SW stage passed to radv_lower_ngg");
   }
}
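
/* Shader code is sub-allocated from slabs of at least 256 KiB: the loop below
 * first-fit scans each slab's shader list (kept sorted by offset, with gaps
 * aligned to 256 bytes) and only allocates a fresh slab when no gap is large
 * enough, all under shader_slab_mutex. */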

static void *
radv_alloc_shader_memory(struct radv_device *device, struct radv_shader_variant *shader)
{
   mtx_lock(&device->shader_slab_mutex);
   list_for_each_entry(struct radv_shader_slab, slab, &device->shader_slabs, slabs)
   {
      uint64_t offset = 0;

#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wshadow"
#endif
      list_for_each_entry(struct radv_shader_variant, s, &slab->shaders, slab_list)
      {
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
         if (s->bo_offset - offset >= shader->code_size) {
            shader->bo = slab->bo;
            shader->bo_offset = offset;
            list_addtail(&shader->slab_list, &s->slab_list);
            mtx_unlock(&device->shader_slab_mutex);
            return slab->ptr + offset;
         }
         offset = align_u64(s->bo_offset + s->code_size, 256);
      }
      if (offset <= slab->size && slab->size - offset >= shader->code_size) {
         shader->bo = slab->bo;
         shader->bo_offset = offset;
         list_addtail(&shader->slab_list, &slab->shaders);
         mtx_unlock(&device->shader_slab_mutex);
         return slab->ptr + offset;
      }
   }

   mtx_unlock(&device->shader_slab_mutex);
   struct radv_shader_slab *slab = calloc(1, sizeof(struct radv_shader_slab));

   slab->size = MAX2(256 * 1024, shader->code_size);
   VkResult result = device->ws->buffer_create(
      device->ws, slab->size, 256, RADEON_DOMAIN_VRAM,
      RADEON_FLAG_NO_INTERPROCESS_SHARING |
         (device->physical_device->rad_info.cpdma_prefetch_writes_memory
             ? 0
             : RADEON_FLAG_READ_ONLY),
      RADV_BO_PRIORITY_SHADER, 0, &slab->bo);
   if (result != VK_SUCCESS) {
      free(slab);
      return NULL;
   }

   slab->ptr = (char *)device->ws->buffer_map(slab->bo);
   if (!slab->ptr) {
      device->ws->buffer_destroy(device->ws, slab->bo);
      free(slab);
      return NULL;
   }

   list_inithead(&slab->shaders);

   mtx_lock(&device->shader_slab_mutex);
   list_add(&slab->slabs, &device->shader_slabs);

   shader->bo = slab->bo;
   shader->bo_offset = 0;
   list_add(&shader->slab_list, &slab->shaders);
   mtx_unlock(&device->shader_slab_mutex);
   return slab->ptr;
}

void
radv_destroy_shader_slabs(struct radv_device *device)
{
   list_for_each_entry_safe(struct radv_shader_slab, slab, &device->shader_slabs, slabs)
   {
      device->ws->buffer_destroy(device->ws, slab->bo);
      free(slab);
   }
   mtx_destroy(&device->shader_slab_mutex);
}

/* For the UMR disassembler. */
#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
#define DEBUGGER_NUM_MARKERS        5

static unsigned
radv_get_shader_binary_size(size_t code_size)
{
   return code_size + DEBUGGER_NUM_MARKERS * 4;
}
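
/* E.g. a 1024-byte shader is sized as 1024 + 5 * 4 = 1044 bytes so that the
 * five end-of-code markers written at upload time always fit. */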

static bool
radv_should_use_wgp_mode(const struct radv_device *device, gl_shader_stage stage,
                         const struct radv_shader_info *info)
{
   enum chip_class chip = device->physical_device->rad_info.chip_class;
   switch (stage) {
   case MESA_SHADER_COMPUTE:
   case MESA_SHADER_TESS_CTRL:
      return chip >= GFX10;
   case MESA_SHADER_GEOMETRY:
      return chip == GFX10 || (chip >= GFX10_3 && !info->is_ngg);
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      return chip == GFX10 && info->is_ngg;
   default:
      return false;
   }
}

static void
radv_postprocess_config(const struct radv_device *device, const struct ac_shader_config *config_in,
                        const struct radv_shader_info *info, gl_shader_stage stage,
                        struct ac_shader_config *config_out)
{
   const struct radv_physical_device *pdevice = device->physical_device;
   bool scratch_enabled = config_in->scratch_bytes_per_wave > 0;
   bool trap_enabled = !!device->trap_handler_shader;
   unsigned vgpr_comp_cnt = 0;
   unsigned num_input_vgprs = info->num_input_vgprs;

   if (stage == MESA_SHADER_FRAGMENT) {
      num_input_vgprs = ac_get_fs_input_vgpr_cnt(config_in, NULL, NULL);
   }

   unsigned num_vgprs = MAX2(config_in->num_vgprs, num_input_vgprs);
   /* +3 for scratch wave offset and VCC */
   unsigned num_sgprs = MAX2(config_in->num_sgprs, info->num_input_sgprs + 3);
   unsigned num_shared_vgprs = config_in->num_shared_vgprs;
   /* shared VGPRs are introduced in Navi and are allocated in blocks of 8 (RDNA ref 3.6.5) */
   assert((pdevice->rad_info.chip_class >= GFX10 && num_shared_vgprs % 8 == 0) ||
          (pdevice->rad_info.chip_class < GFX10 && num_shared_vgprs == 0));
   unsigned num_shared_vgpr_blocks = num_shared_vgprs / 8;
   unsigned excp_en = 0;

   *config_out = *config_in;
   config_out->num_vgprs = num_vgprs;
   config_out->num_sgprs = num_sgprs;
   config_out->num_shared_vgprs = num_shared_vgprs;

   config_out->rsrc2 = S_00B12C_USER_SGPR(info->num_user_sgprs) |
                       S_00B12C_SCRATCH_EN(scratch_enabled) | S_00B12C_TRAP_PRESENT(trap_enabled);

   if (trap_enabled) {
      /* Configure the shader exceptions like memory violation, etc.
       * TODO: Enable (and validate) more exceptions.
       */
      excp_en = 1 << 8; /* mem_viol */
   }

   if (!pdevice->use_ngg_streamout) {
      config_out->rsrc2 |=
         S_00B12C_SO_BASE0_EN(!!info->so.strides[0]) | S_00B12C_SO_BASE1_EN(!!info->so.strides[1]) |
         S_00B12C_SO_BASE2_EN(!!info->so.strides[2]) | S_00B12C_SO_BASE3_EN(!!info->so.strides[3]) |
         S_00B12C_SO_EN(!!info->so.num_outputs);
   }

   config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) / (info->wave_size == 32 ? 8 : 4)) |
                       S_00B848_DX10_CLAMP(1) | S_00B848_FLOAT_MODE(config_out->float_mode);

   if (pdevice->rad_info.chip_class >= GFX10) {
      config_out->rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(info->num_user_sgprs >> 5);
   } else {
      config_out->rsrc1 |= S_00B228_SGPRS((num_sgprs - 1) / 8);
      config_out->rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(info->num_user_sgprs >> 5);
   }
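
   /* VGPRs are encoded in allocation granules: wave64 allocates in units of 4
    * and wave32 in units of 8, so e.g. 41 VGPRs in wave64 encode as
    * (41 - 1) / 4 = 10, i.e. 44 VGPRs get allocated. */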

   bool wgp_mode = radv_should_use_wgp_mode(device, stage, info);

   switch (stage) {
   case MESA_SHADER_TESS_EVAL:
      if (info->is_ngg) {
         config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
         config_out->rsrc2 |= S_00B22C_OC_LDS_EN(1) | S_00B22C_EXCP_EN(excp_en);
      } else if (info->tes.as_es) {
         assert(pdevice->rad_info.chip_class <= GFX8);
         vgpr_comp_cnt = info->uses_prim_id ? 3 : 2;

         config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1) | S_00B12C_EXCP_EN(excp_en);
      } else {
         bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id;
         vgpr_comp_cnt = enable_prim_id ? 3 : 2;

         config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
         config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1) | S_00B12C_EXCP_EN(excp_en);
      }
      config_out->rsrc2 |= S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks);
      break;
   case MESA_SHADER_TESS_CTRL:
      if (pdevice->rad_info.chip_class >= GFX9) {
         /* We need at least 2 components for LS.
          * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
          * StepRate0 is set to 1 so that VGPR3 doesn't have to be loaded.
          */
         if (pdevice->rad_info.chip_class >= GFX10) {
            vgpr_comp_cnt = info->vs.needs_instance_id ? 3 : 1;
            config_out->rsrc2 |=
               S_00B42C_LDS_SIZE_GFX10(info->tcs.num_lds_blocks) | S_00B42C_EXCP_EN_GFX6(excp_en);
         } else {
            vgpr_comp_cnt = info->vs.needs_instance_id ? 2 : 1;
            config_out->rsrc2 |=
               S_00B42C_LDS_SIZE_GFX9(info->tcs.num_lds_blocks) | S_00B42C_EXCP_EN_GFX9(excp_en);
         }
      } else {
         config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1) | S_00B12C_EXCP_EN(excp_en);
      }
      config_out->rsrc1 |=
         S_00B428_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | S_00B428_WGP_MODE(wgp_mode);
      config_out->rsrc2 |= S_00B42C_SHARED_VGPR_CNT(num_shared_vgpr_blocks);
      break;
   case MESA_SHADER_VERTEX:
      if (info->is_ngg) {
         config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
      } else if (info->vs.as_ls) {
         assert(pdevice->rad_info.chip_class <= GFX8);
         /* We need at least 2 components for LS.
          * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
          * StepRate0 is set to 1 so that VGPR3 doesn't have to be loaded.
          */
         vgpr_comp_cnt = info->vs.needs_instance_id ? 2 : 1;
      } else if (info->vs.as_es) {
         assert(pdevice->rad_info.chip_class <= GFX8);
         /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */
         vgpr_comp_cnt = info->vs.needs_instance_id ? 1 : 0;
      } else {
         /* VGPR0-3: (VertexID, InstanceID / StepRate0, PrimID, InstanceID)
          * If PrimID is disabled, InstanceID / StepRate1 is loaded instead.
          * StepRate0 is set to 1 so that VGPR3 doesn't have to be loaded.
          */
         if (info->vs.needs_instance_id && pdevice->rad_info.chip_class >= GFX10) {
            vgpr_comp_cnt = 3;
         } else if (info->vs.export_prim_id) {
            vgpr_comp_cnt = 2;
         } else if (info->vs.needs_instance_id) {
            vgpr_comp_cnt = 1;
         } else {
            vgpr_comp_cnt = 0;
         }

         config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
      }
      config_out->rsrc2 |=
         S_00B12C_SHARED_VGPR_CNT(num_shared_vgpr_blocks) | S_00B12C_EXCP_EN(excp_en);
      break;
   case MESA_SHADER_FRAGMENT:
      config_out->rsrc1 |= S_00B028_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
      config_out->rsrc2 |= S_00B02C_SHARED_VGPR_CNT(num_shared_vgpr_blocks) |
                           S_00B02C_TRAP_PRESENT(1) | S_00B02C_EXCP_EN(excp_en);
      break;
   case MESA_SHADER_GEOMETRY:
      config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
      config_out->rsrc2 |=
         S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks) | S_00B22C_EXCP_EN(excp_en);
      break;
   case MESA_SHADER_COMPUTE:
      config_out->rsrc1 |=
         S_00B848_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | S_00B848_WGP_MODE(wgp_mode);
      config_out->rsrc2 |= S_00B84C_TGID_X_EN(info->cs.uses_block_id[0]) |
                           S_00B84C_TGID_Y_EN(info->cs.uses_block_id[1]) |
                           S_00B84C_TGID_Z_EN(info->cs.uses_block_id[2]) |
                           S_00B84C_TIDIG_COMP_CNT(info->cs.uses_thread_id[2]   ? 2
                                                   : info->cs.uses_thread_id[1] ? 1
                                                                                : 0) |
                           S_00B84C_TG_SIZE_EN(info->cs.uses_local_invocation_idx) |
                           S_00B84C_LDS_SIZE(config_in->lds_size) | S_00B84C_EXCP_EN(excp_en);
      config_out->rsrc3 |= S_00B8A0_SHARED_VGPR_CNT(num_shared_vgpr_blocks);

      break;
   default:
      unreachable("unsupported shader type");
      break;
   }

   if (pdevice->rad_info.chip_class >= GFX10 && info->is_ngg &&
       (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL ||
        stage == MESA_SHADER_GEOMETRY)) {
      unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;
      gl_shader_stage es_stage = stage;
      if (stage == MESA_SHADER_GEOMETRY)
         es_stage = info->gs.es_type;

      /* VGPR5-8: (VertexID, UserVGPR0, UserVGPR1, UserVGPR2 / InstanceID) */
      if (es_stage == MESA_SHADER_VERTEX) {
         es_vgpr_comp_cnt = info->vs.needs_instance_id ? 3 : 0;
      } else if (es_stage == MESA_SHADER_TESS_EVAL) {
         bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id;
         es_vgpr_comp_cnt = enable_prim_id ? 3 : 2;
      } else
         unreachable("Unexpected ES shader stage");

      bool tes_triangles =
         stage == MESA_SHADER_TESS_EVAL && info->tes.primitive_mode >= 4; /* GL_TRIANGLES */
      if (info->uses_invocation_id || stage == MESA_SHADER_VERTEX) {
         gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
      } else if (info->uses_prim_id) {
         gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
      } else if (info->gs.vertices_in >= 3 || tes_triangles) {
         gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
      } else {
         gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
      }

      /* Disable the WGP mode on gfx10.3 because it can hang (this happened
       * on VanGogh). Let's disable it on all chips that disable exactly
       * 1 CU per SA for GS.
       */
      config_out->rsrc1 |=
         S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt) | S_00B228_WGP_MODE(wgp_mode);
      config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
                           S_00B22C_LDS_SIZE(config_in->lds_size) |
                           S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL);
   } else if (pdevice->rad_info.chip_class >= GFX9 && stage == MESA_SHADER_GEOMETRY) {
      unsigned es_type = info->gs.es_type;
      unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;

      if (es_type == MESA_SHADER_VERTEX) {
         /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */
         if (info->vs.needs_instance_id) {
            es_vgpr_comp_cnt = pdevice->rad_info.chip_class >= GFX10 ? 3 : 1;
         } else {
            es_vgpr_comp_cnt = 0;
         }
      } else if (es_type == MESA_SHADER_TESS_EVAL) {
         es_vgpr_comp_cnt = info->uses_prim_id ? 3 : 2;
      } else {
         unreachable("invalid shader ES type");
      }

      /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
       * VGPR[0:4] are always loaded.
       */
      if (info->uses_invocation_id) {
         gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
      } else if (info->uses_prim_id) {
         gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
      } else if (info->gs.vertices_in >= 3) {
         gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
      } else {
         gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
      }

      config_out->rsrc1 |=
         S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt) | S_00B228_WGP_MODE(wgp_mode);
      config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
                           S_00B22C_OC_LDS_EN(es_type == MESA_SHADER_TESS_EVAL);
   } else if (pdevice->rad_info.chip_class >= GFX9 && stage == MESA_SHADER_TESS_CTRL) {
      config_out->rsrc1 |= S_00B428_LS_VGPR_COMP_CNT(vgpr_comp_cnt);
   } else {
      config_out->rsrc1 |= S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt);
   }
}
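
/* Binaries arrive here in one of two forms: RADV_BINARY_TYPE_RTLD, an ELF
 * from the LLVM backend that still needs ac_rtld linking/relocation, or
 * RADV_BINARY_TYPE_LEGACY, raw machine code with optional stats/IR/disasm
 * blobs as produced by ACO. */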

struct radv_shader_variant *
radv_shader_variant_create(struct radv_device *device, const struct radv_shader_binary *binary,
                           bool keep_shader_info)
{
   struct ac_shader_config config = {0};
   struct ac_rtld_binary rtld_binary = {0};
   struct radv_shader_variant *variant = calloc(1, sizeof(struct radv_shader_variant));
   if (!variant)
      return NULL;

   variant->ref_count = 1;

   if (binary->type == RADV_BINARY_TYPE_RTLD) {
      struct ac_rtld_symbol lds_symbols[2];
      unsigned num_lds_symbols = 0;
      const char *elf_data = (const char *)((struct radv_shader_binary_rtld *)binary)->data;
      size_t elf_size = ((struct radv_shader_binary_rtld *)binary)->elf_size;

      if (device->physical_device->rad_info.chip_class >= GFX9 &&
          (binary->stage == MESA_SHADER_GEOMETRY || binary->info.is_ngg) &&
          !binary->is_gs_copy_shader) {
         struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
         sym->name = "esgs_ring";
         sym->size = binary->info.ngg_info.esgs_ring_size;
         sym->align = 64 * 1024;
      }

      if (binary->info.is_ngg && binary->stage == MESA_SHADER_GEOMETRY) {
         struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
         sym->name = "ngg_emit";
         sym->size = binary->info.ngg_info.ngg_emit_size * 4;
         sym->align = 4;
      }
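
      /* The LDS symbols set up above are resolved by ac_rtld at link time;
       * their sizes come from the NGG/GS info computed during compilation. */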

      struct ac_rtld_open_info open_info = {
         .info = &device->physical_device->rad_info,
         .shader_type = binary->stage,
         .wave_size = binary->info.wave_size,
         .num_parts = 1,
         .elf_ptrs = &elf_data,
         .elf_sizes = &elf_size,
         .num_shared_lds_symbols = num_lds_symbols,
         .shared_lds_symbols = lds_symbols,
      };

      if (!ac_rtld_open(&rtld_binary, open_info)) {
         free(variant);
         return NULL;
      }

      if (!ac_rtld_read_config(&device->physical_device->rad_info, &rtld_binary, &config)) {
         ac_rtld_close(&rtld_binary);
         free(variant);
         return NULL;
      }

      if (rtld_binary.lds_size > 0) {
         unsigned encode_granularity = device->physical_device->rad_info.lds_encode_granularity;
         config.lds_size = align(rtld_binary.lds_size, encode_granularity) / encode_granularity;
      }
      if (!config.lds_size && binary->stage == MESA_SHADER_TESS_CTRL) {
         /* This is used for reporting LDS statistics */
         config.lds_size = binary->info.tcs.num_lds_blocks;
      }

      variant->code_size = rtld_binary.rx_size;
      variant->exec_size = rtld_binary.exec_size;
   } else {
      assert(binary->type == RADV_BINARY_TYPE_LEGACY);
      config = ((struct radv_shader_binary_legacy *)binary)->config;
      variant->code_size =
         radv_get_shader_binary_size(((struct radv_shader_binary_legacy *)binary)->code_size);
      variant->exec_size = ((struct radv_shader_binary_legacy *)binary)->exec_size;
   }

   variant->info = binary->info;
   radv_postprocess_config(device, &config, &binary->info, binary->stage, &variant->config);

   void *dest_ptr = radv_alloc_shader_memory(device, variant);
   if (!dest_ptr) {
      if (binary->type == RADV_BINARY_TYPE_RTLD)
         ac_rtld_close(&rtld_binary);
      free(variant);
      return NULL;
   }

   if (binary->type == RADV_BINARY_TYPE_RTLD) {
      struct radv_shader_binary_rtld *bin = (struct radv_shader_binary_rtld *)binary;
      struct ac_rtld_upload_info info = {
         .binary = &rtld_binary,
         .rx_va = radv_buffer_get_va(variant->bo) + variant->bo_offset,
         .rx_ptr = dest_ptr,
      };

      if (!ac_rtld_upload(&info)) {
         radv_shader_variant_destroy(device, variant);
         ac_rtld_close(&rtld_binary);
         return NULL;
      }

      if (keep_shader_info || (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS)) {
         const char *disasm_data;
         size_t disasm_size;
         if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm_data,
                                          &disasm_size)) {
            radv_shader_variant_destroy(device, variant);
            ac_rtld_close(&rtld_binary);
            return NULL;
         }

         variant->ir_string =
            bin->llvm_ir_size ? strdup((const char *)(bin->data + bin->elf_size)) : NULL;
         variant->disasm_string = malloc(disasm_size + 1);
         memcpy(variant->disasm_string, disasm_data, disasm_size);
         variant->disasm_string[disasm_size] = 0;
      }

      variant->code_ptr = dest_ptr;
      ac_rtld_close(&rtld_binary);
   } else {
      struct radv_shader_binary_legacy *bin = (struct radv_shader_binary_legacy *)binary;
      memcpy(dest_ptr, bin->data + bin->stats_size, bin->code_size);

      /* Add end-of-code markers for the UMR disassembler. */
      uint32_t *ptr32 = (uint32_t *)dest_ptr + bin->code_size / 4;
      for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++)
         ptr32[i] = DEBUGGER_END_OF_CODE_MARKER;

      variant->code_ptr = dest_ptr;
      variant->ir_string =
         bin->ir_size ? strdup((const char *)(bin->data + bin->stats_size + bin->code_size)) : NULL;
      variant->disasm_string =
         bin->disasm_size
            ? strdup((const char *)(bin->data + bin->stats_size + bin->code_size + bin->ir_size))
            : NULL;

      if (bin->stats_size) {
         variant->statistics = calloc(bin->stats_size, 1);
         memcpy(variant->statistics, bin->data, bin->stats_size);
      }
   }
   return variant;
}

static char *
radv_dump_nir_shaders(struct nir_shader *const *shaders, int shader_count)
{
   char *data = NULL;
   char *ret = NULL;
   size_t size = 0;
   struct u_memstream mem;
   if (u_memstream_open(&mem, &data, &size)) {
      FILE *const memf = u_memstream_get(&mem);
      for (int i = 0; i < shader_count; ++i)
         nir_print_shader(shaders[i], memf);
      u_memstream_close(&mem);
   }

   ret = malloc(size + 1);
   if (ret) {
      memcpy(ret, data, size);
      ret[size] = 0;
   }
   free(data);
   return ret;
}

static struct radv_shader_variant *
shader_variant_compile(struct radv_device *device, struct vk_shader_module *module,
                       struct nir_shader *const *shaders, int shader_count, gl_shader_stage stage,
                       struct radv_shader_info *info, struct radv_nir_compiler_options *options,
                       bool gs_copy_shader, bool trap_handler_shader, bool keep_shader_info,
                       bool keep_statistic_info, struct radv_shader_binary **binary_out)
{
   enum radeon_family chip_family = device->physical_device->rad_info.family;
   struct radv_shader_binary *binary = NULL;

   struct radv_shader_debug_data debug_data = {
      .device = device,
      .module = module,
   };

   options->family = chip_family;
   options->chip_class = device->physical_device->rad_info.chip_class;
   options->info = &device->physical_device->rad_info;
   options->dump_shader =
      radv_can_dump_shader(device, module, gs_copy_shader || trap_handler_shader);
   options->dump_preoptir =
      options->dump_shader && device->instance->debug_flags & RADV_DEBUG_PREOPTIR;
   options->record_ir = keep_shader_info;
   options->record_stats = keep_statistic_info;
   options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR;
   options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size;
   options->address32_hi = device->physical_device->rad_info.address32_hi;
   options->has_ls_vgpr_init_bug = device->physical_device->rad_info.has_ls_vgpr_init_bug;
   options->use_ngg_streamout = device->physical_device->use_ngg_streamout;
   options->enable_mrt_output_nan_fixup =
      module && !module->nir && device->instance->enable_mrt_output_nan_fixup;
   options->adjust_frag_coord_z = device->adjust_frag_coord_z;
   options->has_image_load_dcc_bug = device->physical_device->rad_info.has_image_load_dcc_bug;
   options->debug.func = radv_compiler_debug;
   options->debug.private_data = &debug_data;

   switch (device->force_vrs) {
   case RADV_FORCE_VRS_2x2:
      options->force_vrs_rates = (1u << 2) | (1u << 4);
      break;
   case RADV_FORCE_VRS_2x1:
      options->force_vrs_rates = (1u << 2) | (0u << 4);
      break;
   case RADV_FORCE_VRS_1x2:
      options->force_vrs_rates = (0u << 2) | (1u << 4);
      break;
   default:
      break;
   }
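
   /* force_vrs_rates uses the same HW encoding that
    * radv_lower_primitive_shading_rate() emits: rate X in bits [2:3] and
    * rate Y in bits [4:5], so e.g. 2x2 is (1u << 2) | (1u << 4). */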
   radv_declare_shader_args(
      &args, gs_copy_shader ? MESA_SHADER_VERTEX : shaders[shader_count - 1]->info.stage,
      shader_count >= 2,
      shader_count >= 2 ? shaders[shader_count - 2]->info.stage : MESA_SHADER_VERTEX);

   if (radv_use_llvm_for_stage(device, stage) || options->dump_shader || options->record_ir)
      ac_init_llvm_once();

   if (radv_use_llvm_for_stage(device, stage)) {
      llvm_compile_shader(device, shader_count, shaders, &binary, &args);
   } else {
      aco_compile_shader(shader_count, shaders, &binary, &args);
   }

   binary->info = *info;

   struct radv_shader_variant *variant =
      radv_shader_variant_create(device, binary, keep_shader_info);
   if (!variant) {
      free(binary);
      return NULL;
   }

   if (options->dump_shader) {
      fprintf(stderr, "%s", radv_get_shader_name(info, shaders[0]->info.stage));
      for (int i = 1; i < shader_count; ++i)
         fprintf(stderr, " + %s", radv_get_shader_name(info, shaders[i]->info.stage));

      fprintf(stderr, "\ndisasm:\n%s\n", variant->disasm_string);
   }

   if (keep_shader_info) {
      variant->nir_string = radv_dump_nir_shaders(shaders, shader_count);
      if (!gs_copy_shader && !trap_handler_shader && !module->nir) {
         variant->spirv = malloc(module->size);
         if (!variant->spirv) {
            free(variant);
            free(binary);
            return NULL;
         }

         memcpy(variant->spirv, module->data, module->size);
         variant->spirv_size = module->size;
      }
   }

   if (binary_out)
      *binary_out = binary;
   else
      free(binary);

   return variant;
}

struct radv_shader_variant *
radv_shader_variant_compile(struct radv_device *device, struct vk_shader_module *module,
                            struct nir_shader *const *shaders, int shader_count,
                            struct radv_pipeline_layout *layout,
                            const struct radv_shader_variant_key *key,
                            struct radv_shader_info *info, bool keep_shader_info,
                            bool keep_statistic_info, bool disable_optimizations,
                            struct radv_shader_binary **binary_out)
{
   gl_shader_stage stage = shaders[shader_count - 1]->info.stage;
   struct radv_nir_compiler_options options = {0};

   options.layout = layout;
   if (key)
      options.key = *key;

   options.explicit_scratch_args = !radv_use_llvm_for_stage(device, stage);
   options.robust_buffer_access = device->robust_buffer_access;
   options.disable_optimizations = disable_optimizations;
   options.wgp_mode = radv_should_use_wgp_mode(device, stage, info);

   return shader_variant_compile(device, module, shaders, shader_count, stage, info, &options,
                                 false, false, keep_shader_info, keep_statistic_info, binary_out);
}

struct radv_shader_variant *
radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *shader,
                           struct radv_shader_info *info, struct radv_shader_binary **binary_out,
                           bool keep_shader_info, bool keep_statistic_info, bool multiview,
                           bool disable_optimizations)
{
   struct radv_nir_compiler_options options = {0};
   gl_shader_stage stage = MESA_SHADER_VERTEX;

   options.explicit_scratch_args = !radv_use_llvm_for_stage(device, stage);
   options.key.has_multiview_view_index = multiview;
   options.disable_optimizations = disable_optimizations;

   return shader_variant_compile(device, NULL, &shader, 1, stage, info, &options, true, false,
                                 keep_shader_info, keep_statistic_info, binary_out);
}

struct radv_shader_variant *
radv_create_trap_handler_shader(struct radv_device *device)
{
   struct radv_nir_compiler_options options = {0};
   struct radv_shader_variant *shader = NULL;
   struct radv_shader_binary *binary = NULL;
   struct radv_shader_info info = {0};

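   /* The trap handler is compiled like an ordinary wave64 compute shader,
    * but from an empty NIR body; the backend is expected to emit the actual
    * handler code when is_trap_handler_shader is set on the args (see
    * shader_variant_compile above). */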
   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "meta_trap_handler");

   options.explicit_scratch_args = true;
   options.wgp_mode = radv_should_use_wgp_mode(device, MESA_SHADER_COMPUTE, &info);
   info.wave_size = 64;

   shader = shader_variant_compile(device, NULL, &b.shader, 1, MESA_SHADER_COMPUTE, &info, &options,
                                   false, true, true, false, &binary);

   ralloc_free(b.shader);
   free(binary);

   return shader;
}

void
radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_variant *variant)
{
   if (!p_atomic_dec_zero(&variant->ref_count))
      return;

   mtx_lock(&device->shader_slab_mutex);
   list_del(&variant->slab_list);
   mtx_unlock(&device->shader_slab_mutex);

   free(variant->spirv);
   free(variant->nir_string);
   free(variant->disasm_string);
   free(variant->ir_string);
   free(variant->statistics);
   free(variant);
}

const char *
radv_get_shader_name(struct radv_shader_info *info, gl_shader_stage stage)
{
   switch (stage) {
   case MESA_SHADER_VERTEX:
      if (info->vs.as_ls)
         return "Vertex Shader as LS";
      else if (info->vs.as_es)
         return "Vertex Shader as ES";
      else if (info->is_ngg)
         return "Vertex Shader as ESGS";
      else
         return "Vertex Shader as VS";
   case MESA_SHADER_TESS_CTRL:
      return "Tessellation Control Shader";
   case MESA_SHADER_TESS_EVAL:
      if (info->tes.as_es)
         return "Tessellation Evaluation Shader as ES";
      else if (info->is_ngg)
         return "Tessellation Evaluation Shader as ESGS";
      else
         return "Tessellation Evaluation Shader as VS";
   case MESA_SHADER_GEOMETRY:
      return "Geometry Shader";
   case MESA_SHADER_FRAGMENT:
      return "Pixel Shader";
   case MESA_SHADER_COMPUTE:
      return "Compute Shader";
   default:
      return "Unknown shader";
   }
}

unsigned
radv_get_max_workgroup_size(enum chip_class chip_class, gl_shader_stage stage,
                            const unsigned *sizes)
{
   switch (stage) {
   case MESA_SHADER_TESS_CTRL:
      return chip_class >= GFX7 ? 128 : 64;
   case MESA_SHADER_GEOMETRY:
      return chip_class >= GFX9 ? 128 : 64;
   case MESA_SHADER_COMPUTE:
      break;
   default:
      return 0;
   }

   unsigned max_workgroup_size = sizes[0] * sizes[1] * sizes[2];
   return max_workgroup_size;
}

unsigned
radv_get_max_waves(struct radv_device *device, struct radv_shader_variant *variant,
                   gl_shader_stage stage)
{
   struct radeon_info *info = &device->physical_device->rad_info;
   enum chip_class chip_class = info->chip_class;
   uint8_t wave_size = variant->info.wave_size;
   struct ac_shader_config *conf = &variant->config;
   unsigned max_simd_waves;
   unsigned lds_per_wave = 0;

   max_simd_waves = info->max_wave64_per_simd * (64 / wave_size);

   if (stage == MESA_SHADER_FRAGMENT) {
      lds_per_wave =
         conf->lds_size * info->lds_encode_granularity + variant->info.ps.num_interp * 48;
      lds_per_wave = align(lds_per_wave, info->lds_alloc_granularity);
   } else if (stage == MESA_SHADER_COMPUTE) {
      unsigned max_workgroup_size =
         radv_get_max_workgroup_size(chip_class, stage, variant->info.cs.block_size);
      lds_per_wave =
         align(conf->lds_size * info->lds_encode_granularity, info->lds_alloc_granularity);
      lds_per_wave /= DIV_ROUND_UP(max_workgroup_size, wave_size);
   }

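   /* Occupancy is also bounded by register usage. Before GFX10, SGPRs come
    * from a pool shared by all waves on a SIMD and are allocated in blocks
    * of 16 (GFX8+) or 8 (older chips); from GFX10 on, SGPRs no longer limit
    * the wave count. */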
   if (conf->num_sgprs && chip_class < GFX10) {
      unsigned sgprs = align(conf->num_sgprs, chip_class >= GFX8 ? 16 : 8);
      max_simd_waves = MIN2(max_simd_waves, info->num_physical_sgprs_per_simd / sgprs);
   }

   if (conf->num_vgprs) {
      unsigned physical_vgprs = info->num_physical_wave64_vgprs_per_simd * (64 / wave_size);
      unsigned vgprs = align(conf->num_vgprs, wave_size == 32 ? 8 : 4);
      if (chip_class >= GFX10_3)
         vgprs = align(vgprs, wave_size == 32 ? 16 : 8);
      max_simd_waves = MIN2(max_simd_waves, physical_vgprs / vgprs);
   }

   unsigned simd_per_workgroup = info->num_simd_per_compute_unit;
   if (chip_class >= GFX10)
      simd_per_workgroup *= 2; /* like lds_size_per_workgroup, assume WGP on GFX10+ */

   unsigned max_lds_per_simd = info->lds_size_per_workgroup / simd_per_workgroup;
   if (lds_per_wave)
      max_simd_waves = MIN2(max_simd_waves, DIV_ROUND_UP(max_lds_per_simd, lds_per_wave));

   return chip_class >= GFX10 ? max_simd_waves * (wave_size / 32) : max_simd_waves;
}

VkResult
radv_GetShaderInfoAMD(VkDevice _device, VkPipeline _pipeline, VkShaderStageFlagBits shaderStage,
                      VkShaderInfoTypeAMD infoType, size_t *pInfoSize, void *pInfo)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
   gl_shader_stage stage = vk_to_mesa_shader_stage(shaderStage);
   struct radv_shader_variant *variant = pipeline->shaders[stage];
   VkResult result = VK_SUCCESS;

   /* Spec doesn't indicate what to do if the stage is invalid, so just
    * return no info for this. */
   if (!variant)
      return vk_error(device->instance, VK_ERROR_FEATURE_NOT_PRESENT);

   switch (infoType) {
   case VK_SHADER_INFO_TYPE_STATISTICS_AMD:
      if (!pInfo) {
         *pInfoSize = sizeof(VkShaderStatisticsInfoAMD);
      } else {
         unsigned lds_multiplier = device->physical_device->rad_info.lds_encode_granularity;
         struct ac_shader_config *conf = &variant->config;

         VkShaderStatisticsInfoAMD statistics = {0};
         statistics.shaderStageMask = shaderStage;
         statistics.numPhysicalVgprs =
            device->physical_device->rad_info.num_physical_wave64_vgprs_per_simd;
         statistics.numPhysicalSgprs =
            device->physical_device->rad_info.num_physical_sgprs_per_simd;
         statistics.numAvailableSgprs = statistics.numPhysicalSgprs;

         if (stage == MESA_SHADER_COMPUTE) {
            unsigned *local_size = variant->info.cs.block_size;
            unsigned workgroup_size = local_size[0] * local_size[1] * local_size[2];

            statistics.numAvailableVgprs =
               statistics.numPhysicalVgprs /
               ceil((double)workgroup_size / statistics.numPhysicalVgprs);

            statistics.computeWorkGroupSize[0] = local_size[0];
            statistics.computeWorkGroupSize[1] = local_size[1];
            statistics.computeWorkGroupSize[2] = local_size[2];
         } else {
            statistics.numAvailableVgprs = statistics.numPhysicalVgprs;
         }

         statistics.resourceUsage.numUsedVgprs = conf->num_vgprs;
         statistics.resourceUsage.numUsedSgprs = conf->num_sgprs;
         statistics.resourceUsage.ldsSizePerLocalWorkGroup = 32768;
         statistics.resourceUsage.ldsUsageSizeInBytes = conf->lds_size * lds_multiplier;
         statistics.resourceUsage.scratchMemUsageInBytes = conf->scratch_bytes_per_wave;

         size_t size = *pInfoSize;
         *pInfoSize = sizeof(statistics);

         memcpy(pInfo, &statistics, MIN2(size, *pInfoSize));

         if (size < *pInfoSize)
            result = VK_INCOMPLETE;
      }

      break;
   case VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD: {
      char *out;
      size_t outsize;
      struct u_memstream mem;
      u_memstream_open(&mem, &out, &outsize);
      FILE *const memf = u_memstream_get(&mem);

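      /* The report is composed in a malloc'ed buffer via u_memstream, then
       * copied out following the usual Vulkan two-call convention: pInfo ==
       * NULL returns the required size, and a smaller buffer yields
       * VK_INCOMPLETE. */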
"%s:\n", radv_get_shader_name(&variant->info, stage));1939fprintf(memf, "%s\n\n", variant->ir_string);1940fprintf(memf, "%s\n\n", variant->disasm_string);1941radv_dump_shader_stats(device, pipeline, stage, memf);1942u_memstream_close(&mem);19431944/* Need to include the null terminator. */1945size_t length = outsize + 1;19461947if (!pInfo) {1948*pInfoSize = length;1949} else {1950size_t size = *pInfoSize;1951*pInfoSize = length;19521953memcpy(pInfo, out, MIN2(size, length));19541955if (size < length)1956result = VK_INCOMPLETE;1957}19581959free(out);1960break;1961}1962default:1963/* VK_SHADER_INFO_TYPE_BINARY_AMD unimplemented for now. */1964result = VK_ERROR_FEATURE_NOT_PRESENT;1965break;1966}19671968return result;1969}19701971VkResult1972radv_dump_shader_stats(struct radv_device *device, struct radv_pipeline *pipeline,1973gl_shader_stage stage, FILE *output)1974{1975struct radv_shader_variant *shader = pipeline->shaders[stage];1976VkPipelineExecutablePropertiesKHR *props = NULL;1977uint32_t prop_count = 0;1978VkResult result;19791980VkPipelineInfoKHR pipeline_info = {0};1981pipeline_info.sType = VK_STRUCTURE_TYPE_PIPELINE_INFO_KHR;1982pipeline_info.pipeline = radv_pipeline_to_handle(pipeline);19831984result = radv_GetPipelineExecutablePropertiesKHR(radv_device_to_handle(device), &pipeline_info,1985&prop_count, NULL);1986if (result != VK_SUCCESS)1987return result;19881989props = calloc(prop_count, sizeof(*props));1990if (!props)1991return VK_ERROR_OUT_OF_HOST_MEMORY;19921993result = radv_GetPipelineExecutablePropertiesKHR(radv_device_to_handle(device), &pipeline_info,1994&prop_count, props);1995if (result != VK_SUCCESS)1996goto fail;19971998for (unsigned exec_idx = 0; exec_idx < prop_count; exec_idx++) {1999if (!(props[exec_idx].stages & mesa_to_vk_shader_stage(stage)))2000continue;20012002VkPipelineExecutableStatisticKHR *stats = NULL;2003uint32_t stat_count = 0;20042005VkPipelineExecutableInfoKHR exec_info = {0};2006exec_info.pipeline = radv_pipeline_to_handle(pipeline);2007exec_info.executableIndex = exec_idx;20082009result = radv_GetPipelineExecutableStatisticsKHR(radv_device_to_handle(device), &exec_info,2010&stat_count, NULL);2011if (result != VK_SUCCESS)2012goto fail;20132014stats = calloc(stat_count, sizeof(*stats));2015if (!stats) {2016result = VK_ERROR_OUT_OF_HOST_MEMORY;2017goto fail;2018}20192020result = radv_GetPipelineExecutableStatisticsKHR(radv_device_to_handle(device), &exec_info,2021&stat_count, stats);2022if (result != VK_SUCCESS) {2023free(stats);2024goto fail;2025}20262027fprintf(output, "\n%s:\n", radv_get_shader_name(&shader->info, stage));2028fprintf(output, "*** SHADER STATS ***\n");20292030for (unsigned i = 0; i < stat_count; i++) {2031fprintf(output, "%s: ", stats[i].name);2032switch (stats[i].format) {2033case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_BOOL32_KHR:2034fprintf(output, "%s", stats[i].value.b32 == VK_TRUE ? "true" : "false");2035break;2036case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_INT64_KHR:2037fprintf(output, "%" PRIi64, stats[i].value.i64);2038break;2039case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR:2040fprintf(output, "%" PRIu64, stats[i].value.u64);2041break;2042case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_FLOAT64_KHR:2043fprintf(output, "%f", stats[i].value.f64);2044break;2045default:2046unreachable("Invalid pipeline statistic format");2047}2048fprintf(output, "\n");2049}20502051fprintf(output, "********************\n\n\n");20522053free(stats);2054}20552056fail:2057free(props);2058return result;2059}206020612062