GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/vulkan/radv_shader.c
/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "radv_shader.h"
#include "nir/nir.h"
#include "nir/nir_builder.h"
#include "spirv/nir_spirv.h"
#include "util/memstream.h"
#include "util/mesa-sha1.h"
#include "util/u_atomic.h"
#include "radv_debug.h"
#include "radv_private.h"
#include "radv_shader_args.h"
#include "radv_shader_helper.h"

#include "util/debug.h"
#include "ac_binary.h"
#include "ac_exp_param.h"
#include "ac_llvm_util.h"
#include "ac_nir.h"
#include "ac_rtld.h"
#include "aco_interface.h"
#include "sid.h"
#include "vk_format.h"

static const struct nir_shader_compiler_options nir_options = {
51
.vertex_id_zero_based = true,
52
.lower_scmp = true,
53
.lower_flrp16 = true,
54
.lower_flrp32 = true,
55
.lower_flrp64 = true,
56
.lower_device_index_to_zero = true,
57
.lower_fdiv = true,
58
.lower_fmod = true,
59
.lower_ineg = true,
60
.lower_bitfield_insert_to_bitfield_select = true,
61
.lower_bitfield_extract = true,
62
.lower_pack_snorm_2x16 = true,
63
.lower_pack_snorm_4x8 = true,
64
.lower_pack_unorm_2x16 = true,
65
.lower_pack_unorm_4x8 = true,
66
.lower_pack_half_2x16 = true,
67
.lower_pack_64_2x32 = true,
68
.lower_pack_64_4x16 = true,
69
.lower_pack_32_2x16 = true,
70
.lower_unpack_snorm_2x16 = true,
71
.lower_unpack_snorm_4x8 = true,
72
.lower_unpack_unorm_2x16 = true,
73
.lower_unpack_unorm_4x8 = true,
74
.lower_unpack_half_2x16 = true,
75
.lower_ffma16 = true,
76
.lower_ffma32 = true,
77
.lower_ffma64 = true,
78
.lower_fpow = true,
79
.lower_mul_2x32_64 = true,
80
.lower_rotate = true,
81
.has_fsub = true,
82
.has_isub = true,
83
.use_scoped_barrier = true,
84
.max_unroll_iterations = 32,
85
.max_unroll_iterations_aggressive = 128,
86
.use_interpolated_input_intrinsics = true,
87
.vectorize_vec2_16bit = true,
88
/* nir_lower_int64() isn't actually called for the LLVM backend, but
89
* this helps the loop unrolling heuristics. */
90
.lower_int64_options = nir_lower_imul64 | nir_lower_imul_high64 | nir_lower_imul_2x32_64 |
91
nir_lower_divmod64 | nir_lower_minmax64 | nir_lower_iabs64,
92
.lower_doubles_options = nir_lower_drcp | nir_lower_dsqrt | nir_lower_drsq | nir_lower_ddiv,
93
.divergence_analysis_options = nir_divergence_view_index_uniform,
94
};
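/* Note: the options above are read back by common NIR passes (and by
 * radv_optimize_nir() below, e.g. the lower_flrp* bits) to describe which
 * operations must be lowered before the ACO/LLVM backends see the IR.
 */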
95
96
bool
97
radv_can_dump_shader(struct radv_device *device, struct vk_shader_module *module,
98
bool meta_shader)
99
{
100
if (!(device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS))
101
return false;
102
if (module)
103
return !module->nir || (device->instance->debug_flags & RADV_DEBUG_DUMP_META_SHADERS);
104
105
return meta_shader;
106
}
107
108
bool
109
radv_can_dump_shader_stats(struct radv_device *device, struct vk_shader_module *module)
110
{
111
/* Only dump non-meta shader stats. */
112
return device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS && module && !module->nir;
113
}
114
115
void
116
radv_optimize_nir(const struct radv_device *device, struct nir_shader *shader,
117
bool optimize_conservatively, bool allow_copies)
118
{
119
bool progress;
120
unsigned lower_flrp = (shader->options->lower_flrp16 ? 16 : 0) |
121
(shader->options->lower_flrp32 ? 32 : 0) |
122
(shader->options->lower_flrp64 ? 64 : 0);
123
124
do {
125
progress = false;
126
127
NIR_PASS(progress, shader, nir_split_array_vars, nir_var_function_temp);
128
NIR_PASS(progress, shader, nir_shrink_vec_array_vars, nir_var_function_temp);
129
130
NIR_PASS_V(shader, nir_lower_vars_to_ssa);
131
132
if (allow_copies) {
133
/* Only run this pass in the first call to
134
* radv_optimize_nir. Later calls assume that we've
135
* lowered away any copy_deref instructions and we
136
* don't want to introduce any more.
137
*/
138
NIR_PASS(progress, shader, nir_opt_find_array_copies);
139
}
140
141
NIR_PASS(progress, shader, nir_opt_copy_prop_vars);
142
NIR_PASS(progress, shader, nir_opt_dead_write_vars);
143
NIR_PASS(progress, shader, nir_remove_dead_variables,
144
nir_var_function_temp | nir_var_shader_in | nir_var_shader_out, NULL);
145
146
NIR_PASS_V(shader, nir_lower_alu_to_scalar, NULL, NULL);
147
NIR_PASS_V(shader, nir_lower_phis_to_scalar, true);
148
149
NIR_PASS(progress, shader, nir_copy_prop);
150
NIR_PASS(progress, shader, nir_opt_remove_phis);
151
NIR_PASS(progress, shader, nir_opt_dce);
152
if (nir_opt_trivial_continues(shader)) {
153
progress = true;
154
NIR_PASS(progress, shader, nir_copy_prop);
155
NIR_PASS(progress, shader, nir_opt_remove_phis);
156
NIR_PASS(progress, shader, nir_opt_dce);
157
}
158
NIR_PASS(progress, shader, nir_opt_if, true);
159
NIR_PASS(progress, shader, nir_opt_dead_cf);
160
NIR_PASS(progress, shader, nir_opt_cse);
161
NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true, true);
162
NIR_PASS(progress, shader, nir_opt_constant_folding);
163
NIR_PASS(progress, shader, nir_opt_algebraic);
164
165
if (lower_flrp != 0) {
166
bool lower_flrp_progress = false;
167
NIR_PASS(lower_flrp_progress, shader, nir_lower_flrp, lower_flrp,
168
false /* always_precise */);
169
if (lower_flrp_progress) {
170
NIR_PASS(progress, shader, nir_opt_constant_folding);
171
progress = true;
172
}
173
174
/* Nothing should rematerialize any flrps, so we only
175
* need to do this lowering once.
176
*/
177
lower_flrp = 0;
178
}
179
180
NIR_PASS(progress, shader, nir_opt_undef);
181
NIR_PASS(progress, shader, nir_opt_shrink_vectors,
182
!device->instance->disable_shrink_image_store);
183
if (shader->options->max_unroll_iterations) {
184
NIR_PASS(progress, shader, nir_opt_loop_unroll, 0);
185
}
186
} while (progress && !optimize_conservatively);
187
188
NIR_PASS(progress, shader, nir_opt_conditional_discard);
189
NIR_PASS(progress, shader, nir_opt_move, nir_move_load_ubo);
190
}
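/* Note on radv_optimize_nir(): NIR_PASS() ORs each pass's return value into
 * `progress`, while NIR_PASS_V() runs a pass without tracking progress, so
 * the do/while loop above repeats until a full round of tracked passes makes
 * no change (or stops after a single round when optimize_conservatively is
 * set).
 */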
191
192
void
193
radv_optimize_nir_algebraic(nir_shader *nir, bool opt_offsets)
194
{
195
bool more_algebraic = true;
196
while (more_algebraic) {
197
more_algebraic = false;
198
NIR_PASS_V(nir, nir_copy_prop);
199
NIR_PASS_V(nir, nir_opt_dce);
200
NIR_PASS_V(nir, nir_opt_constant_folding);
201
NIR_PASS_V(nir, nir_opt_cse);
202
NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
203
}
204
205
if (opt_offsets)
206
NIR_PASS_V(nir, nir_opt_offsets);
207
208
/* Do late algebraic optimization to turn add(a,
209
* neg(b)) back into subs, then the mandatory cleanup
210
* after algebraic. Note that it may produce fnegs,
211
* and if so then we need to keep running to squash
212
* fneg(fneg(a)).
213
*/
214
bool more_late_algebraic = true;
215
while (more_late_algebraic) {
216
more_late_algebraic = false;
217
NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
218
NIR_PASS_V(nir, nir_opt_constant_folding);
219
NIR_PASS_V(nir, nir_copy_prop);
220
NIR_PASS_V(nir, nir_opt_dce);
221
NIR_PASS_V(nir, nir_opt_cse);
222
}
223
}
224
225
static void
226
shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
227
{
228
assert(glsl_type_is_vector_or_scalar(type));
229
230
uint32_t comp_size = glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
231
unsigned length = glsl_get_vector_elements(type);
232
*size = comp_size * length;
*align = comp_size;
233
}
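/* Example: for a vec3 of 32-bit floats this reports size = 4 * 3 = 12 and
 * align = 4; booleans are treated as 32-bit, so a bvec2 reports size = 8,
 * align = 4. This is the callback passed to nir_lower_vars_to_explicit_types()
 * for compute shared memory further below.
 */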
234
235
struct radv_shader_debug_data {
236
struct radv_device *device;
237
const struct vk_shader_module *module;
238
};
239
240
static void
241
radv_spirv_nir_debug(void *private_data, enum nir_spirv_debug_level level, size_t spirv_offset,
242
const char *message)
243
{
244
struct radv_shader_debug_data *debug_data = private_data;
245
struct radv_instance *instance = debug_data->device->instance;
246
247
static const VkDebugReportFlagsEXT vk_flags[] = {
248
[NIR_SPIRV_DEBUG_LEVEL_INFO] = VK_DEBUG_REPORT_INFORMATION_BIT_EXT,
249
[NIR_SPIRV_DEBUG_LEVEL_WARNING] = VK_DEBUG_REPORT_WARNING_BIT_EXT,
250
[NIR_SPIRV_DEBUG_LEVEL_ERROR] = VK_DEBUG_REPORT_ERROR_BIT_EXT,
251
};
252
char buffer[256];
253
254
snprintf(buffer, sizeof(buffer), "SPIR-V offset %lu: %s", (unsigned long)spirv_offset, message);
255
256
vk_debug_report(&instance->vk, vk_flags[level], &debug_data->module->base, 0, 0, "radv", buffer);
257
}
258
259
static void
260
radv_compiler_debug(void *private_data, enum radv_compiler_debug_level level, const char *message)
261
{
262
struct radv_shader_debug_data *debug_data = private_data;
263
struct radv_instance *instance = debug_data->device->instance;
264
265
static const VkDebugReportFlagsEXT vk_flags[] = {
266
[RADV_COMPILER_DEBUG_LEVEL_PERFWARN] = VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT,
267
[RADV_COMPILER_DEBUG_LEVEL_ERROR] = VK_DEBUG_REPORT_ERROR_BIT_EXT,
268
};
269
270
/* VK_DEBUG_REPORT_DEBUG_BIT_EXT specifies diagnostic information
271
* from the implementation and layers.
272
*/
273
vk_debug_report(&instance->vk, vk_flags[level] | VK_DEBUG_REPORT_DEBUG_BIT_EXT,
274
&debug_data->module->base, 0, 0, "radv", message);
275
}
276
277
static nir_ssa_def *
278
convert_pointer_to_64(nir_builder *b, const struct radv_physical_device *pdev, nir_ssa_def *ptr)
279
{
280
nir_ssa_def *comp[] = {ptr, nir_imm_int(b, pdev->rad_info.address32_hi)};
281
return nir_pack_64_2x32(b, nir_vec(b, comp, 2));
282
}
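/* In other words, this builds the 64-bit address
 *   ((uint64_t)pdev->rad_info.address32_hi << 32) | ptr
 * by packing the 32-bit pointer into the low dword and the chip's
 * address32_hi constant into the high dword.
 */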
283
284
static bool
285
lower_intrinsics(nir_shader *nir, const struct radv_pipeline_key *key,
286
const struct radv_pipeline_layout *layout, const struct radv_physical_device *pdev)
287
{
288
nir_function_impl *entry = nir_shader_get_entrypoint(nir);
289
bool progress = false;
290
nir_builder b;
291
292
nir_builder_init(&b, entry);
293
294
nir_foreach_block (block, entry) {
295
nir_foreach_instr_safe (instr, block) {
296
if (instr->type != nir_instr_type_intrinsic)
297
continue;
298
299
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
300
b.cursor = nir_before_instr(&intrin->instr);
301
302
nir_ssa_def *def = NULL;
303
switch (intrin->intrinsic) {
304
case nir_intrinsic_load_vulkan_descriptor:
305
if (nir_intrinsic_desc_type(intrin) == VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR) {
306
nir_ssa_def *addr =
307
convert_pointer_to_64(&b, pdev,
308
nir_iadd(&b, nir_channels(&b, intrin->src[0].ssa, 1),
309
nir_channels(&b, intrin->src[0].ssa, 2)));
310
311
def = nir_build_load_global(&b, 1, 64, addr, .access = ACCESS_NON_WRITEABLE,
312
.align_mul = 8, .align_offset = 0);
313
} else {
314
def = nir_vec3(&b, nir_channel(&b, intrin->src[0].ssa, 0),
315
nir_channel(&b, intrin->src[0].ssa, 1), nir_imm_int(&b, 0));
316
}
317
break;
318
case nir_intrinsic_vulkan_resource_index: {
319
unsigned desc_set = nir_intrinsic_desc_set(intrin);
320
unsigned binding = nir_intrinsic_binding(intrin);
321
struct radv_descriptor_set_layout *desc_layout = layout->set[desc_set].layout;
322
323
nir_ssa_def *new_res = nir_vulkan_resource_index(
324
&b, 3, 32, intrin->src[0].ssa, .desc_set = desc_set, .binding = binding,
325
.desc_type = nir_intrinsic_desc_type(intrin));
326
nir_ssa_def *set_ptr = nir_channel(&b, new_res, 0);
327
nir_ssa_def *binding_ptr = nir_channel(&b, new_res, 1);
328
329
nir_ssa_def *stride;
330
if (desc_layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
331
desc_layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
332
stride = nir_imm_int(&b, 16);
333
} else {
334
stride = nir_imm_int(&b, desc_layout->binding[binding].size);
335
}
336
def = nir_vec3(&b, set_ptr, binding_ptr, stride);
337
break;
338
}
339
case nir_intrinsic_vulkan_resource_reindex: {
340
nir_ssa_def *set_ptr = nir_channel(&b, intrin->src[0].ssa, 0);
341
nir_ssa_def *binding_ptr = nir_channel(&b, intrin->src[0].ssa, 1);
342
nir_ssa_def *stride = nir_channel(&b, intrin->src[0].ssa, 2);
343
binding_ptr = nir_iadd(&b, binding_ptr, nir_imul(&b, intrin->src[1].ssa, stride));
344
def = nir_vec3(&b, set_ptr, binding_ptr, stride);
345
break;
346
}
347
case nir_intrinsic_is_sparse_texels_resident:
348
def = nir_ieq_imm(&b, intrin->src[0].ssa, 0);
349
break;
350
case nir_intrinsic_sparse_residency_code_and:
351
def = nir_ior(&b, intrin->src[0].ssa, intrin->src[1].ssa);
352
break;
353
case nir_intrinsic_load_view_index:
354
if (key->has_multiview_view_index)
355
continue;
356
def = nir_imm_zero(&b, 1, 32);
357
break;
358
default:
359
continue;
360
}
361
362
nir_ssa_def_rewrite_uses(&intrin->dest.ssa, def);
363
364
nir_instr_remove(instr);
365
progress = true;
366
}
367
}
368
369
return progress;
370
}
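/* Summary of the lowering above: descriptor "pointers" are carried as a vec3
 * of (set pointer, binding offset, stride), so vulkan_resource_reindex only
 * has to advance the binding offset by index * stride, while acceleration
 * structure descriptors are instead fetched as a raw 64-bit address via a
 * global load.
 */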
371
372
static bool
373
radv_lower_primitive_shading_rate(nir_shader *nir)
374
{
375
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
376
bool progress = false;
377
378
nir_builder b;
379
nir_builder_init(&b, impl);
380
381
/* Iterate in reverse order since there should be only one deref store to PRIMITIVE_SHADING_RATE
382
* after lower_io_to_temporaries for vertex shaders.
383
*/
384
nir_foreach_block_reverse(block, impl) {
385
nir_foreach_instr_reverse(instr, block) {
386
if (instr->type != nir_instr_type_intrinsic)
387
continue;
388
389
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
390
if (intr->intrinsic != nir_intrinsic_store_deref)
391
continue;
392
393
nir_variable *var = nir_intrinsic_get_var(intr, 0);
394
if (var->data.mode != nir_var_shader_out ||
395
var->data.location != VARYING_SLOT_PRIMITIVE_SHADING_RATE)
396
continue;
397
398
b.cursor = nir_before_instr(instr);
399
400
nir_ssa_def *val = nir_ssa_for_src(&b, intr->src[1], 1);
401
402
/* x_rate = (shadingRate & (Horizontal2Pixels | Horizontal4Pixels)) ? 0x1 : 0x0; */
403
nir_ssa_def *x_rate = nir_iand(&b, val, nir_imm_int(&b, 12));
404
x_rate = nir_b2i32(&b, nir_ine(&b, x_rate, nir_imm_int(&b, 0)));
405
406
/* y_rate = (shadingRate & (Vertical2Pixels | Vertical4Pixels)) ? 0x1 : 0x0; */
407
nir_ssa_def *y_rate = nir_iand(&b, val, nir_imm_int(&b, 3));
408
y_rate = nir_b2i32(&b, nir_ine(&b, y_rate, nir_imm_int(&b, 0)));
409
410
/* Bits [2:3] = VRS rate X
411
* Bits [4:5] = VRS rate Y
412
* HW shading rate = (xRate << 2) | (yRate << 4)
413
*/
414
nir_ssa_def *out = nir_ior(&b, nir_ishl(&b, x_rate, nir_imm_int(&b, 2)),
415
nir_ishl(&b, y_rate, nir_imm_int(&b, 4)));
416
417
nir_instr_rewrite_src(&intr->instr, &intr->src[1], nir_src_for_ssa(out));
418
419
progress = true;
420
if (nir->info.stage == MESA_SHADER_VERTEX)
421
return progress;
422
}
423
}
424
425
return progress;
426
}
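/* Worked example (based on the masks above): a 4x2 fragment size is expressed
 * as Horizontal4Pixels | Vertical2Pixels = 8 | 1 = 9, which makes
 * x_rate = y_rate = 1, so the value stored to the output becomes
 * (1 << 2) | (1 << 4) = 0x14.
 */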
427
428
nir_shader *
429
radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module *module,
430
const char *entrypoint_name, gl_shader_stage stage,
431
const VkSpecializationInfo *spec_info, const VkPipelineCreateFlags flags,
432
const struct radv_pipeline_layout *layout,
433
const struct radv_pipeline_key *key)
434
{
435
unsigned subgroup_size = 64, ballot_bit_size = 64;
436
if (key->compute_subgroup_size) {
437
/* Only compute shaders currently support requiring a
438
* specific subgroup size.
439
*/
440
assert(stage == MESA_SHADER_COMPUTE);
441
subgroup_size = key->compute_subgroup_size;
442
ballot_bit_size = key->compute_subgroup_size;
443
}
444
445
nir_shader *nir;
446
447
if (module->nir) {
448
/* Some things such as our meta clear/blit code will give us a NIR
449
* shader directly. In that case, we just ignore the SPIR-V entirely
450
* and just use the NIR shader */
451
nir = module->nir;
452
nir->options = &nir_options;
453
nir_validate_shader(nir, "in internal shader");
454
455
assert(exec_list_length(&nir->functions) == 1);
456
} else {
457
uint32_t *spirv = (uint32_t *)module->data;
458
assert(module->size % 4 == 0);
459
460
if (device->instance->debug_flags & RADV_DEBUG_DUMP_SPIRV)
461
radv_print_spirv(module->data, module->size, stderr);
462
463
uint32_t num_spec_entries = 0;
464
struct nir_spirv_specialization *spec_entries = NULL;
465
if (spec_info && spec_info->mapEntryCount > 0) {
466
num_spec_entries = spec_info->mapEntryCount;
467
spec_entries = calloc(num_spec_entries, sizeof(*spec_entries));
468
for (uint32_t i = 0; i < num_spec_entries; i++) {
469
VkSpecializationMapEntry entry = spec_info->pMapEntries[i];
470
const void *data = (uint8_t *)spec_info->pData + entry.offset;
471
assert((uint8_t *)data + entry.size <=
472
(uint8_t *)spec_info->pData + spec_info->dataSize);
473
474
spec_entries[i].id = spec_info->pMapEntries[i].constantID;
475
switch (entry.size) {
476
case 8:
477
memcpy(&spec_entries[i].value.u64, data, sizeof(uint64_t));
478
break;
479
case 4:
480
memcpy(&spec_entries[i].value.u32, data, sizeof(uint32_t));
481
break;
482
case 2:
483
memcpy(&spec_entries[i].value.u16, data, sizeof(uint16_t));
484
break;
485
case 1:
486
memcpy(&spec_entries[i].value.u8, data, sizeof(uint8_t));
487
break;
488
case 0:
489
/* The Vulkan spec says:
490
*
491
* "For a constantID specialization constant declared in a shader, size must match
492
* the byte size of the constantID. If the specialization constant is of type
493
* boolean, size must be the byte size of VkBool32."
494
*
495
* Therefore, since only scalars can be decorated as specialization constants, we can
496
* assume that if it doesn't have a size of 1, 2, 4, or 8, any use in a shader would
497
* be invalid usage. The spec further says:
498
*
499
* "If a constantID value is not a specialization constant ID used in the shader,
500
* that map entry does not affect the behavior of the pipeline."
501
*
502
* so we should ignore any invalid specialization constants rather than crash or
503
* error out when we see one.
504
*/
505
break;
506
default:
507
assert(!"Invalid spec constant size");
508
break;
509
}
510
}
511
}
512
513
struct radv_shader_debug_data spirv_debug_data = {
514
.device = device,
515
.module = module,
516
};
517
const struct spirv_to_nir_options spirv_options = {
518
.caps =
519
{
520
.amd_fragment_mask = true,
521
.amd_gcn_shader = true,
522
.amd_image_gather_bias_lod = true,
523
.amd_image_read_write_lod = true,
524
.amd_shader_ballot = true,
525
.amd_shader_explicit_vertex_parameter = true,
526
.amd_trinary_minmax = true,
527
.demote_to_helper_invocation = true,
528
.derivative_group = true,
529
.descriptor_array_dynamic_indexing = true,
530
.descriptor_array_non_uniform_indexing = true,
531
.descriptor_indexing = true,
532
.device_group = true,
533
.draw_parameters = true,
534
.float_controls = true,
535
.float16 = device->physical_device->rad_info.has_packed_math_16bit,
536
.float32_atomic_add = true,
537
.float64 = true,
538
.geometry_streams = true,
539
.image_atomic_int64 = true,
540
.image_ms_array = true,
541
.image_read_without_format = true,
542
.image_write_without_format = true,
543
.int8 = true,
544
.int16 = true,
545
.int64 = true,
546
.int64_atomics = true,
547
.min_lod = true,
548
.multiview = true,
549
.physical_storage_buffer_address = true,
550
.post_depth_coverage = true,
551
.runtime_descriptor_array = true,
552
.shader_clock = true,
553
.shader_viewport_index_layer = true,
554
.sparse_residency = true,
555
.stencil_export = true,
556
.storage_8bit = true,
557
.storage_16bit = true,
558
.storage_image_ms = true,
559
.subgroup_arithmetic = true,
560
.subgroup_ballot = true,
561
.subgroup_basic = true,
562
.subgroup_quad = true,
563
.subgroup_shuffle = true,
564
.subgroup_uniform_control_flow = true,
565
.subgroup_vote = true,
566
.tessellation = true,
567
.transform_feedback = true,
568
.variable_pointers = true,
569
.vk_memory_model = true,
570
.vk_memory_model_device_scope = true,
571
.fragment_shading_rate = device->physical_device->rad_info.chip_class >= GFX10_3,
572
.workgroup_memory_explicit_layout = true,
573
},
574
.ubo_addr_format = nir_address_format_vec2_index_32bit_offset,
575
.ssbo_addr_format = nir_address_format_vec2_index_32bit_offset,
576
.phys_ssbo_addr_format = nir_address_format_64bit_global,
577
.push_const_addr_format = nir_address_format_logical,
578
.shared_addr_format = nir_address_format_32bit_offset,
579
.frag_coord_is_sysval = true,
580
.use_deref_buffer_array_length = true,
581
.debug =
582
{
583
.func = radv_spirv_nir_debug,
584
.private_data = &spirv_debug_data,
585
},
586
};
587
nir = spirv_to_nir(spirv, module->size / 4, spec_entries, num_spec_entries, stage,
588
entrypoint_name, &spirv_options, &nir_options);
589
assert(nir->info.stage == stage);
590
nir_validate_shader(nir, "after spirv_to_nir");
591
592
free(spec_entries);
593
594
/* We have to lower away local constant initializers right before we
595
* inline functions. That way they get properly initialized at the top
596
* of the function and not at the top of its caller.
597
*/
598
NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
599
NIR_PASS_V(nir, nir_lower_returns);
600
NIR_PASS_V(nir, nir_inline_functions);
601
NIR_PASS_V(nir, nir_copy_prop);
602
NIR_PASS_V(nir, nir_opt_deref);
603
604
/* Pick off the single entrypoint that we want */
605
foreach_list_typed_safe(nir_function, func, node, &nir->functions)
606
{
607
if (func->is_entrypoint)
608
func->name = ralloc_strdup(func, "main");
609
else
610
exec_node_remove(&func->node);
611
}
612
assert(exec_list_length(&nir->functions) == 1);
613
614
/* Make sure we lower constant initializers on output variables so that
615
* nir_remove_dead_variables below sees the corresponding stores
616
*/
617
NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_shader_out);
618
619
/* Now that we've deleted all but the main function, we can go ahead and
620
* lower the rest of the constant initializers.
621
*/
622
NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);
623
624
/* Split member structs. We do this before lower_io_to_temporaries so that
625
* it doesn't lower system values to temporaries by accident.
626
*/
627
NIR_PASS_V(nir, nir_split_var_copies);
628
NIR_PASS_V(nir, nir_split_per_member_structs);
629
630
if (nir->info.stage == MESA_SHADER_FRAGMENT)
631
NIR_PASS_V(nir, nir_lower_io_to_vector, nir_var_shader_out);
632
if (nir->info.stage == MESA_SHADER_FRAGMENT)
633
NIR_PASS_V(nir, nir_lower_input_attachments,
634
&(nir_input_attachment_options){
635
.use_fragcoord_sysval = true,
636
.use_layer_id_sysval = false,
637
});
638
639
NIR_PASS_V(nir, nir_remove_dead_variables,
640
nir_var_shader_in | nir_var_shader_out | nir_var_system_value | nir_var_mem_shared,
641
NULL);
642
643
/* Variables can make nir_propagate_invariant more conservative
644
* than it needs to be.
645
*/
646
NIR_PASS_V(nir, nir_lower_global_vars_to_local);
647
NIR_PASS_V(nir, nir_lower_vars_to_ssa);
648
649
NIR_PASS_V(nir, nir_propagate_invariant,
650
device->instance->debug_flags & RADV_DEBUG_INVARIANT_GEOM);
651
652
NIR_PASS_V(nir, nir_lower_system_values);
653
NIR_PASS_V(nir, nir_lower_compute_system_values, NULL);
654
655
NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays);
656
657
NIR_PASS_V(nir, nir_lower_discard_or_demote,
658
device->instance->debug_flags & RADV_DEBUG_DISCARD_TO_DEMOTE);
659
660
nir_lower_doubles_options lower_doubles = nir->options->lower_doubles_options;
661
662
if (device->physical_device->rad_info.chip_class == GFX6) {
663
/* GFX6 doesn't support v_floor_f64, and the precision of
* v_fract_f64 (which is used to implement 64-bit floor) is
* less than what Vulkan requires.
*/
667
lower_doubles |= nir_lower_dfloor;
668
}
669
670
NIR_PASS_V(nir, nir_lower_doubles, NULL, lower_doubles);
671
}
672
673
/* Vulkan uses the separate-shader linking model */
674
nir->info.separate_shader = true;
675
676
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
677
678
if (nir->info.stage == MESA_SHADER_GEOMETRY) {
679
unsigned nir_gs_flags = nir_lower_gs_intrinsics_per_stream;
680
681
if (device->physical_device->use_ngg && !radv_use_llvm_for_stage(device, stage)) {
682
/* ACO needs NIR to do some of the hard lifting */
683
nir_gs_flags |= nir_lower_gs_intrinsics_count_primitives |
684
nir_lower_gs_intrinsics_count_vertices_per_primitive |
685
nir_lower_gs_intrinsics_overwrite_incomplete;
686
}
687
688
nir_lower_gs_intrinsics(nir, nir_gs_flags);
689
}
690
691
static const nir_lower_tex_options tex_options = {
692
.lower_txp = ~0,
693
.lower_tg4_offsets = true,
694
};
695
696
nir_lower_tex(nir, &tex_options);
697
698
nir_lower_vars_to_ssa(nir);
699
700
if (nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_GEOMETRY ||
701
nir->info.stage == MESA_SHADER_FRAGMENT) {
702
NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), true, true);
703
} else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
704
NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), true, false);
705
}
706
707
nir_split_var_copies(nir);
708
709
nir_lower_global_vars_to_local(nir);
710
nir_remove_dead_variables(nir, nir_var_function_temp, NULL);
711
bool gfx7minus = device->physical_device->rad_info.chip_class <= GFX7;
712
nir_lower_subgroups(nir, &(struct nir_lower_subgroups_options){
713
.subgroup_size = subgroup_size,
714
.ballot_bit_size = ballot_bit_size,
715
.ballot_components = 1,
716
.lower_to_scalar = 1,
717
.lower_subgroup_masks = 1,
718
.lower_shuffle = 1,
719
.lower_shuffle_to_32bit = 1,
720
.lower_vote_eq = 1,
721
.lower_quad_broadcast_dynamic = 1,
722
.lower_quad_broadcast_dynamic_to_const = gfx7minus,
723
.lower_shuffle_to_swizzle_amd = 1,
724
.lower_elect = radv_use_llvm_for_stage(device, stage),
725
});
726
727
nir_lower_load_const_to_scalar(nir);
728
729
if (!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT))
730
radv_optimize_nir(device, nir, false, true);
731
732
/* call radv_nir_lower_ycbcr_textures() late as there might still be
733
* tex instructions with an undef texture/sampler before the first optimization run. */
734
NIR_PASS_V(nir, radv_nir_lower_ycbcr_textures, layout);
735
736
/* We call nir_lower_var_copies() after the first radv_optimize_nir()
737
* to remove any copies introduced by nir_opt_find_array_copies().
738
*/
739
nir_lower_var_copies(nir);
740
741
const nir_opt_access_options opt_access_options = {
742
.is_vulkan = true,
743
.infer_non_readable = true,
744
};
745
NIR_PASS_V(nir, nir_opt_access, &opt_access_options);
746
747
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_push_const, nir_address_format_32bit_offset);
748
749
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo | nir_var_mem_ssbo,
750
nir_address_format_vec2_index_32bit_offset);
751
752
NIR_PASS_V(nir, lower_intrinsics, key, layout, device->physical_device);
753
754
/* Lower deref operations for compute shared memory. */
755
if (nir->info.stage == MESA_SHADER_COMPUTE) {
756
if (!nir->info.shared_memory_explicit_layout) {
757
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared, shared_var_info);
758
}
759
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset);
760
761
if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
762
const unsigned chunk_size = 16; /* max single store size */
763
const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
764
NIR_PASS_V(nir, nir_zero_initialize_shared_memory, shared_size, chunk_size);
765
}
766
}
767
768
nir_lower_explicit_io(nir, nir_var_mem_global, nir_address_format_64bit_global);
769
770
/* Lower large variables that are always constant with load_constant
771
* intrinsics, which get turned into PC-relative loads from a data
772
* section next to the shader.
773
*/
774
NIR_PASS_V(nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
775
776
/* Lower primitive shading rate to match HW requirements. */
777
if ((nir->info.stage == MESA_SHADER_VERTEX ||
778
nir->info.stage == MESA_SHADER_GEOMETRY) &&
779
nir->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
780
NIR_PASS_V(nir, radv_lower_primitive_shading_rate);
781
}
782
783
/* Indirect lowering must be called after the radv_optimize_nir() loop
784
* has been called at least once. Otherwise indirect lowering can
785
* bloat the instruction count of the loop and cause it to be
786
* considered too large for unrolling.
787
*/
788
if (ac_nir_lower_indirect_derefs(nir, device->physical_device->rad_info.chip_class) &&
789
!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT) &&
790
nir->info.stage != MESA_SHADER_COMPUTE) {
791
/* Optimize the lowered code before the linking optimizations. */
792
radv_optimize_nir(device, nir, false, false);
793
}
794
795
return nir;
796
}
797
798
static int
799
type_size_vec4(const struct glsl_type *type, bool bindless)
800
{
801
return glsl_count_attribute_slots(type, false);
802
}
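/* This is the size callback handed to nir_lower_io() in radv_lower_io()
 * below, so driver_locations end up counted in vec4 slots (e.g. a mat4
 * counts as 4 slots).
 */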
803
804
static nir_variable *
805
find_layer_in_var(nir_shader *nir)
806
{
807
nir_variable *var = nir_find_variable_with_location(nir, nir_var_shader_in, VARYING_SLOT_LAYER);
808
if (var != NULL)
809
return var;
810
811
var = nir_variable_create(nir, nir_var_shader_in, glsl_int_type(), "layer id");
812
var->data.location = VARYING_SLOT_LAYER;
813
var->data.interpolation = INTERP_MODE_FLAT;
814
return var;
815
}
816
817
/* We use layered rendering to implement multiview, which means we need to map
818
* view_index to gl_Layer. The code generates a load from the layer_id sysval,
819
* but since we don't have a way to get at this information from the fragment
820
* shader, we also need to lower this to the gl_Layer varying. This pass
821
* lowers both to a varying load from the LAYER slot, before lowering io, so
822
* that nir_assign_var_locations() will give the LAYER varying the correct
823
* driver_location.
824
*/
825
826
static bool
827
lower_view_index(nir_shader *nir)
828
{
829
bool progress = false;
830
nir_function_impl *entry = nir_shader_get_entrypoint(nir);
831
nir_builder b;
832
nir_builder_init(&b, entry);
833
834
nir_variable *layer = NULL;
835
nir_foreach_block (block, entry) {
836
nir_foreach_instr_safe (instr, block) {
837
if (instr->type != nir_instr_type_intrinsic)
838
continue;
839
840
nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
841
if (load->intrinsic != nir_intrinsic_load_view_index)
842
continue;
843
844
if (!layer)
845
layer = find_layer_in_var(nir);
846
847
b.cursor = nir_before_instr(instr);
848
nir_ssa_def *def = nir_load_var(&b, layer);
849
nir_ssa_def_rewrite_uses(&load->dest.ssa, def);
850
851
nir_instr_remove(instr);
852
progress = true;
853
}
854
}
855
856
return progress;
857
}
858
859
void
860
radv_lower_io(struct radv_device *device, nir_shader *nir)
861
{
862
if (nir->info.stage == MESA_SHADER_COMPUTE)
863
return;
864
865
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
866
NIR_PASS_V(nir, lower_view_index);
867
nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, MESA_SHADER_FRAGMENT);
868
}
869
870
/* The RADV/LLVM backend expects 64-bit IO to be lowered. */
871
nir_lower_io_options options =
872
radv_use_llvm_for_stage(device, nir->info.stage) ? nir_lower_io_lower_64bit_to_32 : 0;
873
874
NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, type_size_vec4, options);
875
876
/* This pass needs actual constants */
877
nir_opt_constant_folding(nir);
878
879
NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | nir_var_shader_out);
880
}
881
882
bool
883
radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
884
struct radv_shader_info *info, const struct radv_pipeline_key *pl_key)
885
{
886
if (nir->info.stage == MESA_SHADER_VERTEX) {
887
if (info->vs.as_ls) {
888
ac_nir_lower_ls_outputs_to_mem(nir, info->vs.tcs_in_out_eq,
889
info->vs.tcs_temp_only_input_mask,
890
info->vs.num_linked_outputs);
891
return true;
892
} else if (info->vs.as_es) {
893
ac_nir_lower_es_outputs_to_mem(nir, device->physical_device->rad_info.chip_class,
894
info->vs.num_linked_outputs);
895
return true;
896
}
897
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
898
ac_nir_lower_hs_inputs_to_mem(nir, info->vs.tcs_in_out_eq, info->tcs.num_linked_inputs);
899
ac_nir_lower_hs_outputs_to_mem(
900
nir, device->physical_device->rad_info.chip_class, info->tcs.tes_reads_tess_factors,
901
info->tcs.tes_inputs_read, info->tcs.tes_patch_inputs_read, info->tcs.num_linked_inputs,
902
info->tcs.num_linked_outputs, info->tcs.num_linked_patch_outputs, true);
903
ac_nir_lower_tess_to_const(nir, pl_key->tess_input_vertices, info->num_tess_patches,
904
ac_nir_lower_patch_vtx_in | ac_nir_lower_num_patches);
905
906
return true;
907
} else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
908
ac_nir_lower_tes_inputs_to_mem(nir, info->tes.num_linked_inputs,
909
info->tes.num_linked_patch_inputs);
910
ac_nir_lower_tess_to_const(nir, nir->info.tess.tcs_vertices_out, info->num_tess_patches,
911
ac_nir_lower_patch_vtx_in | ac_nir_lower_num_patches);
912
913
if (info->tes.as_es) {
914
ac_nir_lower_es_outputs_to_mem(nir, device->physical_device->rad_info.chip_class,
915
info->tes.num_linked_outputs);
916
}
917
918
return true;
919
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
920
ac_nir_lower_gs_inputs_to_mem(nir, device->physical_device->rad_info.chip_class,
921
info->gs.num_linked_inputs);
922
return true;
923
}
924
925
return false;
926
}
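/* Rough summary (not exhaustive): the ac_nir_lower_*_to_mem helpers replace
 * cross-stage I/O with explicit memory accesses, e.g. LS outputs / HS inputs
 * go through LDS and ES outputs go through the ESGS ring, so the stages can
 * communicate without the legacy export path.
 */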
927
928
bool
929
radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
930
uint64_t ps_inputs_read)
931
{
932
/* Culling doesn't make sense for meta shaders. */
933
if (!!nir->info.name)
934
return false;
935
936
/* We don't support culling with multiple viewports yet. */
937
if (nir->info.outputs_written & (VARYING_BIT_VIEWPORT | VARYING_BIT_VIEWPORT_MASK))
938
return false;
939
940
/* TODO: enable by default on GFX10.3 when we're confident about performance. */
941
bool culling_enabled = device->instance->perftest_flags & RADV_PERFTEST_NGGC;
942
943
if (!culling_enabled)
944
return false;
945
946
/* Shader based culling efficiency can depend on PS throughput.
947
* Estimate an upper limit for PS input param count based on GPU info.
948
*/
949
unsigned max_ps_params;
950
unsigned max_render_backends = device->physical_device->rad_info.max_render_backends;
951
unsigned max_se = device->physical_device->rad_info.max_se;
952
953
if (max_render_backends < 2)
954
return false; /* Don't use NGG culling on 1 RB chips. */
955
else if (max_render_backends / max_se == 4)
956
max_ps_params = 6; /* Sienna Cichlid and other GFX10.3 dGPUs. */
957
else
958
max_ps_params = 4; /* Navi 1x. */
959
960
/* TODO: consider other heuristics here, such as PS execution time */
961
962
return util_bitcount64(ps_inputs_read & ~VARYING_BIT_POS) <= max_ps_params;
963
}
964
965
void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
966
struct radv_shader_info *info,
967
const struct radv_pipeline_key *pl_key,
968
struct radv_shader_variant_key *key,
969
bool consider_culling)
970
{
971
/* TODO: support the LLVM backend with the NIR lowering */
972
assert(!radv_use_llvm_for_stage(device, nir->info.stage));
973
974
assert(nir->info.stage == MESA_SHADER_VERTEX ||
975
nir->info.stage == MESA_SHADER_TESS_EVAL ||
976
nir->info.stage == MESA_SHADER_GEOMETRY);
977
978
ac_nir_ngg_config out_conf = {0};
979
const struct gfx10_ngg_info *ngg_info = &info->ngg_info;
980
unsigned num_gs_invocations = (nir->info.stage != MESA_SHADER_GEOMETRY || ngg_info->max_vert_out_per_gs_instance) ? 1 : info->gs.invocations;
981
unsigned num_vertices_per_prim = 3;
982
983
/* Get the number of vertices per input primitive */
984
if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
985
if (nir->info.tess.point_mode)
986
num_vertices_per_prim = 1;
987
else if (nir->info.tess.primitive_mode == GL_ISOLINES)
988
num_vertices_per_prim = 2;
989
990
/* Manually mark the primitive ID used, so the shader can repack it. */
991
if (key->vs_common_out.export_prim_id)
992
BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
993
994
} else if (nir->info.stage == MESA_SHADER_VERTEX) {
995
/* Need to add 1, because: V_028A6C_POINTLIST=0, V_028A6C_LINESTRIP=1, V_028A6C_TRISTRIP=2, etc. */
996
num_vertices_per_prim = key->vs.outprim + 1;
997
998
/* Manually mark the instance ID used, so the shader can repack it. */
999
if (key->vs.instance_rate_inputs)
1000
BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
1001
1002
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
1003
num_vertices_per_prim = nir->info.gs.vertices_in;
1004
} else {
1005
unreachable("NGG needs to be VS, TES or GS.");
1006
}
1007
1008
/* Invocations that process an input vertex */
1009
unsigned max_vtx_in = MIN2(256, ngg_info->enable_vertex_grouping ? ngg_info->hw_max_esverts : num_vertices_per_prim * ngg_info->max_gsprims);
1010
/* Invocations that export an output vertex */
1011
unsigned max_vtx_out = ngg_info->max_out_verts;
1012
/* Invocations that process an input primitive */
1013
unsigned max_prm_in = ngg_info->max_gsprims * num_gs_invocations;
1014
/* Invocations that produce an output primitive */
1015
unsigned max_prm_out = ngg_info->max_gsprims * num_gs_invocations * ngg_info->prim_amp_factor;
1016
1017
unsigned max_workgroup_size = MAX4(max_vtx_in, max_vtx_out, max_prm_in, max_prm_out);
1018
1019
/* Maximum HW limit for NGG workgroups */
1020
max_workgroup_size = MIN2(256, max_workgroup_size);
1021
1022
if (nir->info.stage == MESA_SHADER_VERTEX ||
1023
nir->info.stage == MESA_SHADER_TESS_EVAL) {
1024
assert(key->vs_common_out.as_ngg);
1025
1026
if (consider_culling)
1027
radv_optimize_nir_algebraic(nir, false);
1028
1029
out_conf =
1030
ac_nir_lower_ngg_nogs(
1031
nir,
1032
max_vtx_in,
1033
num_vertices_per_prim,
1034
max_workgroup_size,
1035
info->wave_size,
1036
consider_culling,
1037
key->vs_common_out.as_ngg_passthrough,
1038
key->vs_common_out.export_prim_id,
1039
key->vs.provoking_vtx_last);
1040
1041
info->has_ngg_culling = out_conf.can_cull;
1042
info->has_ngg_early_prim_export = out_conf.early_prim_export;
1043
info->num_lds_blocks_when_not_culling = DIV_ROUND_UP(out_conf.lds_bytes_if_culling_off, device->physical_device->rad_info.lds_encode_granularity);
1044
info->is_ngg_passthrough = out_conf.passthrough;
1045
key->vs_common_out.as_ngg_passthrough = out_conf.passthrough;
1046
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
1047
assert(info->is_ngg);
1048
ac_nir_lower_ngg_gs(
1049
nir, info->wave_size, max_workgroup_size,
1050
info->ngg_info.esgs_ring_size,
1051
info->gs.gsvs_vertex_size,
1052
info->ngg_info.ngg_emit_size * 4u,
1053
key->vs.provoking_vtx_last);
1054
} else {
1055
unreachable("invalid SW stage passed to radv_lower_ngg");
1056
}
1057
}
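/* Note: max_workgroup_size above is the number of invocations the NGG
 * workgroup needs so that every input vertex, output vertex, input primitive
 * and output primitive can be handled by one lane, clamped to the 256-lane
 * HW limit.
 */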
1058
1059
static void *
1060
radv_alloc_shader_memory(struct radv_device *device, struct radv_shader_variant *shader)
1061
{
1062
mtx_lock(&device->shader_slab_mutex);
1063
list_for_each_entry(struct radv_shader_slab, slab, &device->shader_slabs, slabs)
1064
{
1065
uint64_t offset = 0;
1066
1067
#ifdef __GNUC__
1068
#pragma GCC diagnostic push
1069
#pragma GCC diagnostic ignored "-Wshadow"
1070
#endif
1071
list_for_each_entry(struct radv_shader_variant, s, &slab->shaders, slab_list)
1072
{
1073
#ifdef __GNUC__
1074
#pragma GCC diagnostic pop
1075
#endif
1076
if (s->bo_offset - offset >= shader->code_size) {
1077
shader->bo = slab->bo;
1078
shader->bo_offset = offset;
1079
list_addtail(&shader->slab_list, &s->slab_list);
1080
mtx_unlock(&device->shader_slab_mutex);
1081
return slab->ptr + offset;
1082
}
1083
offset = align_u64(s->bo_offset + s->code_size, 256);
1084
}
1085
if (offset <= slab->size && slab->size - offset >= shader->code_size) {
1086
shader->bo = slab->bo;
1087
shader->bo_offset = offset;
1088
list_addtail(&shader->slab_list, &slab->shaders);
1089
mtx_unlock(&device->shader_slab_mutex);
1090
return slab->ptr + offset;
1091
}
1092
}
1093
1094
mtx_unlock(&device->shader_slab_mutex);
1095
struct radv_shader_slab *slab = calloc(1, sizeof(struct radv_shader_slab));
1096
1097
slab->size = MAX2(256 * 1024, shader->code_size);
1098
VkResult result = device->ws->buffer_create(
1099
device->ws, slab->size, 256, RADEON_DOMAIN_VRAM,
1100
RADEON_FLAG_NO_INTERPROCESS_SHARING |
1101
(device->physical_device->rad_info.cpdma_prefetch_writes_memory ? 0
1102
: RADEON_FLAG_READ_ONLY),
1103
RADV_BO_PRIORITY_SHADER, 0, &slab->bo);
1104
if (result != VK_SUCCESS) {
1105
free(slab);
1106
return NULL;
1107
}
1108
1109
slab->ptr = (char *)device->ws->buffer_map(slab->bo);
1110
if (!slab->ptr) {
1111
device->ws->buffer_destroy(device->ws, slab->bo);
1112
free(slab);
1113
return NULL;
1114
}
1115
1116
list_inithead(&slab->shaders);
1117
1118
mtx_lock(&device->shader_slab_mutex);
1119
list_add(&slab->slabs, &device->shader_slabs);
1120
1121
shader->bo = slab->bo;
1122
shader->bo_offset = 0;
1123
list_add(&shader->slab_list, &slab->shaders);
1124
mtx_unlock(&device->shader_slab_mutex);
1125
return slab->ptr;
1126
}
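/* Allocation strategy, roughly: walk the existing slabs and do a first-fit
 * search for a 256-byte aligned hole large enough for the shader; if none is
 * found, create a new slab of at least 256 KiB, map it, and place the shader
 * at offset 0.
 */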
1127
1128
void
1129
radv_destroy_shader_slabs(struct radv_device *device)
1130
{
1131
list_for_each_entry_safe(struct radv_shader_slab, slab, &device->shader_slabs, slabs)
1132
{
1133
device->ws->buffer_destroy(device->ws, slab->bo);
1134
free(slab);
1135
}
1136
mtx_destroy(&device->shader_slab_mutex);
1137
}
1138
1139
/* For the UMR disassembler. */
1140
#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
1141
#define DEBUGGER_NUM_MARKERS 5
1142
1143
static unsigned
1144
radv_get_shader_binary_size(size_t code_size)
1145
{
1146
return code_size + DEBUGGER_NUM_MARKERS * 4;
1147
}
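/* The extra DEBUGGER_NUM_MARKERS * 4 bytes leave room for the end-of-code
 * marker dwords that are appended after the uploaded code (see the legacy
 * upload path below), so UMR can tell where a shader ends.
 */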
1148
1149
static bool
1150
radv_should_use_wgp_mode(const struct radv_device *device, gl_shader_stage stage,
1151
const struct radv_shader_info *info)
1152
{
1153
enum chip_class chip = device->physical_device->rad_info.chip_class;
1154
switch (stage) {
1155
case MESA_SHADER_COMPUTE:
1156
case MESA_SHADER_TESS_CTRL:
1157
return chip >= GFX10;
1158
case MESA_SHADER_GEOMETRY:
1159
return chip == GFX10 || (chip >= GFX10_3 && !info->is_ngg);
1160
case MESA_SHADER_VERTEX:
1161
case MESA_SHADER_TESS_EVAL:
1162
return chip == GFX10 && info->is_ngg;
1163
default:
1164
return false;
1165
}
1166
}
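/* Background (simplified): on GFX10+ two CUs are grouped into a WGP and a
 * workgroup can run in either CU mode or WGP mode; the cases above pick the
 * stages/chips where WGP mode is considered safe. See also the GS comment in
 * radv_postprocess_config() about gfx10.3 hangs.
 */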
1167
1168
static void
1169
radv_postprocess_config(const struct radv_device *device, const struct ac_shader_config *config_in,
1170
const struct radv_shader_info *info, gl_shader_stage stage,
1171
struct ac_shader_config *config_out)
1172
{
1173
const struct radv_physical_device *pdevice = device->physical_device;
1174
bool scratch_enabled = config_in->scratch_bytes_per_wave > 0;
1175
bool trap_enabled = !!device->trap_handler_shader;
1176
unsigned vgpr_comp_cnt = 0;
1177
unsigned num_input_vgprs = info->num_input_vgprs;
1178
1179
if (stage == MESA_SHADER_FRAGMENT) {
1180
num_input_vgprs = ac_get_fs_input_vgpr_cnt(config_in, NULL, NULL);
1181
}
1182
1183
unsigned num_vgprs = MAX2(config_in->num_vgprs, num_input_vgprs);
1184
/* +3 for scratch wave offset and VCC */
1185
unsigned num_sgprs = MAX2(config_in->num_sgprs, info->num_input_sgprs + 3);
1186
unsigned num_shared_vgprs = config_in->num_shared_vgprs;
1187
/* shared VGPRs are introduced in Navi and are allocated in blocks of 8 (RDNA ref 3.6.5) */
1188
assert((pdevice->rad_info.chip_class >= GFX10 && num_shared_vgprs % 8 == 0) ||
1189
(pdevice->rad_info.chip_class < GFX10 && num_shared_vgprs == 0));
1190
unsigned num_shared_vgpr_blocks = num_shared_vgprs / 8;
1191
unsigned excp_en = 0;
1192
1193
*config_out = *config_in;
1194
config_out->num_vgprs = num_vgprs;
1195
config_out->num_sgprs = num_sgprs;
1196
config_out->num_shared_vgprs = num_shared_vgprs;
1197
1198
config_out->rsrc2 = S_00B12C_USER_SGPR(info->num_user_sgprs) |
1199
S_00B12C_SCRATCH_EN(scratch_enabled) | S_00B12C_TRAP_PRESENT(trap_enabled);
1200
1201
if (trap_enabled) {
1202
/* Configure the shader exceptions like memory violation, etc.
1203
* TODO: Enable (and validate) more exceptions.
1204
*/
1205
excp_en = 1 << 8; /* mem_viol */
1206
}
1207
1208
if (!pdevice->use_ngg_streamout) {
1209
config_out->rsrc2 |=
1210
S_00B12C_SO_BASE0_EN(!!info->so.strides[0]) | S_00B12C_SO_BASE1_EN(!!info->so.strides[1]) |
1211
S_00B12C_SO_BASE2_EN(!!info->so.strides[2]) | S_00B12C_SO_BASE3_EN(!!info->so.strides[3]) |
1212
S_00B12C_SO_EN(!!info->so.num_outputs);
1213
}
1214
1215
config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) / (info->wave_size == 32 ? 8 : 4)) |
1216
S_00B848_DX10_CLAMP(1) | S_00B848_FLOAT_MODE(config_out->float_mode);
1217
1218
if (pdevice->rad_info.chip_class >= GFX10) {
1219
config_out->rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(info->num_user_sgprs >> 5);
1220
} else {
1221
config_out->rsrc1 |= S_00B228_SGPRS((num_sgprs - 1) / 8);
1222
config_out->rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(info->num_user_sgprs >> 5);
1223
}
1224
1225
bool wgp_mode = radv_should_use_wgp_mode(device, stage, info);
1226
1227
switch (stage) {
1228
case MESA_SHADER_TESS_EVAL:
1229
if (info->is_ngg) {
1230
config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
1231
config_out->rsrc2 |= S_00B22C_OC_LDS_EN(1) | S_00B22C_EXCP_EN(excp_en);
1232
} else if (info->tes.as_es) {
1233
assert(pdevice->rad_info.chip_class <= GFX8);
1234
vgpr_comp_cnt = info->uses_prim_id ? 3 : 2;
1235
1236
config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1) | S_00B12C_EXCP_EN(excp_en);
1237
} else {
1238
bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id;
1239
vgpr_comp_cnt = enable_prim_id ? 3 : 2;
1240
1241
config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
1242
config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1) | S_00B12C_EXCP_EN(excp_en);
1243
}
1244
config_out->rsrc2 |= S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks);
1245
break;
1246
case MESA_SHADER_TESS_CTRL:
1247
if (pdevice->rad_info.chip_class >= GFX9) {
1248
/* We need at least 2 components for LS.
1249
* VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
1250
* StepRate0 is set to 1 so that VGPR3 doesn't have to be loaded.
1251
*/
1252
if (pdevice->rad_info.chip_class >= GFX10) {
1253
vgpr_comp_cnt = info->vs.needs_instance_id ? 3 : 1;
1254
config_out->rsrc2 |=
1255
S_00B42C_LDS_SIZE_GFX10(info->tcs.num_lds_blocks) | S_00B42C_EXCP_EN_GFX6(excp_en);
1256
} else {
1257
vgpr_comp_cnt = info->vs.needs_instance_id ? 2 : 1;
1258
config_out->rsrc2 |=
1259
S_00B42C_LDS_SIZE_GFX9(info->tcs.num_lds_blocks) | S_00B42C_EXCP_EN_GFX9(excp_en);
1260
}
1261
} else {
1262
config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1) | S_00B12C_EXCP_EN(excp_en);
1263
}
1264
config_out->rsrc1 |=
1265
S_00B428_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | S_00B428_WGP_MODE(wgp_mode);
1266
config_out->rsrc2 |= S_00B42C_SHARED_VGPR_CNT(num_shared_vgpr_blocks);
1267
break;
1268
case MESA_SHADER_VERTEX:
1269
if (info->is_ngg) {
1270
config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
1271
} else if (info->vs.as_ls) {
1272
assert(pdevice->rad_info.chip_class <= GFX8);
1273
/* We need at least 2 components for LS.
1274
* VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, InstanceID).
1275
* StepRate0 is set to 1 so that VGPR3 doesn't have to be loaded.
1276
*/
1277
vgpr_comp_cnt = info->vs.needs_instance_id ? 2 : 1;
1278
} else if (info->vs.as_es) {
1279
assert(pdevice->rad_info.chip_class <= GFX8);
1280
/* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */
1281
vgpr_comp_cnt = info->vs.needs_instance_id ? 1 : 0;
1282
} else {
1283
/* VGPR0-3: (VertexID, InstanceID / StepRate0, PrimID, InstanceID)
1284
* If PrimID is disabled, InstanceID / StepRate1 is loaded instead.
1285
* StepRate0 is set to 1 so that VGPR3 doesn't have to be loaded.
1286
*/
1287
if (info->vs.needs_instance_id && pdevice->rad_info.chip_class >= GFX10) {
1288
vgpr_comp_cnt = 3;
1289
} else if (info->vs.export_prim_id) {
1290
vgpr_comp_cnt = 2;
1291
} else if (info->vs.needs_instance_id) {
1292
vgpr_comp_cnt = 1;
1293
} else {
1294
vgpr_comp_cnt = 0;
1295
}
1296
1297
config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
1298
}
1299
config_out->rsrc2 |=
1300
S_00B12C_SHARED_VGPR_CNT(num_shared_vgpr_blocks) | S_00B12C_EXCP_EN(excp_en);
1301
break;
1302
case MESA_SHADER_FRAGMENT:
1303
config_out->rsrc1 |= S_00B028_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
1304
config_out->rsrc2 |= S_00B02C_SHARED_VGPR_CNT(num_shared_vgpr_blocks) |
1305
S_00B02C_TRAP_PRESENT(1) | S_00B02C_EXCP_EN(excp_en);
1306
break;
1307
case MESA_SHADER_GEOMETRY:
1308
config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
1309
config_out->rsrc2 |=
1310
S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks) | S_00B22C_EXCP_EN(excp_en);
1311
break;
1312
case MESA_SHADER_COMPUTE:
1313
config_out->rsrc1 |=
1314
S_00B848_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) | S_00B848_WGP_MODE(wgp_mode);
1315
config_out->rsrc2 |= S_00B84C_TGID_X_EN(info->cs.uses_block_id[0]) |
1316
S_00B84C_TGID_Y_EN(info->cs.uses_block_id[1]) |
1317
S_00B84C_TGID_Z_EN(info->cs.uses_block_id[2]) |
1318
S_00B84C_TIDIG_COMP_CNT(info->cs.uses_thread_id[2] ? 2
1319
: info->cs.uses_thread_id[1] ? 1
1320
: 0) |
1321
S_00B84C_TG_SIZE_EN(info->cs.uses_local_invocation_idx) |
1322
S_00B84C_LDS_SIZE(config_in->lds_size) | S_00B84C_EXCP_EN(excp_en);
1323
config_out->rsrc3 |= S_00B8A0_SHARED_VGPR_CNT(num_shared_vgpr_blocks);
1324
1325
break;
1326
default:
1327
unreachable("unsupported shader type");
1328
break;
1329
}
1330
1331
if (pdevice->rad_info.chip_class >= GFX10 && info->is_ngg &&
1332
(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL ||
1333
stage == MESA_SHADER_GEOMETRY)) {
1334
unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;
1335
gl_shader_stage es_stage = stage;
1336
if (stage == MESA_SHADER_GEOMETRY)
1337
es_stage = info->gs.es_type;
1338
1339
/* VGPR5-8: (VertexID, UserVGPR0, UserVGPR1, UserVGPR2 / InstanceID) */
1340
if (es_stage == MESA_SHADER_VERTEX) {
1341
es_vgpr_comp_cnt = info->vs.needs_instance_id ? 3 : 0;
1342
} else if (es_stage == MESA_SHADER_TESS_EVAL) {
1343
bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id;
1344
es_vgpr_comp_cnt = enable_prim_id ? 3 : 2;
1345
} else
1346
unreachable("Unexpected ES shader stage");
1347
1348
bool tes_triangles =
1349
stage == MESA_SHADER_TESS_EVAL && info->tes.primitive_mode >= 4; /* GL_TRIANGLES */
1350
if (info->uses_invocation_id || stage == MESA_SHADER_VERTEX) {
1351
gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
1352
} else if (info->uses_prim_id) {
1353
gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
1354
} else if (info->gs.vertices_in >= 3 || tes_triangles) {
1355
gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
1356
} else {
1357
gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
1358
}
1359
1360
/* Disable the WGP mode on gfx10.3 because it can hang (this
* happened on VanGogh). Disable it on all chips that disable
* exactly 1 CU per SA for GS.
*/
1364
config_out->rsrc1 |=
1365
S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt) | S_00B228_WGP_MODE(wgp_mode);
1366
config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
1367
S_00B22C_LDS_SIZE(config_in->lds_size) |
1368
S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL);
1369
} else if (pdevice->rad_info.chip_class >= GFX9 && stage == MESA_SHADER_GEOMETRY) {
1370
unsigned es_type = info->gs.es_type;
1371
unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;
1372
1373
if (es_type == MESA_SHADER_VERTEX) {
1374
/* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */
1375
if (info->vs.needs_instance_id) {
1376
es_vgpr_comp_cnt = pdevice->rad_info.chip_class >= GFX10 ? 3 : 1;
1377
} else {
1378
es_vgpr_comp_cnt = 0;
1379
}
1380
} else if (es_type == MESA_SHADER_TESS_EVAL) {
1381
es_vgpr_comp_cnt = info->uses_prim_id ? 3 : 2;
1382
} else {
1383
unreachable("invalid shader ES type");
1384
}
1385
1386
/* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
1387
* VGPR[0:4] are always loaded.
1388
*/
1389
if (info->uses_invocation_id) {
1390
gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
1391
} else if (info->uses_prim_id) {
1392
gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
1393
} else if (info->gs.vertices_in >= 3) {
1394
gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
1395
} else {
1396
gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
1397
}
1398
1399
config_out->rsrc1 |=
1400
S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt) | S_00B228_WGP_MODE(wgp_mode);
1401
config_out->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
1402
S_00B22C_OC_LDS_EN(es_type == MESA_SHADER_TESS_EVAL);
1403
} else if (pdevice->rad_info.chip_class >= GFX9 && stage == MESA_SHADER_TESS_CTRL) {
1404
config_out->rsrc1 |= S_00B428_LS_VGPR_COMP_CNT(vgpr_comp_cnt);
1405
} else {
1406
config_out->rsrc1 |= S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt);
1407
}
1408
}
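/* Encoding example for the rsrc1 fields above: VGPRs are encoded in
 * allocation granules, (num_vgprs - 1) / 4 for wave64 and / 8 for wave32,
 * so e.g. 37 VGPRs in wave64 become the value 9; pre-GFX10 SGPRs use a
 * granule of 8 in the same way.
 */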
1409
1410
struct radv_shader_variant *
radv_shader_variant_create(struct radv_device *device, const struct radv_shader_binary *binary,
                           bool keep_shader_info)
{
   struct ac_shader_config config = {0};
   struct ac_rtld_binary rtld_binary = {0};
   struct radv_shader_variant *variant = calloc(1, sizeof(struct radv_shader_variant));
   if (!variant)
      return NULL;

   variant->ref_count = 1;

   if (binary->type == RADV_BINARY_TYPE_RTLD) {
      struct ac_rtld_symbol lds_symbols[2];
      unsigned num_lds_symbols = 0;
      const char *elf_data = (const char *)((struct radv_shader_binary_rtld *)binary)->data;
      size_t elf_size = ((struct radv_shader_binary_rtld *)binary)->elf_size;

      if (device->physical_device->rad_info.chip_class >= GFX9 &&
          (binary->stage == MESA_SHADER_GEOMETRY || binary->info.is_ngg) &&
          !binary->is_gs_copy_shader) {
         struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
         sym->name = "esgs_ring";
         sym->size = binary->info.ngg_info.esgs_ring_size;
         sym->align = 64 * 1024;
      }

      if (binary->info.is_ngg && binary->stage == MESA_SHADER_GEOMETRY) {
         struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
         sym->name = "ngg_emit";
         sym->size = binary->info.ngg_info.ngg_emit_size * 4;
         sym->align = 4;
      }

      struct ac_rtld_open_info open_info = {
         .info = &device->physical_device->rad_info,
         .shader_type = binary->stage,
         .wave_size = binary->info.wave_size,
         .num_parts = 1,
         .elf_ptrs = &elf_data,
         .elf_sizes = &elf_size,
         .num_shared_lds_symbols = num_lds_symbols,
         .shared_lds_symbols = lds_symbols,
      };

      if (!ac_rtld_open(&rtld_binary, open_info)) {
         free(variant);
         return NULL;
      }

      if (!ac_rtld_read_config(&device->physical_device->rad_info, &rtld_binary, &config)) {
         ac_rtld_close(&rtld_binary);
         free(variant);
         return NULL;
      }

      if (rtld_binary.lds_size > 0) {
         unsigned encode_granularity = device->physical_device->rad_info.lds_encode_granularity;
         config.lds_size = align(rtld_binary.lds_size, encode_granularity) / encode_granularity;
      }
      if (!config.lds_size && binary->stage == MESA_SHADER_TESS_CTRL) {
         /* This is used for reporting LDS statistics */
         config.lds_size = binary->info.tcs.num_lds_blocks;
      }

      variant->code_size = rtld_binary.rx_size;
      variant->exec_size = rtld_binary.exec_size;
   } else {
      assert(binary->type == RADV_BINARY_TYPE_LEGACY);
      config = ((struct radv_shader_binary_legacy *)binary)->config;
      variant->code_size =
         radv_get_shader_binary_size(((struct radv_shader_binary_legacy *)binary)->code_size);
      variant->exec_size = ((struct radv_shader_binary_legacy *)binary)->exec_size;
   }

   variant->info = binary->info;
   radv_postprocess_config(device, &config, &binary->info, binary->stage, &variant->config);

   void *dest_ptr = radv_alloc_shader_memory(device, variant);
   if (!dest_ptr) {
      if (binary->type == RADV_BINARY_TYPE_RTLD)
         ac_rtld_close(&rtld_binary);
      free(variant);
      return NULL;
   }

   if (binary->type == RADV_BINARY_TYPE_RTLD) {
      struct radv_shader_binary_rtld *bin = (struct radv_shader_binary_rtld *)binary;
      struct ac_rtld_upload_info info = {
         .binary = &rtld_binary,
         .rx_va = radv_buffer_get_va(variant->bo) + variant->bo_offset,
         .rx_ptr = dest_ptr,
      };

      if (!ac_rtld_upload(&info)) {
         radv_shader_variant_destroy(device, variant);
         ac_rtld_close(&rtld_binary);
         return NULL;
      }

      if (keep_shader_info || (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS)) {
         const char *disasm_data;
         size_t disasm_size;
         if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm_data,
                                          &disasm_size)) {
            radv_shader_variant_destroy(device, variant);
            ac_rtld_close(&rtld_binary);
            return NULL;
         }

         variant->ir_string =
            bin->llvm_ir_size ? strdup((const char *)(bin->data + bin->elf_size)) : NULL;
         variant->disasm_string = malloc(disasm_size + 1);
         memcpy(variant->disasm_string, disasm_data, disasm_size);
         variant->disasm_string[disasm_size] = 0;
      }

      variant->code_ptr = dest_ptr;
      ac_rtld_close(&rtld_binary);
   } else {
      struct radv_shader_binary_legacy *bin = (struct radv_shader_binary_legacy *)binary;
      memcpy(dest_ptr, bin->data + bin->stats_size, bin->code_size);

      /* Add end-of-code markers for the UMR disassembler. */
      uint32_t *ptr32 = (uint32_t *)dest_ptr + bin->code_size / 4;
      for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++)
         ptr32[i] = DEBUGGER_END_OF_CODE_MARKER;

      variant->code_ptr = dest_ptr;
      variant->ir_string =
         bin->ir_size ? strdup((const char *)(bin->data + bin->stats_size + bin->code_size)) : NULL;
      variant->disasm_string =
         bin->disasm_size
            ? strdup((const char *)(bin->data + bin->stats_size + bin->code_size + bin->ir_size))
            : NULL;

      if (bin->stats_size) {
         variant->statistics = calloc(bin->stats_size, 1);
         memcpy(variant->statistics, bin->data, bin->stats_size);
      }
   }
   return variant;
}

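/* Illustrative usage sketch (not part of the driver): a caller that has a
 * freshly compiled radv_shader_binary pairs creation with
 * radv_shader_variant_destroy(), which drops the reference taken above.
 *
 *    struct radv_shader_variant *variant =
 *       radv_shader_variant_create(device, binary, false);
 *    if (!variant)
 *       return NULL;
 *    // ... read variant->config, variant->code_ptr, variant->exec_size ...
 *    radv_shader_variant_destroy(device, variant);
 */
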
static char *
radv_dump_nir_shaders(struct nir_shader *const *shaders, int shader_count)
{
   char *data = NULL;
   char *ret = NULL;
   size_t size = 0;
   struct u_memstream mem;
   if (u_memstream_open(&mem, &data, &size)) {
      FILE *const memf = u_memstream_get(&mem);
      for (int i = 0; i < shader_count; ++i)
         nir_print_shader(shaders[i], memf);
      u_memstream_close(&mem);
   }

   ret = malloc(size + 1);
   if (ret) {
      memcpy(ret, data, size);
      ret[size] = 0;
   }
   free(data);
   return ret;
}

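/* The u_memstream idiom above (used again in radv_GetShaderInfoAMD below)
 * captures FILE*-based printers into a heap buffer owned by the caller.
 * A minimal sketch, assuming only util/memstream.h which is already included:
 *
 *    char *buf = NULL;
 *    size_t size = 0;
 *    struct u_memstream mem;
 *    if (u_memstream_open(&mem, &buf, &size)) {
 *       fprintf(u_memstream_get(&mem), "hello");
 *       u_memstream_close(&mem);   // flushes buf/size
 *    }
 *    free(buf);
 */
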
static struct radv_shader_variant *
shader_variant_compile(struct radv_device *device, struct vk_shader_module *module,
                       struct nir_shader *const *shaders, int shader_count, gl_shader_stage stage,
                       struct radv_shader_info *info, struct radv_nir_compiler_options *options,
                       bool gs_copy_shader, bool trap_handler_shader, bool keep_shader_info,
                       bool keep_statistic_info, struct radv_shader_binary **binary_out)
{
   enum radeon_family chip_family = device->physical_device->rad_info.family;
   struct radv_shader_binary *binary = NULL;

   struct radv_shader_debug_data debug_data = {
      .device = device,
      .module = module,
   };

   options->family = chip_family;
   options->chip_class = device->physical_device->rad_info.chip_class;
   options->info = &device->physical_device->rad_info;
   options->dump_shader = radv_can_dump_shader(device, module, gs_copy_shader || trap_handler_shader);
   options->dump_preoptir =
      options->dump_shader && device->instance->debug_flags & RADV_DEBUG_PREOPTIR;
   options->record_ir = keep_shader_info;
   options->record_stats = keep_statistic_info;
   options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR;
   options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size;
   options->address32_hi = device->physical_device->rad_info.address32_hi;
   options->has_ls_vgpr_init_bug = device->physical_device->rad_info.has_ls_vgpr_init_bug;
   options->use_ngg_streamout = device->physical_device->use_ngg_streamout;
   options->enable_mrt_output_nan_fixup =
      module && !module->nir && device->instance->enable_mrt_output_nan_fixup;
   options->adjust_frag_coord_z = device->adjust_frag_coord_z;
   options->has_image_load_dcc_bug = device->physical_device->rad_info.has_image_load_dcc_bug;
   options->debug.func = radv_compiler_debug;
   options->debug.private_data = &debug_data;

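   /* Note on the encoding below (descriptive comment, no behavioral change):
    * bits [3:2] of force_vrs_rates hold log2 of the forced X (width) rate and
    * bits [5:4] hold log2 of the Y (height) rate, so 2x2 sets both fields to 1
    * while 2x1 and 1x2 set only one of them. */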
   switch (device->force_vrs) {
   case RADV_FORCE_VRS_2x2:
      options->force_vrs_rates = (1u << 2) | (1u << 4);
      break;
   case RADV_FORCE_VRS_2x1:
      options->force_vrs_rates = (1u << 2) | (0u << 4);
      break;
   case RADV_FORCE_VRS_1x2:
      options->force_vrs_rates = (0u << 2) | (1u << 4);
      break;
   default:
      break;
   }

   struct radv_shader_args args = {0};
   args.options = options;
   args.shader_info = info;
   args.is_gs_copy_shader = gs_copy_shader;
   args.is_trap_handler_shader = trap_handler_shader;

   radv_declare_shader_args(
      &args, gs_copy_shader ? MESA_SHADER_VERTEX : shaders[shader_count - 1]->info.stage,
      shader_count >= 2,
      shader_count >= 2 ? shaders[shader_count - 2]->info.stage : MESA_SHADER_VERTEX);

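   /* Backend selection (descriptive comment): ACO is the default compiler
    * below; the LLVM path is only taken when radv_use_llvm_for_stage() says so
    * (e.g. with RADV_DEBUG=llvm). LLVM is also initialized when dumping or
    * recording IR, since disassembly can go through LLVM in that case. */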
   if (radv_use_llvm_for_stage(device, stage) || options->dump_shader || options->record_ir)
      ac_init_llvm_once();

   if (radv_use_llvm_for_stage(device, stage)) {
      llvm_compile_shader(device, shader_count, shaders, &binary, &args);
   } else {
      aco_compile_shader(shader_count, shaders, &binary, &args);
   }

   binary->info = *info;

   struct radv_shader_variant *variant =
      radv_shader_variant_create(device, binary, keep_shader_info);
   if (!variant) {
      free(binary);
      return NULL;
   }

   if (options->dump_shader) {
      fprintf(stderr, "%s", radv_get_shader_name(info, shaders[0]->info.stage));
      for (int i = 1; i < shader_count; ++i)
         fprintf(stderr, " + %s", radv_get_shader_name(info, shaders[i]->info.stage));

      fprintf(stderr, "\ndisasm:\n%s\n", variant->disasm_string);
   }

   if (keep_shader_info) {
      variant->nir_string = radv_dump_nir_shaders(shaders, shader_count);
      if (!gs_copy_shader && !trap_handler_shader && !module->nir) {
         variant->spirv = malloc(module->size);
         if (!variant->spirv) {
            free(variant);
            free(binary);
            return NULL;
         }

         memcpy(variant->spirv, module->data, module->size);
         variant->spirv_size = module->size;
      }
   }

   if (binary_out)
      *binary_out = binary;
   else
      free(binary);

   return variant;
}

struct radv_shader_variant *
radv_shader_variant_compile(struct radv_device *device, struct vk_shader_module *module,
                            struct nir_shader *const *shaders, int shader_count,
                            struct radv_pipeline_layout *layout,
                            const struct radv_shader_variant_key *key,
                            struct radv_shader_info *info, bool keep_shader_info,
                            bool keep_statistic_info, bool disable_optimizations,
                            struct radv_shader_binary **binary_out)
{
   gl_shader_stage stage = shaders[shader_count - 1]->info.stage;
   struct radv_nir_compiler_options options = {0};

   options.layout = layout;
   if (key)
      options.key = *key;

   options.explicit_scratch_args = !radv_use_llvm_for_stage(device, stage);
   options.robust_buffer_access = device->robust_buffer_access;
   options.disable_optimizations = disable_optimizations;
   options.wgp_mode = radv_should_use_wgp_mode(device, stage, info);

   return shader_variant_compile(device, module, shaders, shader_count, stage, info, &options,
                                 false, false, keep_shader_info, keep_statistic_info, binary_out);
}

struct radv_shader_variant *
radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *shader,
                           struct radv_shader_info *info, struct radv_shader_binary **binary_out,
                           bool keep_shader_info, bool keep_statistic_info, bool multiview,
                           bool disable_optimizations)
{
   struct radv_nir_compiler_options options = {0};
   gl_shader_stage stage = MESA_SHADER_VERTEX;

   options.explicit_scratch_args = !radv_use_llvm_for_stage(device, stage);
   options.key.has_multiview_view_index = multiview;
   options.disable_optimizations = disable_optimizations;

   return shader_variant_compile(device, NULL, &shader, 1, stage, info, &options, true, false,
                                 keep_shader_info, keep_statistic_info, binary_out);
}

struct radv_shader_variant *
radv_create_trap_handler_shader(struct radv_device *device)
{
   struct radv_nir_compiler_options options = {0};
   struct radv_shader_variant *shader = NULL;
   struct radv_shader_binary *binary = NULL;
   struct radv_shader_info info = {0};

   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "meta_trap_handler");

   options.explicit_scratch_args = true;
   options.wgp_mode = radv_should_use_wgp_mode(device, MESA_SHADER_COMPUTE, &info);
   info.wave_size = 64;

   shader = shader_variant_compile(device, NULL, &b.shader, 1, MESA_SHADER_COMPUTE, &info, &options,
                                   false, true, true, false, &binary);

   ralloc_free(b.shader);
   free(binary);

   return shader;
}

void
radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_variant *variant)
{
   if (!p_atomic_dec_zero(&variant->ref_count))
      return;

   mtx_lock(&device->shader_slab_mutex);
   list_del(&variant->slab_list);
   mtx_unlock(&device->shader_slab_mutex);

   free(variant->spirv);
   free(variant->nir_string);
   free(variant->disasm_string);
   free(variant->ir_string);
   free(variant->statistics);
   free(variant);
}

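/* Descriptive note: destruction is reference counted. Code elsewhere in the
 * driver that shares a variant takes an extra reference by atomically
 * incrementing ref_count (p_atomic_inc), and the frees above only run once
 * the last radv_shader_variant_destroy() call drops ref_count to zero. */
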
const char *
radv_get_shader_name(struct radv_shader_info *info, gl_shader_stage stage)
{
   switch (stage) {
   case MESA_SHADER_VERTEX:
      if (info->vs.as_ls)
         return "Vertex Shader as LS";
      else if (info->vs.as_es)
         return "Vertex Shader as ES";
      else if (info->is_ngg)
         return "Vertex Shader as ESGS";
      else
         return "Vertex Shader as VS";
   case MESA_SHADER_TESS_CTRL:
      return "Tessellation Control Shader";
   case MESA_SHADER_TESS_EVAL:
      if (info->tes.as_es)
         return "Tessellation Evaluation Shader as ES";
      else if (info->is_ngg)
         return "Tessellation Evaluation Shader as ESGS";
      else
         return "Tessellation Evaluation Shader as VS";
   case MESA_SHADER_GEOMETRY:
      return "Geometry Shader";
   case MESA_SHADER_FRAGMENT:
      return "Pixel Shader";
   case MESA_SHADER_COMPUTE:
      return "Compute Shader";
   default:
      return "Unknown shader";
   };
}

unsigned
radv_get_max_workgroup_size(enum chip_class chip_class, gl_shader_stage stage,
                            const unsigned *sizes)
{
   switch (stage) {
   case MESA_SHADER_TESS_CTRL:
      return chip_class >= GFX7 ? 128 : 64;
   case MESA_SHADER_GEOMETRY:
      return chip_class >= GFX9 ? 128 : 64;
   case MESA_SHADER_COMPUTE:
      break;
   default:
      return 0;
   }

   unsigned max_workgroup_size = sizes[0] * sizes[1] * sizes[2];
   return max_workgroup_size;
}

unsigned
radv_get_max_waves(struct radv_device *device, struct radv_shader_variant *variant,
                   gl_shader_stage stage)
{
   struct radeon_info *info = &device->physical_device->rad_info;
   enum chip_class chip_class = info->chip_class;
   uint8_t wave_size = variant->info.wave_size;
   struct ac_shader_config *conf = &variant->config;
   unsigned max_simd_waves;
   unsigned lds_per_wave = 0;

   max_simd_waves = info->max_wave64_per_simd * (64 / wave_size);

   if (stage == MESA_SHADER_FRAGMENT) {
      lds_per_wave =
         conf->lds_size * info->lds_encode_granularity + variant->info.ps.num_interp * 48;
      lds_per_wave = align(lds_per_wave, info->lds_alloc_granularity);
   } else if (stage == MESA_SHADER_COMPUTE) {
      unsigned max_workgroup_size =
         radv_get_max_workgroup_size(chip_class, stage, variant->info.cs.block_size);
      lds_per_wave =
         align(conf->lds_size * info->lds_encode_granularity, info->lds_alloc_granularity);
      lds_per_wave /= DIV_ROUND_UP(max_workgroup_size, wave_size);
   }

   if (conf->num_sgprs && chip_class < GFX10) {
      unsigned sgprs = align(conf->num_sgprs, chip_class >= GFX8 ? 16 : 8);
      max_simd_waves = MIN2(max_simd_waves, info->num_physical_sgprs_per_simd / sgprs);
   }

   if (conf->num_vgprs) {
      unsigned physical_vgprs = info->num_physical_wave64_vgprs_per_simd * (64 / wave_size);
      unsigned vgprs = align(conf->num_vgprs, wave_size == 32 ? 8 : 4);
      if (chip_class >= GFX10_3)
         vgprs = align(vgprs, wave_size == 32 ? 16 : 8);
      max_simd_waves = MIN2(max_simd_waves, physical_vgprs / vgprs);
   }

   unsigned simd_per_workgroup = info->num_simd_per_compute_unit;
   if (chip_class >= GFX10)
      simd_per_workgroup *= 2; /* like lds_size_per_workgroup, assume WGP on GFX10+ */

   unsigned max_lds_per_simd = info->lds_size_per_workgroup / simd_per_workgroup;
   if (lds_per_wave)
      max_simd_waves = MIN2(max_simd_waves, DIV_ROUND_UP(max_lds_per_simd, lds_per_wave));

   return chip_class >= GFX10 ? max_simd_waves * (wave_size / 32) : max_simd_waves;
}

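/* Worked example for the occupancy computation above (illustrative, with
 * assumed inputs): a GFX9 wave64 compute shader with an 8x8x1 workgroup,
 * 32 SGPRs, 96 VGPRs and no LDS starts from max_wave64_per_simd = 10 waves.
 * SGPRs allow 800 / align(32, 16) = 25 waves, but VGPRs allow only
 * 256 / align(96, 4) = 2 waves, so the function returns 2 waves per SIMD. */
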
VkResult
radv_GetShaderInfoAMD(VkDevice _device, VkPipeline _pipeline, VkShaderStageFlagBits shaderStage,
                      VkShaderInfoTypeAMD infoType, size_t *pInfoSize, void *pInfo)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
   gl_shader_stage stage = vk_to_mesa_shader_stage(shaderStage);
   struct radv_shader_variant *variant = pipeline->shaders[stage];
   VkResult result = VK_SUCCESS;

   /* Spec doesn't indicate what to do if the stage is invalid, so just
    * return no info for this. */
   if (!variant)
      return vk_error(device->instance, VK_ERROR_FEATURE_NOT_PRESENT);

   switch (infoType) {
   case VK_SHADER_INFO_TYPE_STATISTICS_AMD:
      if (!pInfo) {
         *pInfoSize = sizeof(VkShaderStatisticsInfoAMD);
      } else {
         unsigned lds_multiplier = device->physical_device->rad_info.lds_encode_granularity;
         struct ac_shader_config *conf = &variant->config;

         VkShaderStatisticsInfoAMD statistics = {0};
         statistics.shaderStageMask = shaderStage;
         statistics.numPhysicalVgprs =
            device->physical_device->rad_info.num_physical_wave64_vgprs_per_simd;
         statistics.numPhysicalSgprs =
            device->physical_device->rad_info.num_physical_sgprs_per_simd;
         statistics.numAvailableSgprs = statistics.numPhysicalSgprs;

         if (stage == MESA_SHADER_COMPUTE) {
            unsigned *local_size = variant->info.cs.block_size;
            unsigned workgroup_size = local_size[0] * local_size[1] * local_size[2];

            statistics.numAvailableVgprs =
               statistics.numPhysicalVgprs /
               ceil((double)workgroup_size / statistics.numPhysicalVgprs);

            statistics.computeWorkGroupSize[0] = local_size[0];
            statistics.computeWorkGroupSize[1] = local_size[1];
            statistics.computeWorkGroupSize[2] = local_size[2];
         } else {
            statistics.numAvailableVgprs = statistics.numPhysicalVgprs;
         }

         statistics.resourceUsage.numUsedVgprs = conf->num_vgprs;
         statistics.resourceUsage.numUsedSgprs = conf->num_sgprs;
         statistics.resourceUsage.ldsSizePerLocalWorkGroup = 32768;
         statistics.resourceUsage.ldsUsageSizeInBytes = conf->lds_size * lds_multiplier;
         statistics.resourceUsage.scratchMemUsageInBytes = conf->scratch_bytes_per_wave;

         size_t size = *pInfoSize;
         *pInfoSize = sizeof(statistics);

         memcpy(pInfo, &statistics, MIN2(size, *pInfoSize));

         if (size < *pInfoSize)
            result = VK_INCOMPLETE;
      }

      break;
   case VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD: {
      char *out;
      size_t outsize;
      struct u_memstream mem;
      u_memstream_open(&mem, &out, &outsize);
      FILE *const memf = u_memstream_get(&mem);

      fprintf(memf, "%s:\n", radv_get_shader_name(&variant->info, stage));
      fprintf(memf, "%s\n\n", variant->ir_string);
      fprintf(memf, "%s\n\n", variant->disasm_string);
      radv_dump_shader_stats(device, pipeline, stage, memf);
      u_memstream_close(&mem);

      /* Need to include the null terminator. */
      size_t length = outsize + 1;

      if (!pInfo) {
         *pInfoSize = length;
      } else {
         size_t size = *pInfoSize;
         *pInfoSize = length;

         memcpy(pInfo, out, MIN2(size, length));

         if (size < length)
            result = VK_INCOMPLETE;
      }

      free(out);
      break;
   }
   default:
      /* VK_SHADER_INFO_TYPE_BINARY_AMD unimplemented for now. */
      result = VK_ERROR_FEATURE_NOT_PRESENT;
      break;
   }

   return result;
}

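/* Application-side view of the two-call idiom implemented above (illustrative
 * sketch using the VK_AMD_shader_info entry point, not driver code):
 *
 *    size_t size = 0;
 *    vkGetShaderInfoAMD(device, pipeline, VK_SHADER_STAGE_FRAGMENT_BIT,
 *                       VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD, &size, NULL);
 *    char *text = malloc(size);
 *    VkResult result =
 *       vkGetShaderInfoAMD(device, pipeline, VK_SHADER_STAGE_FRAGMENT_BIT,
 *                          VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD, &size, text);
 *    // result == VK_INCOMPLETE if the buffer was smaller than needed.
 */
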
VkResult
radv_dump_shader_stats(struct radv_device *device, struct radv_pipeline *pipeline,
                       gl_shader_stage stage, FILE *output)
{
   struct radv_shader_variant *shader = pipeline->shaders[stage];
   VkPipelineExecutablePropertiesKHR *props = NULL;
   uint32_t prop_count = 0;
   VkResult result;

   VkPipelineInfoKHR pipeline_info = {0};
   pipeline_info.sType = VK_STRUCTURE_TYPE_PIPELINE_INFO_KHR;
   pipeline_info.pipeline = radv_pipeline_to_handle(pipeline);

   result = radv_GetPipelineExecutablePropertiesKHR(radv_device_to_handle(device), &pipeline_info,
                                                    &prop_count, NULL);
   if (result != VK_SUCCESS)
      return result;

   props = calloc(prop_count, sizeof(*props));
   if (!props)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   result = radv_GetPipelineExecutablePropertiesKHR(radv_device_to_handle(device), &pipeline_info,
                                                    &prop_count, props);
   if (result != VK_SUCCESS)
      goto fail;

   for (unsigned exec_idx = 0; exec_idx < prop_count; exec_idx++) {
      if (!(props[exec_idx].stages & mesa_to_vk_shader_stage(stage)))
         continue;

      VkPipelineExecutableStatisticKHR *stats = NULL;
      uint32_t stat_count = 0;

      VkPipelineExecutableInfoKHR exec_info = {0};
      exec_info.pipeline = radv_pipeline_to_handle(pipeline);
      exec_info.executableIndex = exec_idx;

      result = radv_GetPipelineExecutableStatisticsKHR(radv_device_to_handle(device), &exec_info,
                                                       &stat_count, NULL);
      if (result != VK_SUCCESS)
         goto fail;

      stats = calloc(stat_count, sizeof(*stats));
      if (!stats) {
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
         goto fail;
      }

      result = radv_GetPipelineExecutableStatisticsKHR(radv_device_to_handle(device), &exec_info,
                                                       &stat_count, stats);
      if (result != VK_SUCCESS) {
         free(stats);
         goto fail;
      }

      fprintf(output, "\n%s:\n", radv_get_shader_name(&shader->info, stage));
      fprintf(output, "*** SHADER STATS ***\n");

      for (unsigned i = 0; i < stat_count; i++) {
         fprintf(output, "%s: ", stats[i].name);
         switch (stats[i].format) {
         case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_BOOL32_KHR:
            fprintf(output, "%s", stats[i].value.b32 == VK_TRUE ? "true" : "false");
            break;
         case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_INT64_KHR:
            fprintf(output, "%" PRIi64, stats[i].value.i64);
            break;
         case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR:
            fprintf(output, "%" PRIu64, stats[i].value.u64);
            break;
         case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_FLOAT64_KHR:
            fprintf(output, "%f", stats[i].value.f64);
            break;
         default:
            unreachable("Invalid pipeline statistic format");
         }
         fprintf(output, "\n");
      }

      fprintf(output, "********************\n\n\n");

      free(stats);
   }

fail:
   free(props);
   return result;
}
