GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/vulkan/radv_pipeline.c
/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir/nir.h"
#include "nir/nir_builder.h"
#include "nir/nir_xfb_info.h"
#include "spirv/nir_spirv.h"
#include "util/disk_cache.h"
#include "util/mesa-sha1.h"
#include "util/u_atomic.h"
#include "radv_cs.h"
#include "radv_debug.h"
#include "radv_private.h"
#include "radv_shader.h"
#include "vk_util.h"

#include "util/debug.h"
#include "ac_binary.h"
#include "ac_exp_param.h"
#include "ac_nir.h"
#include "ac_shader_util.h"
#include "aco_interface.h"
#include "sid.h"
#include "vk_format.h"

struct radv_blend_state {
   uint32_t blend_enable_4bit;
   uint32_t need_src_alpha;

   uint32_t cb_target_mask;
   uint32_t cb_target_enabled_4bit;
   uint32_t sx_mrt_blend_opt[8];
   uint32_t cb_blend_control[8];

   uint32_t spi_shader_col_format;
   uint32_t col_format_is_int8;
   uint32_t col_format_is_int10;
   uint32_t cb_shader_mask;
   uint32_t db_alpha_to_mask;

   uint32_t commutative_4bit;

   bool single_cb_enable;
   bool mrt0_is_dual_src;
};

struct radv_dsa_order_invariance {
   /* Whether the final result in Z/S buffers is guaranteed to be
    * invariant under changes to the order in which fragments arrive.
    */
   bool zs;

   /* Whether the set of fragments that pass the combined Z/S test is
    * guaranteed to be invariant under changes to the order in which
    * fragments arrive.
    */
   bool pass_set;
};

static bool
radv_is_state_dynamic(const VkGraphicsPipelineCreateInfo *pCreateInfo, VkDynamicState state)
{
   if (pCreateInfo->pDynamicState) {
      uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
      for (uint32_t i = 0; i < count; i++) {
         if (pCreateInfo->pDynamicState->pDynamicStates[i] == state)
            return true;
      }
   }

   return false;
}

static const VkPipelineMultisampleStateCreateInfo *
radv_pipeline_get_multisample_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   if (!pCreateInfo->pRasterizationState->rasterizerDiscardEnable ||
       radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT))
      return pCreateInfo->pMultisampleState;
   return NULL;
}

static const VkPipelineTessellationStateCreateInfo *
radv_pipeline_get_tessellation_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
      if (pCreateInfo->pStages[i].stage == VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT ||
          pCreateInfo->pStages[i].stage == VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) {
         return pCreateInfo->pTessellationState;
      }
   }
   return NULL;
}

static const VkPipelineDepthStencilStateCreateInfo *
radv_pipeline_get_depth_stencil_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
   struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;

   if ((!pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
        subpass->depth_stencil_attachment) ||
       radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT))
      return pCreateInfo->pDepthStencilState;
   return NULL;
}

static const VkPipelineColorBlendStateCreateInfo *
radv_pipeline_get_color_blend_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
   struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;

   if ((!pCreateInfo->pRasterizationState->rasterizerDiscardEnable && subpass->has_color_att) ||
       radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT))
      return pCreateInfo->pColorBlendState;
   return NULL;
}

static bool
radv_pipeline_has_ngg(const struct radv_pipeline *pipeline)
{
   if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_NONE)
      return false;

   struct radv_shader_variant *variant =
      pipeline->shaders[pipeline->graphics.last_vgt_api_stage];

   return variant->info.is_ngg;
}

bool
radv_pipeline_has_ngg_passthrough(const struct radv_pipeline *pipeline)
{
   if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_NONE)
      return false;

   assert(radv_pipeline_has_ngg(pipeline));

   struct radv_shader_variant *variant =
      pipeline->shaders[pipeline->graphics.last_vgt_api_stage];

   return variant->info.is_ngg_passthrough;
}

bool
radv_pipeline_has_gs_copy_shader(const struct radv_pipeline *pipeline)
{
   return !!pipeline->gs_copy_shader;
}

static void
radv_pipeline_destroy(struct radv_device *device, struct radv_pipeline *pipeline,
                      const VkAllocationCallbacks *allocator)
{
   for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i)
      if (pipeline->shaders[i])
         radv_shader_variant_destroy(device, pipeline->shaders[i]);

   if (pipeline->gs_copy_shader)
      radv_shader_variant_destroy(device, pipeline->gs_copy_shader);

   if (pipeline->cs.buf)
      free(pipeline->cs.buf);

   vk_object_base_finish(&pipeline->base);
   vk_free2(&device->vk.alloc, allocator, pipeline);
}

void
radv_DestroyPipeline(VkDevice _device, VkPipeline _pipeline,
                     const VkAllocationCallbacks *pAllocator)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);

   if (!_pipeline)
      return;

   radv_pipeline_destroy(device, pipeline, pAllocator);
}

static uint32_t
get_hash_flags(const struct radv_device *device, bool stats)
{
   uint32_t hash_flags = 0;

   if (device->instance->debug_flags & RADV_DEBUG_NO_NGG)
      hash_flags |= RADV_HASH_SHADER_NO_NGG;
   if (device->instance->perftest_flags & RADV_PERFTEST_NGGC)
      hash_flags |= RADV_HASH_SHADER_FORCE_NGG_CULLING;
   if (device->physical_device->cs_wave_size == 32)
      hash_flags |= RADV_HASH_SHADER_CS_WAVE32;
   if (device->physical_device->ps_wave_size == 32)
      hash_flags |= RADV_HASH_SHADER_PS_WAVE32;
   if (device->physical_device->ge_wave_size == 32)
      hash_flags |= RADV_HASH_SHADER_GE_WAVE32;
   if (device->physical_device->use_llvm)
      hash_flags |= RADV_HASH_SHADER_LLVM;
   if (device->instance->debug_flags & RADV_DEBUG_DISCARD_TO_DEMOTE)
      hash_flags |= RADV_HASH_SHADER_DISCARD_TO_DEMOTE;
   if (device->instance->enable_mrt_output_nan_fixup)
      hash_flags |= RADV_HASH_SHADER_MRT_NAN_FIXUP;
   if (device->instance->debug_flags & RADV_DEBUG_INVARIANT_GEOM)
      hash_flags |= RADV_HASH_SHADER_INVARIANT_GEOM;
   if (stats)
      hash_flags |= RADV_HASH_SHADER_KEEP_STATISTICS;
   if (device->force_vrs == RADV_FORCE_VRS_2x2)
      hash_flags |= RADV_HASH_SHADER_FORCE_VRS_2x2;
   if (device->force_vrs == RADV_FORCE_VRS_2x1)
      hash_flags |= RADV_HASH_SHADER_FORCE_VRS_2x1;
   if (device->force_vrs == RADV_FORCE_VRS_1x2)
      hash_flags |= RADV_HASH_SHADER_FORCE_VRS_1x2;
   return hash_flags;
}

static void
radv_pipeline_init_scratch(const struct radv_device *device, struct radv_pipeline *pipeline)
{
   unsigned scratch_bytes_per_wave = 0;
   unsigned max_waves = 0;
   unsigned min_waves = 1;

   for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
      if (pipeline->shaders[i] && pipeline->shaders[i]->config.scratch_bytes_per_wave) {
         unsigned max_stage_waves = device->scratch_waves;

         scratch_bytes_per_wave =
            MAX2(scratch_bytes_per_wave, pipeline->shaders[i]->config.scratch_bytes_per_wave);

         max_stage_waves =
            MIN2(max_stage_waves, 4 * device->physical_device->rad_info.num_good_compute_units *
                                     (256 / pipeline->shaders[i]->config.num_vgprs));
         max_waves = MAX2(max_waves, max_stage_waves);
      }
   }

   if (pipeline->shaders[MESA_SHADER_COMPUTE]) {
      unsigned group_size = pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[0] *
                            pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[1] *
                            pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[2];
      min_waves = MAX2(min_waves, round_up_u32(group_size, 64));
   }

   pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave;
   pipeline->max_waves = max_waves;
}

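/* Worked example (illustrative, not part of the upstream file): the per-stage
 * wave cap above is 4 * num_good_compute_units * (256 / num_vgprs). On a
 * hypothetical GPU with 40 compute units and a shader using 64 VGPRs per
 * wave, that is 4 * 40 * (256 / 64) = 640 scratch waves for that stage.
 */
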
static uint32_t
si_translate_blend_function(VkBlendOp op)
{
   switch (op) {
   case VK_BLEND_OP_ADD:
      return V_028780_COMB_DST_PLUS_SRC;
   case VK_BLEND_OP_SUBTRACT:
      return V_028780_COMB_SRC_MINUS_DST;
   case VK_BLEND_OP_REVERSE_SUBTRACT:
      return V_028780_COMB_DST_MINUS_SRC;
   case VK_BLEND_OP_MIN:
      return V_028780_COMB_MIN_DST_SRC;
   case VK_BLEND_OP_MAX:
      return V_028780_COMB_MAX_DST_SRC;
   default:
      return 0;
   }
}

static uint32_t
si_translate_blend_factor(VkBlendFactor factor)
{
   switch (factor) {
   case VK_BLEND_FACTOR_ZERO:
      return V_028780_BLEND_ZERO;
   case VK_BLEND_FACTOR_ONE:
      return V_028780_BLEND_ONE;
   case VK_BLEND_FACTOR_SRC_COLOR:
      return V_028780_BLEND_SRC_COLOR;
   case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
      return V_028780_BLEND_ONE_MINUS_SRC_COLOR;
   case VK_BLEND_FACTOR_DST_COLOR:
      return V_028780_BLEND_DST_COLOR;
   case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
      return V_028780_BLEND_ONE_MINUS_DST_COLOR;
   case VK_BLEND_FACTOR_SRC_ALPHA:
      return V_028780_BLEND_SRC_ALPHA;
   case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
      return V_028780_BLEND_ONE_MINUS_SRC_ALPHA;
   case VK_BLEND_FACTOR_DST_ALPHA:
      return V_028780_BLEND_DST_ALPHA;
   case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
      return V_028780_BLEND_ONE_MINUS_DST_ALPHA;
   case VK_BLEND_FACTOR_CONSTANT_COLOR:
      return V_028780_BLEND_CONSTANT_COLOR;
   case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
      return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR;
   case VK_BLEND_FACTOR_CONSTANT_ALPHA:
      return V_028780_BLEND_CONSTANT_ALPHA;
   case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
      return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA;
   case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
      return V_028780_BLEND_SRC_ALPHA_SATURATE;
   case VK_BLEND_FACTOR_SRC1_COLOR:
      return V_028780_BLEND_SRC1_COLOR;
   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
      return V_028780_BLEND_INV_SRC1_COLOR;
   case VK_BLEND_FACTOR_SRC1_ALPHA:
      return V_028780_BLEND_SRC1_ALPHA;
   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
      return V_028780_BLEND_INV_SRC1_ALPHA;
   default:
      return 0;
   }
}

static uint32_t
si_translate_blend_opt_function(VkBlendOp op)
{
   switch (op) {
   case VK_BLEND_OP_ADD:
      return V_028760_OPT_COMB_ADD;
   case VK_BLEND_OP_SUBTRACT:
      return V_028760_OPT_COMB_SUBTRACT;
   case VK_BLEND_OP_REVERSE_SUBTRACT:
      return V_028760_OPT_COMB_REVSUBTRACT;
   case VK_BLEND_OP_MIN:
      return V_028760_OPT_COMB_MIN;
   case VK_BLEND_OP_MAX:
      return V_028760_OPT_COMB_MAX;
   default:
      return V_028760_OPT_COMB_BLEND_DISABLED;
   }
}

static uint32_t
si_translate_blend_opt_factor(VkBlendFactor factor, bool is_alpha)
{
   switch (factor) {
   case VK_BLEND_FACTOR_ZERO:
      return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL;
   case VK_BLEND_FACTOR_ONE:
      return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE;
   case VK_BLEND_FACTOR_SRC_COLOR:
      return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0
                      : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0;
   case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
      return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1
                      : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1;
   case VK_BLEND_FACTOR_SRC_ALPHA:
      return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0;
   case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
      return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1;
   case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
      return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE
                      : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
   default:
      return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
   }
}

/**
 * Get rid of DST in the blend factors by commuting the operands:
 * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
 */
static void
si_blend_remove_dst(VkBlendOp *func, VkBlendFactor *src_factor, VkBlendFactor *dst_factor,
                    VkBlendFactor expected_dst, VkBlendFactor replacement_src)
{
   if (*src_factor == expected_dst && *dst_factor == VK_BLEND_FACTOR_ZERO) {
      *src_factor = VK_BLEND_FACTOR_ZERO;
      *dst_factor = replacement_src;

      /* Commuting the operands requires reversing subtractions. */
      if (*func == VK_BLEND_OP_SUBTRACT)
         *func = VK_BLEND_OP_REVERSE_SUBTRACT;
      else if (*func == VK_BLEND_OP_REVERSE_SUBTRACT)
         *func = VK_BLEND_OP_SUBTRACT;
   }
}

static bool
si_blend_factor_uses_dst(VkBlendFactor factor)
{
   return factor == VK_BLEND_FACTOR_DST_COLOR || factor == VK_BLEND_FACTOR_DST_ALPHA ||
          factor == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR;
}

static bool
is_dual_src(VkBlendFactor factor)
{
   switch (factor) {
   case VK_BLEND_FACTOR_SRC1_COLOR:
   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
   case VK_BLEND_FACTOR_SRC1_ALPHA:
   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
      return true;
   default:
      return false;
   }
}

static unsigned
radv_choose_spi_color_format(const struct radv_device *device, VkFormat vk_format,
                             bool blend_enable, bool blend_need_alpha)
{
   const struct util_format_description *desc = vk_format_description(vk_format);
   bool use_rbplus = device->physical_device->rad_info.rbplus_allowed;
   struct ac_spi_color_formats formats = {0};
   unsigned format, ntype, swap;

   format = radv_translate_colorformat(vk_format);
   ntype = radv_translate_color_numformat(vk_format, desc,
                                          vk_format_get_first_non_void_channel(vk_format));
   swap = radv_translate_colorswap(vk_format, false);

   ac_choose_spi_color_formats(format, swap, ntype, false, use_rbplus, &formats);

   if (blend_enable && blend_need_alpha)
      return formats.blend_alpha;
   else if (blend_need_alpha)
      return formats.alpha;
   else if (blend_enable)
      return formats.blend;
   else
      return formats.normal;
}

static bool
format_is_int8(VkFormat format)
{
   const struct util_format_description *desc = vk_format_description(format);
   int channel = vk_format_get_first_non_void_channel(format);

   return channel >= 0 && desc->channel[channel].pure_integer && desc->channel[channel].size == 8;
}

static bool
format_is_int10(VkFormat format)
{
   const struct util_format_description *desc = vk_format_description(format);

   if (desc->nr_channels != 4)
      return false;
   for (unsigned i = 0; i < 4; i++) {
      if (desc->channel[i].pure_integer && desc->channel[i].size == 10)
         return true;
   }
   return false;
}

static void
radv_pipeline_compute_spi_color_formats(const struct radv_pipeline *pipeline,
                                        const VkGraphicsPipelineCreateInfo *pCreateInfo,
                                        struct radv_blend_state *blend)
{
   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
   struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
   unsigned col_format = 0, is_int8 = 0, is_int10 = 0;
   unsigned num_targets;

   for (unsigned i = 0; i < (blend->single_cb_enable ? 1 : subpass->color_count); ++i) {
      unsigned cf;

      if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED ||
          !(blend->cb_target_mask & (0xfu << (i * 4)))) {
         cf = V_028714_SPI_SHADER_ZERO;
      } else {
         struct radv_render_pass_attachment *attachment =
            pass->attachments + subpass->color_attachments[i].attachment;
         bool blend_enable = blend->blend_enable_4bit & (0xfu << (i * 4));

         cf = radv_choose_spi_color_format(pipeline->device, attachment->format, blend_enable,
                                           blend->need_src_alpha & (1 << i));

         if (format_is_int8(attachment->format))
            is_int8 |= 1 << i;
         if (format_is_int10(attachment->format))
            is_int10 |= 1 << i;
      }

      col_format |= cf << (4 * i);
   }

   if (!(col_format & 0xf) && blend->need_src_alpha & (1 << 0)) {
      /* When a subpass doesn't have any color attachments, write the
       * alpha channel of MRT0 when alpha coverage is enabled because
       * the depth attachment needs it.
       */
      col_format |= V_028714_SPI_SHADER_32_AR;
   }

   /* If the i-th target format is set, all previous target formats must
    * be non-zero to avoid hangs.
    */
   num_targets = (util_last_bit(col_format) + 3) / 4;
   for (unsigned i = 0; i < num_targets; i++) {
      if (!(col_format & (0xfu << (i * 4)))) {
         col_format |= V_028714_SPI_SHADER_32_R << (i * 4);
      }
   }

   /* The output for dual source blending should have the same format as
    * the first output.
    */
   if (blend->mrt0_is_dual_src) {
      assert(!(col_format >> 4));
      col_format |= (col_format & 0xf) << 4;
   }

   blend->cb_shader_mask = ac_get_cb_shader_mask(col_format);
   blend->spi_shader_col_format = col_format;
   blend->col_format_is_int8 = is_int8;
   blend->col_format_is_int10 = is_int10;
}

/*
 * Ordered so that for each i,
 * radv_format_meta_fs_key(radv_fs_key_format_exemplars[i]) == i.
 */
const VkFormat radv_fs_key_format_exemplars[NUM_META_FS_KEYS] = {
   VK_FORMAT_R32_SFLOAT,
   VK_FORMAT_R32G32_SFLOAT,
   VK_FORMAT_R8G8B8A8_UNORM,
   VK_FORMAT_R16G16B16A16_UNORM,
   VK_FORMAT_R16G16B16A16_SNORM,
   VK_FORMAT_R16G16B16A16_UINT,
   VK_FORMAT_R16G16B16A16_SINT,
   VK_FORMAT_R32G32B32A32_SFLOAT,
   VK_FORMAT_R8G8B8A8_UINT,
   VK_FORMAT_R8G8B8A8_SINT,
   VK_FORMAT_A2R10G10B10_UINT_PACK32,
   VK_FORMAT_A2R10G10B10_SINT_PACK32,
};

unsigned
radv_format_meta_fs_key(struct radv_device *device, VkFormat format)
{
   unsigned col_format = radv_choose_spi_color_format(device, format, false, false);
   assert(col_format != V_028714_SPI_SHADER_32_AR);

   bool is_int8 = format_is_int8(format);
   bool is_int10 = format_is_int10(format);

   if (col_format == V_028714_SPI_SHADER_UINT16_ABGR && is_int8)
      return 8;
   else if (col_format == V_028714_SPI_SHADER_SINT16_ABGR && is_int8)
      return 9;
   else if (col_format == V_028714_SPI_SHADER_UINT16_ABGR && is_int10)
      return 10;
   else if (col_format == V_028714_SPI_SHADER_SINT16_ABGR && is_int10)
      return 11;
   else {
      if (col_format >= V_028714_SPI_SHADER_32_AR)
         --col_format; /* Skip V_028714_SPI_SHADER_32_AR since there is no such VkFormat */

      --col_format; /* Skip V_028714_SPI_SHADER_ZERO */
      return col_format;
   }
}

static void
radv_blend_check_commutativity(struct radv_blend_state *blend, VkBlendOp op, VkBlendFactor src,
                               VkBlendFactor dst, unsigned chanmask)
{
   /* Src factor is allowed when it does not depend on Dst. */
   static const uint32_t src_allowed =
      (1u << VK_BLEND_FACTOR_ONE) | (1u << VK_BLEND_FACTOR_SRC_COLOR) |
      (1u << VK_BLEND_FACTOR_SRC_ALPHA) | (1u << VK_BLEND_FACTOR_SRC_ALPHA_SATURATE) |
      (1u << VK_BLEND_FACTOR_CONSTANT_COLOR) | (1u << VK_BLEND_FACTOR_CONSTANT_ALPHA) |
      (1u << VK_BLEND_FACTOR_SRC1_COLOR) | (1u << VK_BLEND_FACTOR_SRC1_ALPHA) |
      (1u << VK_BLEND_FACTOR_ZERO) | (1u << VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR) |
      (1u << VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA) |
      (1u << VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR) |
      (1u << VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA) |
      (1u << VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR) | (1u << VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA);

   if (dst == VK_BLEND_FACTOR_ONE && (src_allowed & (1u << src))) {
      /* Addition is commutative, but floating point addition isn't
       * associative: subtle changes can be introduced via different
       * rounding. Be conservative, only enable for min and max.
       */
      if (op == VK_BLEND_OP_MAX || op == VK_BLEND_OP_MIN)
         blend->commutative_4bit |= chanmask;
   }
}

static struct radv_blend_state
radv_pipeline_init_blend_state(struct radv_pipeline *pipeline,
                               const VkGraphicsPipelineCreateInfo *pCreateInfo,
                               const struct radv_graphics_pipeline_create_info *extra)
{
   const VkPipelineColorBlendStateCreateInfo *vkblend =
      radv_pipeline_get_color_blend_state(pCreateInfo);
   const VkPipelineMultisampleStateCreateInfo *vkms =
      radv_pipeline_get_multisample_state(pCreateInfo);
   struct radv_blend_state blend = {0};
   unsigned mode = V_028808_CB_NORMAL;
   unsigned cb_color_control = 0;
   int i;

   if (extra && extra->custom_blend_mode) {
      blend.single_cb_enable = true;
      mode = extra->custom_blend_mode;
   }

   if (vkblend) {
      if (vkblend->logicOpEnable)
         cb_color_control |= S_028808_ROP3(si_translate_blend_logic_op(vkblend->logicOp));
      else
         cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY);
   }

   blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
                            S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
                            S_028B70_OFFSET_ROUND(1);

   if (vkms && vkms->alphaToCoverageEnable) {
      blend.db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(1);
      blend.need_src_alpha |= 0x1;
   }

   blend.cb_target_mask = 0;
   if (vkblend) {
      for (i = 0; i < vkblend->attachmentCount; i++) {
         const VkPipelineColorBlendAttachmentState *att = &vkblend->pAttachments[i];
         unsigned blend_cntl = 0;
         unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
         VkBlendOp eqRGB = att->colorBlendOp;
         VkBlendFactor srcRGB = att->srcColorBlendFactor;
         VkBlendFactor dstRGB = att->dstColorBlendFactor;
         VkBlendOp eqA = att->alphaBlendOp;
         VkBlendFactor srcA = att->srcAlphaBlendFactor;
         VkBlendFactor dstA = att->dstAlphaBlendFactor;

         blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
                                     S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);

         if (!att->colorWriteMask)
            continue;

         /* Ignore other blend targets if dual-source blending
          * is enabled to prevent wrong behaviour.
          */
         if (blend.mrt0_is_dual_src)
            continue;

         blend.cb_target_mask |= (unsigned)att->colorWriteMask << (4 * i);
         blend.cb_target_enabled_4bit |= 0xfu << (4 * i);
         if (!att->blendEnable) {
            blend.cb_blend_control[i] = blend_cntl;
            continue;
         }

         if (is_dual_src(srcRGB) || is_dual_src(dstRGB) || is_dual_src(srcA) || is_dual_src(dstA))
            if (i == 0)
               blend.mrt0_is_dual_src = true;

         if (eqRGB == VK_BLEND_OP_MIN || eqRGB == VK_BLEND_OP_MAX) {
            srcRGB = VK_BLEND_FACTOR_ONE;
            dstRGB = VK_BLEND_FACTOR_ONE;
         }
         if (eqA == VK_BLEND_OP_MIN || eqA == VK_BLEND_OP_MAX) {
            srcA = VK_BLEND_FACTOR_ONE;
            dstA = VK_BLEND_FACTOR_ONE;
         }

         radv_blend_check_commutativity(&blend, eqRGB, srcRGB, dstRGB, 0x7u << (4 * i));
         radv_blend_check_commutativity(&blend, eqA, srcA, dstA, 0x8u << (4 * i));

         /* Blending optimizations for RB+.
          * These transformations don't change the behavior.
          *
          * First, get rid of DST in the blend factors:
          * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
          */
         si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, VK_BLEND_FACTOR_DST_COLOR,
                             VK_BLEND_FACTOR_SRC_COLOR);

         si_blend_remove_dst(&eqA, &srcA, &dstA, VK_BLEND_FACTOR_DST_COLOR,
                             VK_BLEND_FACTOR_SRC_COLOR);

         si_blend_remove_dst(&eqA, &srcA, &dstA, VK_BLEND_FACTOR_DST_ALPHA,
                             VK_BLEND_FACTOR_SRC_ALPHA);

         /* Look up the ideal settings from tables. */
         srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false);
         dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false);
         srcA_opt = si_translate_blend_opt_factor(srcA, true);
         dstA_opt = si_translate_blend_opt_factor(dstA, true);

         /* Handle interdependencies. */
         if (si_blend_factor_uses_dst(srcRGB))
            dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
         if (si_blend_factor_uses_dst(srcA))
            dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;

         if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE &&
             (dstRGB == VK_BLEND_FACTOR_ZERO || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
              dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE))
            dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;

         /* Set the final value. */
         blend.sx_mrt_blend_opt[i] =
            S_028760_COLOR_SRC_OPT(srcRGB_opt) | S_028760_COLOR_DST_OPT(dstRGB_opt) |
            S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) |
            S_028760_ALPHA_SRC_OPT(srcA_opt) | S_028760_ALPHA_DST_OPT(dstA_opt) |
            S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA));
         blend_cntl |= S_028780_ENABLE(1);

         blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB));
         blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB));
         blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB));
         if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
            blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1);
            blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA));
            blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA));
            blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA));
         }
         blend.cb_blend_control[i] = blend_cntl;

         blend.blend_enable_4bit |= 0xfu << (i * 4);

         if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
             srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
             dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
             srcRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA ||
             dstRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA)
            blend.need_src_alpha |= 1 << i;
      }
      for (i = vkblend->attachmentCount; i < 8; i++) {
         blend.cb_blend_control[i] = 0;
         blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
                                     S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
      }
   }

   if (pipeline->device->physical_device->rad_info.has_rbplus) {
      /* Disable RB+ blend optimizations for dual source blending. */
      if (blend.mrt0_is_dual_src) {
         for (i = 0; i < 8; i++) {
            blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) |
                                        S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
         }
      }

      /* RB+ doesn't work with dual source blending, logic op and
       * RESOLVE.
       */
      if (blend.mrt0_is_dual_src || (vkblend && vkblend->logicOpEnable) ||
          mode == V_028808_CB_RESOLVE)
         cb_color_control |= S_028808_DISABLE_DUAL_QUAD(1);
   }

   if (blend.cb_target_mask)
      cb_color_control |= S_028808_MODE(mode);
   else
      cb_color_control |= S_028808_MODE(V_028808_CB_DISABLE);

   radv_pipeline_compute_spi_color_formats(pipeline, pCreateInfo, &blend);

   pipeline->graphics.cb_color_control = cb_color_control;

   return blend;
}

static uint32_t
si_translate_fill(VkPolygonMode func)
{
   switch (func) {
   case VK_POLYGON_MODE_FILL:
      return V_028814_X_DRAW_TRIANGLES;
   case VK_POLYGON_MODE_LINE:
      return V_028814_X_DRAW_LINES;
   case VK_POLYGON_MODE_POINT:
      return V_028814_X_DRAW_POINTS;
   default:
      assert(0);
      return V_028814_X_DRAW_POINTS;
   }
}

static uint8_t
radv_pipeline_get_ps_iter_samples(const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   const VkPipelineMultisampleStateCreateInfo *vkms = pCreateInfo->pMultisampleState;
   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
   struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
   uint32_t ps_iter_samples = 1;
   uint32_t num_samples;

   /* From the Vulkan 1.1.129 spec, 26.7. Sample Shading:
    *
    * "If the VK_AMD_mixed_attachment_samples extension is enabled and the
    * subpass uses color attachments, totalSamples is the number of
    * samples of the color attachments. Otherwise, totalSamples is the
    * value of VkPipelineMultisampleStateCreateInfo::rasterizationSamples
    * specified at pipeline creation time."
    */
   if (subpass->has_color_att) {
      num_samples = subpass->color_sample_count;
   } else {
      num_samples = vkms->rasterizationSamples;
   }

   if (vkms->sampleShadingEnable) {
      ps_iter_samples = ceilf(vkms->minSampleShading * num_samples);
      ps_iter_samples = util_next_power_of_two(ps_iter_samples);
   }
   return ps_iter_samples;
}

static bool
radv_is_depth_write_enabled(const VkPipelineDepthStencilStateCreateInfo *pCreateInfo)
{
   return pCreateInfo->depthTestEnable && pCreateInfo->depthWriteEnable &&
          pCreateInfo->depthCompareOp != VK_COMPARE_OP_NEVER;
}

static bool
radv_writes_stencil(const VkStencilOpState *state)
{
   return state->writeMask &&
          (state->failOp != VK_STENCIL_OP_KEEP || state->passOp != VK_STENCIL_OP_KEEP ||
           state->depthFailOp != VK_STENCIL_OP_KEEP);
}

static bool
radv_is_stencil_write_enabled(const VkPipelineDepthStencilStateCreateInfo *pCreateInfo)
{
   return pCreateInfo->stencilTestEnable &&
          (radv_writes_stencil(&pCreateInfo->front) || radv_writes_stencil(&pCreateInfo->back));
}

static bool
radv_is_ds_write_enabled(const VkPipelineDepthStencilStateCreateInfo *pCreateInfo)
{
   return radv_is_depth_write_enabled(pCreateInfo) || radv_is_stencil_write_enabled(pCreateInfo);
}

static bool
radv_order_invariant_stencil_op(VkStencilOp op)
{
   /* REPLACE is normally order invariant, except when the stencil
    * reference value is written by the fragment shader. Tracking this
    * interaction does not seem worth the effort, so be conservative.
    */
   return op != VK_STENCIL_OP_INCREMENT_AND_CLAMP && op != VK_STENCIL_OP_DECREMENT_AND_CLAMP &&
          op != VK_STENCIL_OP_REPLACE;
}

static bool
radv_order_invariant_stencil_state(const VkStencilOpState *state)
{
   /* Compute whether, assuming Z writes are disabled, this stencil state
    * is order invariant in the sense that the set of passing fragments as
    * well as the final stencil buffer result does not depend on the order
    * of fragments.
    */
   return !state->writeMask ||
          /* The following assumes that Z writes are disabled. */
          (state->compareOp == VK_COMPARE_OP_ALWAYS &&
           radv_order_invariant_stencil_op(state->passOp) &&
           radv_order_invariant_stencil_op(state->depthFailOp)) ||
          (state->compareOp == VK_COMPARE_OP_NEVER &&
           radv_order_invariant_stencil_op(state->failOp));
}

static bool
radv_pipeline_has_dynamic_ds_states(const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   VkDynamicState ds_states[] = {
      VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT, VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT,
      VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT, VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT,
      VK_DYNAMIC_STATE_STENCIL_OP_EXT,
   };

   for (uint32_t i = 0; i < ARRAY_SIZE(ds_states); i++) {
      if (radv_is_state_dynamic(pCreateInfo, ds_states[i]))
         return true;
   }

   return false;
}

910
static bool
911
radv_pipeline_out_of_order_rast(struct radv_pipeline *pipeline,
912
const struct radv_blend_state *blend,
913
const VkGraphicsPipelineCreateInfo *pCreateInfo)
914
{
915
RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
916
struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
917
const VkPipelineDepthStencilStateCreateInfo *vkds =
918
radv_pipeline_get_depth_stencil_state(pCreateInfo);
919
const VkPipelineColorBlendStateCreateInfo *vkblend =
920
radv_pipeline_get_color_blend_state(pCreateInfo);
921
unsigned colormask = blend->cb_target_enabled_4bit;
922
923
if (!pipeline->device->physical_device->out_of_order_rast_allowed)
924
return false;
925
926
/* Be conservative if a logic operation is enabled with color buffers. */
927
if (colormask && vkblend && vkblend->logicOpEnable)
928
return false;
929
930
/* Be conservative if an extended dynamic depth/stencil state is
931
* enabled because the driver can't update out-of-order rasterization
932
* dynamically.
933
*/
934
if (radv_pipeline_has_dynamic_ds_states(pCreateInfo))
935
return false;
936
937
/* Default depth/stencil invariance when no attachment is bound. */
938
struct radv_dsa_order_invariance dsa_order_invariant = {.zs = true, .pass_set = true};
939
940
if (vkds) {
941
struct radv_render_pass_attachment *attachment =
942
pass->attachments + subpass->depth_stencil_attachment->attachment;
943
bool has_stencil = vk_format_has_stencil(attachment->format);
944
struct radv_dsa_order_invariance order_invariance[2];
945
struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
946
947
/* Compute depth/stencil order invariance in order to know if
948
* it's safe to enable out-of-order.
949
*/
950
bool zfunc_is_ordered = vkds->depthCompareOp == VK_COMPARE_OP_NEVER ||
951
vkds->depthCompareOp == VK_COMPARE_OP_LESS ||
952
vkds->depthCompareOp == VK_COMPARE_OP_LESS_OR_EQUAL ||
953
vkds->depthCompareOp == VK_COMPARE_OP_GREATER ||
954
vkds->depthCompareOp == VK_COMPARE_OP_GREATER_OR_EQUAL;
955
956
bool nozwrite_and_order_invariant_stencil =
957
!radv_is_ds_write_enabled(vkds) ||
958
(!radv_is_depth_write_enabled(vkds) && radv_order_invariant_stencil_state(&vkds->front) &&
959
radv_order_invariant_stencil_state(&vkds->back));
960
961
order_invariance[1].zs = nozwrite_and_order_invariant_stencil ||
962
(!radv_is_stencil_write_enabled(vkds) && zfunc_is_ordered);
963
order_invariance[0].zs = !radv_is_depth_write_enabled(vkds) || zfunc_is_ordered;
964
965
order_invariance[1].pass_set =
966
nozwrite_and_order_invariant_stencil ||
967
(!radv_is_stencil_write_enabled(vkds) && (vkds->depthCompareOp == VK_COMPARE_OP_ALWAYS ||
968
vkds->depthCompareOp == VK_COMPARE_OP_NEVER));
969
order_invariance[0].pass_set =
970
!radv_is_depth_write_enabled(vkds) || (vkds->depthCompareOp == VK_COMPARE_OP_ALWAYS ||
971
vkds->depthCompareOp == VK_COMPARE_OP_NEVER);
972
973
dsa_order_invariant = order_invariance[has_stencil];
974
if (!dsa_order_invariant.zs)
975
return false;
976
977
/* The set of PS invocations is always order invariant,
978
* except when early Z/S tests are requested.
979
*/
980
if (ps && ps->info.ps.writes_memory && ps->info.ps.early_fragment_test &&
981
!dsa_order_invariant.pass_set)
982
return false;
983
984
/* Determine if out-of-order rasterization should be disabled
985
* when occlusion queries are used.
986
*/
987
pipeline->graphics.disable_out_of_order_rast_for_occlusion = !dsa_order_invariant.pass_set;
988
}
989
990
/* No color buffers are enabled for writing. */
991
if (!colormask)
992
return true;
993
994
unsigned blendmask = colormask & blend->blend_enable_4bit;
995
996
if (blendmask) {
997
/* Only commutative blending. */
998
if (blendmask & ~blend->commutative_4bit)
999
return false;
1000
1001
if (!dsa_order_invariant.pass_set)
1002
return false;
1003
}
1004
1005
if (colormask & ~blendmask)
1006
return false;
1007
1008
return true;
1009
}
1010
1011
static const VkConservativeRasterizationModeEXT
radv_get_conservative_raster_mode(const VkPipelineRasterizationStateCreateInfo *pCreateInfo)
{
   const VkPipelineRasterizationConservativeStateCreateInfoEXT *conservative_raster =
      vk_find_struct_const(pCreateInfo->pNext,
                           PIPELINE_RASTERIZATION_CONSERVATIVE_STATE_CREATE_INFO_EXT);

   if (!conservative_raster)
      return VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
   return conservative_raster->conservativeRasterizationMode;
}

static void
radv_pipeline_init_multisample_state(struct radv_pipeline *pipeline,
                                     const struct radv_blend_state *blend,
                                     const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   const VkPipelineMultisampleStateCreateInfo *vkms =
      radv_pipeline_get_multisample_state(pCreateInfo);
   struct radv_multisample_state *ms = &pipeline->graphics.ms;
   unsigned num_tile_pipes = pipeline->device->physical_device->rad_info.num_tile_pipes;
   const VkConservativeRasterizationModeEXT mode =
      radv_get_conservative_raster_mode(pCreateInfo->pRasterizationState);
   bool out_of_order_rast = false;
   int ps_iter_samples = 1;
   uint32_t mask = 0xffff;

   if (vkms) {
      ms->num_samples = vkms->rasterizationSamples;

      /* From the Vulkan 1.1.129 spec, 26.7. Sample Shading:
       *
       * "Sample shading is enabled for a graphics pipeline:
       *
       * - If the interface of the fragment shader entry point of the
       *   graphics pipeline includes an input variable decorated
       *   with SampleId or SamplePosition. In this case
       *   minSampleShadingFactor takes the value 1.0.
       * - Else if the sampleShadingEnable member of the
       *   VkPipelineMultisampleStateCreateInfo structure specified
       *   when creating the graphics pipeline is set to VK_TRUE. In
       *   this case minSampleShadingFactor takes the value of
       *   VkPipelineMultisampleStateCreateInfo::minSampleShading.
       *
       * Otherwise, sample shading is considered disabled."
       */
      if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.uses_sample_shading) {
         ps_iter_samples = ms->num_samples;
      } else {
         ps_iter_samples = radv_pipeline_get_ps_iter_samples(pCreateInfo);
      }
   } else {
      ms->num_samples = 1;
   }

   const struct VkPipelineRasterizationStateRasterizationOrderAMD *raster_order =
      vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
                           PIPELINE_RASTERIZATION_STATE_RASTERIZATION_ORDER_AMD);
   if (raster_order && raster_order->rasterizationOrder == VK_RASTERIZATION_ORDER_RELAXED_AMD) {
      /* Out-of-order rasterization is explicitly enabled by the
       * application.
       */
      out_of_order_rast = true;
   } else {
      /* Determine if the driver can enable out-of-order
       * rasterization internally.
       */
      out_of_order_rast = radv_pipeline_out_of_order_rast(pipeline, blend, pCreateInfo);
   }

   ms->pa_sc_aa_config = 0;
   ms->db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(1) |
                 S_028804_INTERPOLATE_COMP_Z(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);

   /* Adjust MSAA state if conservative rasterization is enabled. */
   if (mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
      ms->pa_sc_aa_config |= S_028BE0_AA_MASK_CENTROID_DTMN(1);

      ms->db_eqaa |=
         S_028804_ENABLE_POSTZ_OVERRASTERIZATION(1) | S_028804_OVERRASTERIZATION_AMOUNT(4);
   }

   ms->pa_sc_mode_cntl_1 =
      S_028A4C_WALK_FENCE_ENABLE(1) | // TODO linear dst fixes
      S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) |
      S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) |
      S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
      /* always 1: */
      S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) |
      S_028A4C_TILE_WALK_ORDER_ENABLE(1) | S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
      S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | S_028A4C_FORCE_EOV_REZ_ENABLE(1);
   ms->pa_sc_mode_cntl_0 = S_028A48_ALTERNATE_RBS_PER_TILE(
                              pipeline->device->physical_device->rad_info.chip_class >= GFX9) |
                           S_028A48_VPORT_SCISSOR_ENABLE(1);

   const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line = vk_find_struct_const(
      pCreateInfo->pRasterizationState->pNext, PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
   if (rast_line) {
      ms->pa_sc_mode_cntl_0 |= S_028A48_LINE_STIPPLE_ENABLE(rast_line->stippledLineEnable);
      if (rast_line->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) {
         /* From the Vulkan spec 1.1.129:
          *
          * "When VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT lines
          * are being rasterized, sample locations may all be
          * treated as being at the pixel center (this may
          * affect attribute and depth interpolation)."
          */
         ms->num_samples = 1;
      }
   }

   if (ms->num_samples > 1) {
      RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
      struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
      uint32_t z_samples =
         subpass->depth_stencil_attachment ? subpass->depth_sample_count : ms->num_samples;
      unsigned log_samples = util_logbase2(ms->num_samples);
      unsigned log_z_samples = util_logbase2(z_samples);
      unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
      ms->pa_sc_mode_cntl_0 |= S_028A48_MSAA_ENABLE(1);
      ms->db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
                     S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
                     S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
                     S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
      ms->pa_sc_aa_config |=
         S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
         S_028BE0_MAX_SAMPLE_DIST(radv_get_default_max_sample_dist(log_samples)) |
         S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples) | /* CM_R_028BE0_PA_SC_AA_CONFIG */
         S_028BE0_COVERED_CENTROID_IS_CENTER(
            pipeline->device->physical_device->rad_info.chip_class >= GFX10_3);
      ms->pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1);
      if (ps_iter_samples > 1)
         pipeline->graphics.spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
   }

   if (vkms && vkms->pSampleMask) {
      mask = vkms->pSampleMask[0] & 0xffff;
   }

   ms->pa_sc_aa_mask[0] = mask | (mask << 16);
   ms->pa_sc_aa_mask[1] = mask | (mask << 16);
}

static void
gfx103_pipeline_init_vrs_state(struct radv_pipeline *pipeline,
                               const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   const VkPipelineMultisampleStateCreateInfo *vkms =
      radv_pipeline_get_multisample_state(pCreateInfo);
   struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
   struct radv_multisample_state *ms = &pipeline->graphics.ms;
   struct radv_vrs_state *vrs = &pipeline->graphics.vrs;

   if (vkms && (vkms->sampleShadingEnable || ps->info.ps.uses_sample_shading ||
                ps->info.ps.reads_sample_mask_in)) {
      /* Disable VRS and use the rates from PS_ITER_SAMPLES if:
       *
       * 1) sample shading is enabled or per-sample interpolation is
       *    used by the fragment shader
       * 2) the fragment shader reads gl_SampleMaskIn because the
       *    16-bit sample coverage mask isn't enough for MSAA8x and
       *    2x2 coarse shading isn't enough.
       */
      vrs->pa_cl_vrs_cntl = S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE);

      /* Make sure sample shading is enabled even if only MSAA1x is
       * used because the SAMPLE_ITER combiner is in passthrough
       * mode if PS_ITER_SAMPLE is 0, and it uses the per-draw rate.
       * The default VRS rate when sample shading is enabled is 1x1.
       */
      if (!G_028A4C_PS_ITER_SAMPLE(ms->pa_sc_mode_cntl_1))
         ms->pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(1);
   } else {
      vrs->pa_cl_vrs_cntl = S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU);
   }

   /* The primitive combiner is always passthrough. */
   vrs->pa_cl_vrs_cntl |= S_028848_PRIMITIVE_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU);
}

static bool
radv_prim_can_use_guardband(enum VkPrimitiveTopology topology)
{
   switch (topology) {
   case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
      return false;
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
   case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
      return true;
   default:
      unreachable("unhandled primitive type");
   }
}

static uint32_t
si_conv_gl_prim_to_gs_out(unsigned gl_prim)
{
   switch (gl_prim) {
   case 0: /* GL_POINTS */
      return V_028A6C_POINTLIST;
   case 1: /* GL_LINES */
   case 3: /* GL_LINE_STRIP */
   case 0xA: /* GL_LINE_STRIP_ADJACENCY_ARB */
   case 0x8E7A: /* GL_ISOLINES */
      return V_028A6C_LINESTRIP;

   case 4: /* GL_TRIANGLES */
   case 0xc: /* GL_TRIANGLES_ADJACENCY_ARB */
   case 5: /* GL_TRIANGLE_STRIP */
   case 7: /* GL_QUADS */
      return V_028A6C_TRISTRIP;
   default:
      assert(0);
      return 0;
   }
}

static uint32_t
si_conv_prim_to_gs_out(enum VkPrimitiveTopology topology)
{
   switch (topology) {
   case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
   case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
      return V_028A6C_POINTLIST;
   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
      return V_028A6C_LINESTRIP;
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
      return V_028A6C_TRISTRIP;
   default:
      assert(0);
      return 0;
   }
}

static uint64_t
radv_dynamic_state_mask(VkDynamicState state)
{
   switch (state) {
   case VK_DYNAMIC_STATE_VIEWPORT:
   case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT_EXT:
      return RADV_DYNAMIC_VIEWPORT;
   case VK_DYNAMIC_STATE_SCISSOR:
   case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT_EXT:
      return RADV_DYNAMIC_SCISSOR;
   case VK_DYNAMIC_STATE_LINE_WIDTH:
      return RADV_DYNAMIC_LINE_WIDTH;
   case VK_DYNAMIC_STATE_DEPTH_BIAS:
      return RADV_DYNAMIC_DEPTH_BIAS;
   case VK_DYNAMIC_STATE_BLEND_CONSTANTS:
      return RADV_DYNAMIC_BLEND_CONSTANTS;
   case VK_DYNAMIC_STATE_DEPTH_BOUNDS:
      return RADV_DYNAMIC_DEPTH_BOUNDS;
   case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK:
      return RADV_DYNAMIC_STENCIL_COMPARE_MASK;
   case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK:
      return RADV_DYNAMIC_STENCIL_WRITE_MASK;
   case VK_DYNAMIC_STATE_STENCIL_REFERENCE:
      return RADV_DYNAMIC_STENCIL_REFERENCE;
   case VK_DYNAMIC_STATE_DISCARD_RECTANGLE_EXT:
      return RADV_DYNAMIC_DISCARD_RECTANGLE;
   case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
      return RADV_DYNAMIC_SAMPLE_LOCATIONS;
   case VK_DYNAMIC_STATE_LINE_STIPPLE_EXT:
      return RADV_DYNAMIC_LINE_STIPPLE;
   case VK_DYNAMIC_STATE_CULL_MODE_EXT:
      return RADV_DYNAMIC_CULL_MODE;
   case VK_DYNAMIC_STATE_FRONT_FACE_EXT:
      return RADV_DYNAMIC_FRONT_FACE;
   case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT:
      return RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
   case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT:
      return RADV_DYNAMIC_DEPTH_TEST_ENABLE;
   case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT:
      return RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
   case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT:
      return RADV_DYNAMIC_DEPTH_COMPARE_OP;
   case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT:
      return RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
   case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT:
      return RADV_DYNAMIC_STENCIL_TEST_ENABLE;
   case VK_DYNAMIC_STATE_STENCIL_OP_EXT:
      return RADV_DYNAMIC_STENCIL_OP;
   case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT:
      return RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE;
   case VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR:
      return RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
   case VK_DYNAMIC_STATE_PATCH_CONTROL_POINTS_EXT:
      return RADV_DYNAMIC_PATCH_CONTROL_POINTS;
   case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT:
      return RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
   case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE_EXT:
      return RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
   case VK_DYNAMIC_STATE_LOGIC_OP_EXT:
      return RADV_DYNAMIC_LOGIC_OP;
   case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE_EXT:
      return RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
   case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT:
      return RADV_DYNAMIC_COLOR_WRITE_ENABLE;
   default:
      unreachable("Unhandled dynamic state");
   }
}

static bool
radv_pipeline_is_blend_enabled(const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   const VkPipelineColorBlendStateCreateInfo *vkblend =
      radv_pipeline_get_color_blend_state(pCreateInfo);

   assert(vkblend);

   for (uint32_t i = 0; i < vkblend->attachmentCount; i++) {
      const VkPipelineColorBlendAttachmentState *att = &vkblend->pAttachments[i];
      if (att->colorWriteMask && att->blendEnable)
         return true;
   }
   return false;
}

static uint64_t
radv_pipeline_needed_dynamic_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
   struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
   uint64_t states = RADV_DYNAMIC_ALL;

   /* If rasterization is disabled we do not care about any of the
    * dynamic states, since they are all rasterization related only,
    * except primitive topology, primitive restart enable, vertex
    * binding stride and rasterization discard itself.
    */
   if (pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
       !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT)) {
      return RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE |
             RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE | RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
   }

   if (!pCreateInfo->pRasterizationState->depthBiasEnable &&
       !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE_EXT))
      states &= ~RADV_DYNAMIC_DEPTH_BIAS;

   if (!pCreateInfo->pDepthStencilState ||
       (!pCreateInfo->pDepthStencilState->depthBoundsTestEnable &&
        !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT)))
      states &= ~RADV_DYNAMIC_DEPTH_BOUNDS;

   if (!pCreateInfo->pDepthStencilState ||
       (!pCreateInfo->pDepthStencilState->stencilTestEnable &&
        !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT)))
      states &= ~(RADV_DYNAMIC_STENCIL_COMPARE_MASK | RADV_DYNAMIC_STENCIL_WRITE_MASK |
                  RADV_DYNAMIC_STENCIL_REFERENCE);

   if (!vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT))
      states &= ~RADV_DYNAMIC_DISCARD_RECTANGLE;

   if (!pCreateInfo->pMultisampleState ||
       !vk_find_struct_const(pCreateInfo->pMultisampleState->pNext,
                             PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT))
      states &= ~RADV_DYNAMIC_SAMPLE_LOCATIONS;

   if (!pCreateInfo->pRasterizationState ||
       !vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
                             PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT))
      states &= ~RADV_DYNAMIC_LINE_STIPPLE;

   if (!vk_find_struct_const(pCreateInfo->pNext,
                             PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR) &&
       !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR))
      states &= ~RADV_DYNAMIC_FRAGMENT_SHADING_RATE;

   if (!subpass->has_color_att ||
       !radv_pipeline_is_blend_enabled(pCreateInfo))
      states &= ~RADV_DYNAMIC_BLEND_CONSTANTS;

   if (!subpass->has_color_att)
      states &= ~RADV_DYNAMIC_COLOR_WRITE_ENABLE;

   return states;
}

static struct radv_ia_multi_vgt_param_helpers
radv_compute_ia_multi_vgt_param_helpers(struct radv_pipeline *pipeline)
{
   struct radv_ia_multi_vgt_param_helpers ia_multi_vgt_param = {0};
   const struct radv_device *device = pipeline->device;

   if (radv_pipeline_has_tess(pipeline))
      ia_multi_vgt_param.primgroup_size =
         pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
   else if (radv_pipeline_has_gs(pipeline))
      ia_multi_vgt_param.primgroup_size = 64;
   else
      ia_multi_vgt_param.primgroup_size = 128; /* recommended without a GS */

   /* GS requirement. */
   ia_multi_vgt_param.partial_es_wave = false;
   if (radv_pipeline_has_gs(pipeline) && device->physical_device->rad_info.chip_class <= GFX8)
      if (SI_GS_PER_ES / ia_multi_vgt_param.primgroup_size >= pipeline->device->gs_table_depth - 3)
         ia_multi_vgt_param.partial_es_wave = true;

   ia_multi_vgt_param.ia_switch_on_eoi = false;
   if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.prim_id_input)
      ia_multi_vgt_param.ia_switch_on_eoi = true;
   if (radv_pipeline_has_gs(pipeline) && pipeline->shaders[MESA_SHADER_GEOMETRY]->info.uses_prim_id)
      ia_multi_vgt_param.ia_switch_on_eoi = true;
   if (radv_pipeline_has_tess(pipeline)) {
      /* SWITCH_ON_EOI must be set if PrimID is used. */
      if (pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id ||
          radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.uses_prim_id)
         ia_multi_vgt_param.ia_switch_on_eoi = true;
   }

   ia_multi_vgt_param.partial_vs_wave = false;
   if (radv_pipeline_has_tess(pipeline)) {
      /* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
      if ((device->physical_device->rad_info.family == CHIP_TAHITI ||
           device->physical_device->rad_info.family == CHIP_PITCAIRN ||
           device->physical_device->rad_info.family == CHIP_BONAIRE) &&
          radv_pipeline_has_gs(pipeline))
         ia_multi_vgt_param.partial_vs_wave = true;
      /* Needed for 028B6C_DISTRIBUTION_MODE != 0 */
      if (device->physical_device->rad_info.has_distributed_tess) {
         if (radv_pipeline_has_gs(pipeline)) {
            if (device->physical_device->rad_info.chip_class <= GFX8)
               ia_multi_vgt_param.partial_es_wave = true;
         } else {
            ia_multi_vgt_param.partial_vs_wave = true;
         }
      }
   }

   if (radv_pipeline_has_gs(pipeline)) {
      /* On these chips there is the possibility of a hang if the
       * pipeline uses a GS and partial_vs_wave is not set.
       *
       * This mostly does not hit 4-SE chips, as those typically set
       * ia_switch_on_eoi and then partial_vs_wave is set for pipelines
       * with GS due to another workaround.
       *
       * Reproducer: https://bugs.freedesktop.org/show_bug.cgi?id=109242
       */
      if (device->physical_device->rad_info.family == CHIP_TONGA ||
          device->physical_device->rad_info.family == CHIP_FIJI ||
          device->physical_device->rad_info.family == CHIP_POLARIS10 ||
          device->physical_device->rad_info.family == CHIP_POLARIS11 ||
          device->physical_device->rad_info.family == CHIP_POLARIS12 ||
          device->physical_device->rad_info.family == CHIP_VEGAM) {
         ia_multi_vgt_param.partial_vs_wave = true;
      }
   }

   ia_multi_vgt_param.base =
      S_028AA8_PRIMGROUP_SIZE(ia_multi_vgt_param.primgroup_size - 1) |
      /* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */
      S_028AA8_MAX_PRIMGRP_IN_WAVE(device->physical_device->rad_info.chip_class == GFX8 ? 2 : 0) |
      S_030960_EN_INST_OPT_BASIC(device->physical_device->rad_info.chip_class >= GFX9) |
      S_030960_EN_INST_OPT_ADV(device->physical_device->rad_info.chip_class >= GFX9);

   return ia_multi_vgt_param;
}

static void
1488
radv_pipeline_init_input_assembly_state(struct radv_pipeline *pipeline,
1489
const VkGraphicsPipelineCreateInfo *pCreateInfo,
1490
const struct radv_graphics_pipeline_create_info *extra)
1491
{
1492
const VkPipelineInputAssemblyStateCreateInfo *ia_state = pCreateInfo->pInputAssemblyState;
1493
struct radv_shader_variant *tes = pipeline->shaders[MESA_SHADER_TESS_EVAL];
1494
struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
1495
1496
pipeline->graphics.can_use_guardband = radv_prim_can_use_guardband(ia_state->topology);
1497
1498
if (radv_pipeline_has_gs(pipeline)) {
1499
if (si_conv_gl_prim_to_gs_out(gs->info.gs.output_prim) == V_028A6C_TRISTRIP)
1500
pipeline->graphics.can_use_guardband = true;
1501
} else if (radv_pipeline_has_tess(pipeline)) {
1502
if (!tes->info.tes.point_mode &&
1503
si_conv_gl_prim_to_gs_out(tes->info.tes.primitive_mode) == V_028A6C_TRISTRIP)
1504
pipeline->graphics.can_use_guardband = true;
1505
}
1506
1507
if (extra && extra->use_rectlist) {
1508
pipeline->graphics.can_use_guardband = true;
1509
}
1510
1511
pipeline->graphics.ia_multi_vgt_param = radv_compute_ia_multi_vgt_param_helpers(pipeline);
1512
}
1513
1514
static void
1515
radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline,
1516
const VkGraphicsPipelineCreateInfo *pCreateInfo,
1517
const struct radv_graphics_pipeline_create_info *extra)
1518
{
1519
uint64_t needed_states = radv_pipeline_needed_dynamic_state(pCreateInfo);
1520
uint64_t states = needed_states;
1521
RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
1522
struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
1523
1524
pipeline->dynamic_state = default_dynamic_state;
1525
pipeline->graphics.needed_dynamic_state = needed_states;
1526
1527
if (pCreateInfo->pDynamicState) {
1528
/* Remove all of the states that are marked as dynamic */
1529
uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
1530
for (uint32_t s = 0; s < count; s++)
1531
states &= ~radv_dynamic_state_mask(pCreateInfo->pDynamicState->pDynamicStates[s]);
1532
}
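/* After this loop, "needed_states" still holds every dynamic state that is
 * relevant for this pipeline, while "states" only keeps the subset that was
 * *not* declared dynamic by the application, i.e. the states whose values
 * get baked below from pCreateInfo.
 */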
1533
1534
struct radv_dynamic_state *dynamic = &pipeline->dynamic_state;
1535
1536
if (needed_states & RADV_DYNAMIC_VIEWPORT) {
1537
assert(pCreateInfo->pViewportState);
1538
1539
dynamic->viewport.count = pCreateInfo->pViewportState->viewportCount;
1540
if (states & RADV_DYNAMIC_VIEWPORT) {
1541
typed_memcpy(dynamic->viewport.viewports, pCreateInfo->pViewportState->pViewports,
1542
pCreateInfo->pViewportState->viewportCount);
1543
}
1544
}
1545
1546
if (needed_states & RADV_DYNAMIC_SCISSOR) {
1547
dynamic->scissor.count = pCreateInfo->pViewportState->scissorCount;
1548
if (states & RADV_DYNAMIC_SCISSOR) {
1549
typed_memcpy(dynamic->scissor.scissors, pCreateInfo->pViewportState->pScissors,
1550
pCreateInfo->pViewportState->scissorCount);
1551
}
1552
}
1553
1554
if (states & RADV_DYNAMIC_LINE_WIDTH) {
1555
assert(pCreateInfo->pRasterizationState);
1556
dynamic->line_width = pCreateInfo->pRasterizationState->lineWidth;
1557
}
1558
1559
if (states & RADV_DYNAMIC_DEPTH_BIAS) {
1560
assert(pCreateInfo->pRasterizationState);
1561
dynamic->depth_bias.bias = pCreateInfo->pRasterizationState->depthBiasConstantFactor;
1562
dynamic->depth_bias.clamp = pCreateInfo->pRasterizationState->depthBiasClamp;
1563
dynamic->depth_bias.slope = pCreateInfo->pRasterizationState->depthBiasSlopeFactor;
1564
}
1565
1566
/* Section 9.2 of the Vulkan 1.0.15 spec says:
1567
*
1568
* pColorBlendState is [...] NULL if the pipeline has rasterization
1569
* disabled or if the subpass of the render pass the pipeline is
1570
* created against does not use any color attachments.
1571
*/
1572
if (states & RADV_DYNAMIC_BLEND_CONSTANTS) {
1573
assert(pCreateInfo->pColorBlendState);
1574
typed_memcpy(dynamic->blend_constants, pCreateInfo->pColorBlendState->blendConstants, 4);
1575
}
1576
1577
if (states & RADV_DYNAMIC_CULL_MODE) {
1578
dynamic->cull_mode = pCreateInfo->pRasterizationState->cullMode;
1579
}
1580
1581
if (states & RADV_DYNAMIC_FRONT_FACE) {
1582
dynamic->front_face = pCreateInfo->pRasterizationState->frontFace;
1583
}
1584
1585
if (states & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) {
1586
dynamic->primitive_topology = si_translate_prim(pCreateInfo->pInputAssemblyState->topology);
1587
if (extra && extra->use_rectlist) {
1588
dynamic->primitive_topology = V_008958_DI_PT_RECTLIST;
1589
}
1590
}
1591
1592
/* If there is no depthstencil attachment, then don't read
1593
* pDepthStencilState. The Vulkan spec states that pDepthStencilState may
1594
* be NULL in this case. Even if pDepthStencilState is non-NULL, there is
1595
* no need to override the depthstencil defaults in
1596
* radv_pipeline::dynamic_state when there is no depthstencil attachment.
1597
*
1598
* Section 9.2 of the Vulkan 1.0.15 spec says:
1599
*
1600
* pDepthStencilState is [...] NULL if the pipeline has rasterization
1601
* disabled or if the subpass of the render pass the pipeline is created
1602
* against does not use a depth/stencil attachment.
1603
*/
1604
if (needed_states && subpass->depth_stencil_attachment) {
1605
if (states & RADV_DYNAMIC_DEPTH_BOUNDS) {
1606
dynamic->depth_bounds.min = pCreateInfo->pDepthStencilState->minDepthBounds;
1607
dynamic->depth_bounds.max = pCreateInfo->pDepthStencilState->maxDepthBounds;
1608
}
1609
1610
if (states & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
1611
dynamic->stencil_compare_mask.front = pCreateInfo->pDepthStencilState->front.compareMask;
1612
dynamic->stencil_compare_mask.back = pCreateInfo->pDepthStencilState->back.compareMask;
1613
}
1614
1615
if (states & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
1616
dynamic->stencil_write_mask.front = pCreateInfo->pDepthStencilState->front.writeMask;
1617
dynamic->stencil_write_mask.back = pCreateInfo->pDepthStencilState->back.writeMask;
1618
}
1619
1620
if (states & RADV_DYNAMIC_STENCIL_REFERENCE) {
1621
dynamic->stencil_reference.front = pCreateInfo->pDepthStencilState->front.reference;
1622
dynamic->stencil_reference.back = pCreateInfo->pDepthStencilState->back.reference;
1623
}
1624
1625
if (states & RADV_DYNAMIC_DEPTH_TEST_ENABLE) {
1626
dynamic->depth_test_enable = pCreateInfo->pDepthStencilState->depthTestEnable;
1627
}
1628
1629
if (states & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) {
1630
dynamic->depth_write_enable = pCreateInfo->pDepthStencilState->depthWriteEnable;
1631
}
1632
1633
if (states & RADV_DYNAMIC_DEPTH_COMPARE_OP) {
1634
dynamic->depth_compare_op = pCreateInfo->pDepthStencilState->depthCompareOp;
1635
}
1636
1637
if (states & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
1638
dynamic->depth_bounds_test_enable = pCreateInfo->pDepthStencilState->depthBoundsTestEnable;
1639
}
1640
1641
if (states & RADV_DYNAMIC_STENCIL_TEST_ENABLE) {
1642
dynamic->stencil_test_enable = pCreateInfo->pDepthStencilState->stencilTestEnable;
1643
}
1644
1645
if (states & RADV_DYNAMIC_STENCIL_OP) {
1646
dynamic->stencil_op.front.compare_op = pCreateInfo->pDepthStencilState->front.compareOp;
1647
dynamic->stencil_op.front.fail_op = pCreateInfo->pDepthStencilState->front.failOp;
1648
dynamic->stencil_op.front.pass_op = pCreateInfo->pDepthStencilState->front.passOp;
1649
dynamic->stencil_op.front.depth_fail_op =
1650
pCreateInfo->pDepthStencilState->front.depthFailOp;
1651
1652
dynamic->stencil_op.back.compare_op = pCreateInfo->pDepthStencilState->back.compareOp;
1653
dynamic->stencil_op.back.fail_op = pCreateInfo->pDepthStencilState->back.failOp;
1654
dynamic->stencil_op.back.pass_op = pCreateInfo->pDepthStencilState->back.passOp;
1655
dynamic->stencil_op.back.depth_fail_op = pCreateInfo->pDepthStencilState->back.depthFailOp;
1656
}
1657
}
1658
1659
const VkPipelineDiscardRectangleStateCreateInfoEXT *discard_rectangle_info =
1660
vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT);
1661
if (needed_states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
1662
dynamic->discard_rectangle.count = discard_rectangle_info->discardRectangleCount;
1663
if (states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
1664
typed_memcpy(dynamic->discard_rectangle.rectangles,
1665
discard_rectangle_info->pDiscardRectangles,
1666
discard_rectangle_info->discardRectangleCount);
1667
}
1668
}
1669
1670
if (needed_states & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
1671
const VkPipelineSampleLocationsStateCreateInfoEXT *sample_location_info =
1672
vk_find_struct_const(pCreateInfo->pMultisampleState->pNext,
1673
PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
1674
/* If sampleLocationsEnable is VK_FALSE, the default sample
1675
* locations are used and the values specified in
1676
* sampleLocationsInfo are ignored.
1677
*/
1678
if (sample_location_info->sampleLocationsEnable) {
1679
const VkSampleLocationsInfoEXT *pSampleLocationsInfo =
1680
&sample_location_info->sampleLocationsInfo;
1681
1682
assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
1683
1684
dynamic->sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
1685
dynamic->sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
1686
dynamic->sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
1687
typed_memcpy(&dynamic->sample_location.locations[0],
1688
pSampleLocationsInfo->pSampleLocations,
1689
pSampleLocationsInfo->sampleLocationsCount);
1690
}
1691
}
1692
1693
const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_info = vk_find_struct_const(
1694
pCreateInfo->pRasterizationState->pNext, PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
1695
if (needed_states & RADV_DYNAMIC_LINE_STIPPLE) {
1696
dynamic->line_stipple.factor = rast_line_info->lineStippleFactor;
1697
dynamic->line_stipple.pattern = rast_line_info->lineStipplePattern;
1698
}
1699
1700
if (!(states & RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE))
1701
pipeline->graphics.uses_dynamic_stride = true;
1702
1703
const VkPipelineFragmentShadingRateStateCreateInfoKHR *shading_rate = vk_find_struct_const(
1704
pCreateInfo->pNext, PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR);
1705
if (states & RADV_DYNAMIC_FRAGMENT_SHADING_RATE) {
1706
dynamic->fragment_shading_rate.size = shading_rate->fragmentSize;
1707
for (int i = 0; i < 2; i++)
1708
dynamic->fragment_shading_rate.combiner_ops[i] = shading_rate->combinerOps[i];
1709
}
1710
1711
if (states & RADV_DYNAMIC_DEPTH_BIAS_ENABLE) {
1712
dynamic->depth_bias_enable = pCreateInfo->pRasterizationState->depthBiasEnable;
1713
}
1714
1715
if (states & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE) {
1716
dynamic->primitive_restart_enable =
1717
!!pCreateInfo->pInputAssemblyState->primitiveRestartEnable;
1718
}
1719
1720
if (states & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
1721
dynamic->rasterizer_discard_enable =
1722
pCreateInfo->pRasterizationState->rasterizerDiscardEnable;
1723
}
1724
1725
if (subpass->has_color_att && states & RADV_DYNAMIC_LOGIC_OP) {
1726
if (pCreateInfo->pColorBlendState->logicOpEnable) {
1727
dynamic->logic_op = si_translate_blend_logic_op(pCreateInfo->pColorBlendState->logicOp);
1728
} else {
1729
dynamic->logic_op = V_028808_ROP3_COPY;
1730
}
1731
}
1732
1733
if (states & RADV_DYNAMIC_COLOR_WRITE_ENABLE) {
1734
const VkPipelineColorWriteCreateInfoEXT *color_write_info = vk_find_struct_const(
1735
pCreateInfo->pColorBlendState->pNext, PIPELINE_COLOR_WRITE_CREATE_INFO_EXT);
1736
if (color_write_info) {
1737
dynamic->color_write_enable = 0;
1738
for (uint32_t i = 0; i < color_write_info->attachmentCount; i++) {
1739
dynamic->color_write_enable |=
1740
color_write_info->pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
1741
}
1742
}
1743
}
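/* Each enabled color attachment above contributes one 4-bit nibble to
 * color_write_enable. For example, with three attachments where only 0 and
 * 2 are enabled, the mask is 0xf | (0xf << 8) = 0xf0f.
 */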
1744
1745
pipeline->dynamic_state.mask = states;
1746
}
1747
1748
static void
1749
radv_pipeline_init_raster_state(struct radv_pipeline *pipeline,
1750
const VkGraphicsPipelineCreateInfo *pCreateInfo)
1751
{
1752
const VkPipelineRasterizationStateCreateInfo *raster_info = pCreateInfo->pRasterizationState;
1753
const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_info =
1754
vk_find_struct_const(raster_info->pNext,
1755
PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
1756
bool provoking_vtx_last = false;
1757
1758
if (provoking_vtx_info &&
1759
provoking_vtx_info->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) {
1760
provoking_vtx_last = true;
1761
}
1762
1763
pipeline->graphics.pa_su_sc_mode_cntl =
1764
S_028814_FACE(raster_info->frontFace) |
1765
S_028814_CULL_FRONT(!!(raster_info->cullMode & VK_CULL_MODE_FRONT_BIT)) |
1766
S_028814_CULL_BACK(!!(raster_info->cullMode & VK_CULL_MODE_BACK_BIT)) |
1767
S_028814_POLY_MODE(raster_info->polygonMode != VK_POLYGON_MODE_FILL) |
1768
S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(raster_info->polygonMode)) |
1769
S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(raster_info->polygonMode)) |
1770
S_028814_POLY_OFFSET_FRONT_ENABLE(raster_info->depthBiasEnable ? 1 : 0) |
1771
S_028814_POLY_OFFSET_BACK_ENABLE(raster_info->depthBiasEnable ? 1 : 0) |
1772
S_028814_POLY_OFFSET_PARA_ENABLE(raster_info->depthBiasEnable ? 1 : 0) |
1773
S_028814_PROVOKING_VTX_LAST(provoking_vtx_last);
1774
1775
if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
1776
/* It should also be set if PERPENDICULAR_ENDCAP_ENA is set. */
1777
pipeline->graphics.pa_su_sc_mode_cntl |=
1778
S_028814_KEEP_TOGETHER_ENABLE(raster_info->polygonMode != VK_POLYGON_MODE_FILL);
1779
}
1780
1781
bool depth_clip_disable = raster_info->depthClampEnable;
1782
const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state =
1783
vk_find_struct_const(raster_info->pNext,
1784
PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT);
1785
if (depth_clip_state) {
1786
depth_clip_disable = !depth_clip_state->depthClipEnable;
1787
}
1788
1789
pipeline->graphics.pa_cl_clip_cntl =
1790
S_028810_DX_CLIP_SPACE_DEF(1) | // vulkan uses DX conventions.
1791
S_028810_ZCLIP_NEAR_DISABLE(depth_clip_disable ? 1 : 0) |
1792
S_028810_ZCLIP_FAR_DISABLE(depth_clip_disable ? 1 : 0) |
1793
S_028810_DX_RASTERIZATION_KILL(raster_info->rasterizerDiscardEnable ? 1 : 0) |
1794
S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
1795
1796
pipeline->graphics.uses_conservative_overestimate =
1797
radv_get_conservative_raster_mode(pCreateInfo->pRasterizationState) ==
1798
VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT;
1799
}
1800
1801
static void
1802
radv_pipeline_init_depth_stencil_state(struct radv_pipeline *pipeline,
1803
const VkGraphicsPipelineCreateInfo *pCreateInfo)
1804
{
1805
const VkPipelineDepthStencilStateCreateInfo *ds_info =
1806
radv_pipeline_get_depth_stencil_state(pCreateInfo);
1807
RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
1808
struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
1809
struct radv_render_pass_attachment *attachment = NULL;
1810
uint32_t db_depth_control = 0;
1811
1812
if (subpass->depth_stencil_attachment)
1813
attachment = pass->attachments + subpass->depth_stencil_attachment->attachment;
1814
1815
bool has_depth_attachment = attachment && vk_format_has_depth(attachment->format);
1816
bool has_stencil_attachment = attachment && vk_format_has_stencil(attachment->format);
1817
1818
if (ds_info) {
1819
if (has_depth_attachment) {
1820
db_depth_control = S_028800_Z_ENABLE(ds_info->depthTestEnable ? 1 : 0) |
1821
S_028800_Z_WRITE_ENABLE(ds_info->depthWriteEnable ? 1 : 0) |
1822
S_028800_ZFUNC(ds_info->depthCompareOp) |
1823
S_028800_DEPTH_BOUNDS_ENABLE(ds_info->depthBoundsTestEnable ? 1 : 0);
1824
}
1825
1826
if (has_stencil_attachment && ds_info->stencilTestEnable) {
1827
db_depth_control |= S_028800_STENCIL_ENABLE(1) | S_028800_BACKFACE_ENABLE(1);
1828
db_depth_control |= S_028800_STENCILFUNC(ds_info->front.compareOp);
1829
db_depth_control |= S_028800_STENCILFUNC_BF(ds_info->back.compareOp);
1830
}
1831
}
1832
1833
pipeline->graphics.db_depth_control = db_depth_control;
1834
}
1835
1836
static void
1837
gfx9_get_gs_info(const struct radv_pipeline_key *key, const struct radv_pipeline *pipeline,
1838
nir_shader **nir, struct radv_shader_info *infos, struct gfx9_gs_info *out)
1839
{
1840
struct radv_shader_info *gs_info = &infos[MESA_SHADER_GEOMETRY];
1841
struct radv_es_output_info *es_info;
1842
if (pipeline->device->physical_device->rad_info.chip_class >= GFX9)
1843
es_info = nir[MESA_SHADER_TESS_CTRL] ? &gs_info->tes.es_info : &gs_info->vs.es_info;
1844
else
1845
es_info = nir[MESA_SHADER_TESS_CTRL] ? &infos[MESA_SHADER_TESS_EVAL].tes.es_info
1846
: &infos[MESA_SHADER_VERTEX].vs.es_info;
1847
1848
unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1);
1849
bool uses_adjacency;
1850
switch (key->topology) {
1851
case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
1852
case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
1853
case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
1854
case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
1855
uses_adjacency = true;
1856
break;
1857
default:
1858
uses_adjacency = false;
1859
break;
1860
}
1861
1862
/* All these are in dwords: */
1863
/* We can't allow using the whole LDS, because GS waves compete with
1864
* other shader stages for LDS space. */
1865
const unsigned max_lds_size = 8 * 1024;
1866
const unsigned esgs_itemsize = es_info->esgs_itemsize / 4;
1867
unsigned esgs_lds_size;
1868
1869
/* All these are per subgroup: */
1870
const unsigned max_out_prims = 32 * 1024;
1871
const unsigned max_es_verts = 255;
1872
const unsigned ideal_gs_prims = 64;
1873
unsigned max_gs_prims, gs_prims;
1874
unsigned min_es_verts, es_verts, worst_case_es_verts;
1875
1876
if (uses_adjacency || gs_num_invocations > 1)
1877
max_gs_prims = 127 / gs_num_invocations;
1878
else
1879
max_gs_prims = 255;
1880
1881
/* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
1882
* Make sure we don't go over the maximum value.
1883
*/
1884
if (gs_info->gs.vertices_out > 0) {
1885
max_gs_prims =
1886
MIN2(max_gs_prims, max_out_prims / (gs_info->gs.vertices_out * gs_num_invocations));
1887
}
1888
assert(max_gs_prims > 0);
1889
1890
/* If the primitive has adjacency, halve the number of vertices
1891
* that will be reused in multiple primitives.
1892
*/
1893
min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1);
1894
1895
gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
1896
worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
1897
1898
/* Compute ESGS LDS size based on the worst case number of ES vertices
1899
* needed to create the target number of GS prims per subgroup.
1900
*/
1901
esgs_lds_size = esgs_itemsize * worst_case_es_verts;
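/* Illustrative sizing (hypothetical numbers): with a 16-dword ESGS item
 * (64 bytes of ES outputs), triangle input (min_es_verts = 3) and
 * gs_prims = 64, worst_case_es_verts = MIN2(3 * 64, 255) = 192 and
 * esgs_lds_size = 16 * 192 = 3072 dwords, well under the 8K-dword cap,
 * so the rebalancing below is not needed.
 */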
1902
1903
/* If total LDS usage is too big, refactor partitions based on ratio
1904
* of ESGS item sizes.
1905
*/
1906
if (esgs_lds_size > max_lds_size) {
1907
/* Our target GS Prims Per Subgroup was too large. Calculate
1908
* the maximum number of GS Prims Per Subgroup that will fit
1909
* into LDS, capped by the maximum that the hardware can support.
1910
*/
1911
gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims);
1912
assert(gs_prims > 0);
1913
worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
1914
1915
esgs_lds_size = esgs_itemsize * worst_case_es_verts;
1916
assert(esgs_lds_size <= max_lds_size);
1917
}
1918
1919
/* Now calculate remaining ESGS information. */
1920
if (esgs_lds_size)
1921
es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
1922
else
1923
es_verts = max_es_verts;
1924
1925
/* Vertices for adjacency primitives are not always reused, so restore
1926
* the full vertex count for ES_VERTS_PER_SUBGRP.
1927
*/
1928
min_es_verts = gs_info->gs.vertices_in;
1929
1930
/* For normal primitives, the VGT only checks if they are past the ES
1931
* verts per subgroup after allocating a full GS primitive and if they
1932
* are, kicks off a new subgroup. But if those additional ES verts are
1933
* unique (e.g. not reused) we need to make sure there is enough LDS
1934
* space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
1935
*/
1936
es_verts -= min_es_verts - 1;
1937
1938
uint32_t es_verts_per_subgroup = es_verts;
1939
uint32_t gs_prims_per_subgroup = gs_prims;
1940
uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
1941
uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out;
1942
out->lds_size = align(esgs_lds_size, 128) / 128;
1943
out->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) |
1944
S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) |
1945
S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup);
1946
out->vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup);
1947
out->vgt_esgs_ring_itemsize = esgs_itemsize;
1948
assert(max_prims_per_subgroup <= max_out_prims);
1949
}
1950
1951
static void
1952
clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim,
1953
bool use_adjacency)
1954
{
1955
unsigned max_reuse = max_esverts - min_verts_per_prim;
1956
if (use_adjacency)
1957
max_reuse /= 2;
1958
*max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
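/* Illustrative example (hypothetical numbers): with max_esverts = 96 and
 * min_verts_per_prim = 3 (triangles, no adjacency), max_reuse = 93, so at
 * most 1 + 93 = 94 primitives fit: the first one consumes 3 vertices and
 * each additional one needs at least one new vertex even with full reuse.
 */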
1959
}
1960
1961
static unsigned
1962
radv_get_num_input_vertices(nir_shader **nir)
1963
{
1964
if (nir[MESA_SHADER_GEOMETRY]) {
1965
nir_shader *gs = nir[MESA_SHADER_GEOMETRY];
1966
1967
return gs->info.gs.vertices_in;
1968
}
1969
1970
if (nir[MESA_SHADER_TESS_CTRL]) {
1971
nir_shader *tes = nir[MESA_SHADER_TESS_EVAL];
1972
1973
if (tes->info.tess.point_mode)
1974
return 1;
1975
if (tes->info.tess.primitive_mode == GL_ISOLINES)
1976
return 2;
1977
return 3;
1978
}
1979
1980
return 3;
1981
}
1982
1983
static void
1984
gfx10_get_ngg_info(const struct radv_pipeline_key *key, struct radv_pipeline *pipeline,
1985
nir_shader **nir, struct radv_shader_info *infos, struct gfx10_ngg_info *ngg)
1986
{
1987
struct radv_shader_info *gs_info = &infos[MESA_SHADER_GEOMETRY];
1988
struct radv_es_output_info *es_info =
1989
nir[MESA_SHADER_TESS_CTRL] ? &gs_info->tes.es_info : &gs_info->vs.es_info;
1990
unsigned gs_type = nir[MESA_SHADER_GEOMETRY] ? MESA_SHADER_GEOMETRY : MESA_SHADER_VERTEX;
1991
unsigned max_verts_per_prim = radv_get_num_input_vertices(nir);
1992
unsigned min_verts_per_prim = gs_type == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
1993
unsigned gs_num_invocations = nir[MESA_SHADER_GEOMETRY] ? MAX2(gs_info->gs.invocations, 1) : 1;
1994
bool uses_adjacency;
1995
switch (key->topology) {
1996
case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
1997
case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
1998
case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
1999
case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
2000
uses_adjacency = true;
2001
break;
2002
default:
2003
uses_adjacency = false;
2004
break;
2005
}
2006
2007
/* All these are in dwords: */
2008
/* We can't allow using the whole LDS, because GS waves compete with
2009
* other shader stages for LDS space.
2010
*
2011
* TODO: We should really take the shader's internal LDS use into
2012
* account. The linker will fail if the size is greater than
2013
* 8K dwords.
2014
*/
2015
const unsigned max_lds_size = 8 * 1024 - 768;
2016
const unsigned target_lds_size = max_lds_size;
2017
unsigned esvert_lds_size = 0;
2018
unsigned gsprim_lds_size = 0;
2019
2020
/* All these are per subgroup: */
2021
const unsigned min_esverts =
2022
pipeline->device->physical_device->rad_info.chip_class >= GFX10_3 ? 29 : 24;
2023
bool max_vert_out_per_gs_instance = false;
2024
unsigned max_esverts_base = 256;
2025
unsigned max_gsprims_base = 128; /* default prim group size clamp */
2026
2027
/* Hardware has the following non-natural restrictions on the value
2028
* of GE_CNTL.VERT_GRP_SIZE based on the primitive type of
2029
* the draw:
2030
* - at most 252 for any line input primitive type
2031
* - at most 251 for any quad input primitive type
2032
* - at most 251 for triangle strips with adjacency (this happens to
2033
* be the natural limit for triangle *lists* with adjacency)
2034
*/
2035
max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
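/* For example, a GS that consumes lines (max_verts_per_prim = 2) clamps
 * this to MIN2(256, 251 + 2 - 1) = 252, matching the line restriction
 * listed above.
 */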
2036
2037
if (gs_type == MESA_SHADER_GEOMETRY) {
2038
unsigned max_out_verts_per_gsprim = gs_info->gs.vertices_out * gs_num_invocations;
2039
2040
if (max_out_verts_per_gsprim <= 256) {
2041
if (max_out_verts_per_gsprim) {
2042
max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
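/* E.g. vertices_out = 4 with a single invocation gives 256 / 4 = 64,
 * so max_gsprims_base drops from 128 to 64.
 */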
2043
}
2044
} else {
2045
/* Use special multi-cycling mode in which each GS
2046
* instance gets its own subgroup. Does not work with
2047
* tessellation. */
2048
max_vert_out_per_gs_instance = true;
2049
max_gsprims_base = 1;
2050
max_out_verts_per_gsprim = gs_info->gs.vertices_out;
2051
}
2052
2053
esvert_lds_size = es_info->esgs_itemsize / 4;
2054
gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
2055
} else {
2056
/* VS and TES. */
2057
/* LDS size for passing data from GS to ES. */
2058
struct radv_streamout_info *so_info = nir[MESA_SHADER_TESS_CTRL]
2059
? &infos[MESA_SHADER_TESS_EVAL].so
2060
: &infos[MESA_SHADER_VERTEX].so;
2061
2062
if (so_info->num_outputs)
2063
esvert_lds_size = 4 * so_info->num_outputs + 1;
2064
2065
/* GS stores Primitive IDs (one DWORD) into LDS at the address
2066
* corresponding to the ES thread of the provoking vertex. All
2067
* ES threads load and export PrimitiveID for their thread.
2068
*/
2069
if (!nir[MESA_SHADER_TESS_CTRL] && infos[MESA_SHADER_VERTEX].vs.outinfo.export_prim_id)
2070
esvert_lds_size = MAX2(esvert_lds_size, 1);
2071
}
2072
2073
unsigned max_gsprims = max_gsprims_base;
2074
unsigned max_esverts = max_esverts_base;
2075
2076
if (esvert_lds_size)
2077
max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
2078
if (gsprim_lds_size)
2079
max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
2080
2081
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2082
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
2083
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2084
2085
if (esvert_lds_size || gsprim_lds_size) {
2086
/* Now that we have a rough proportionality between esverts
2087
* and gsprims based on the primitive type, scale both of them
2088
* down simultaneously based on required LDS space.
2089
*
2090
* We could be smarter about this if we knew how much vertex
2091
* reuse to expect.
2092
*/
2093
unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
2094
if (lds_total > target_lds_size) {
2095
max_esverts = max_esverts * target_lds_size / lds_total;
2096
max_gsprims = max_gsprims * target_lds_size / lds_total;
2097
2098
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2099
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
2100
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2101
}
2102
}
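/* Illustrative rebalance (hypothetical sizes): with esvert_lds_size = 16,
 * gsprim_lds_size = 40, max_esverts = 256 and max_gsprims = 128,
 * lds_total = 4096 + 5120 = 9216 > 7424, so both limits are scaled by
 * 7424 / 9216, giving max_esverts = 206 and max_gsprims = 103.
 */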
2103
2104
/* Round up towards full wave sizes for better ALU utilization. */
2105
if (!max_vert_out_per_gs_instance) {
2106
unsigned orig_max_esverts;
2107
unsigned orig_max_gsprims;
2108
unsigned wavesize;
2109
2110
if (gs_type == MESA_SHADER_GEOMETRY) {
2111
wavesize = gs_info->wave_size;
2112
} else {
2113
wavesize = nir[MESA_SHADER_TESS_CTRL] ? infos[MESA_SHADER_TESS_EVAL].wave_size
2114
: infos[MESA_SHADER_VERTEX].wave_size;
2115
}
2116
2117
do {
2118
orig_max_esverts = max_esverts;
2119
orig_max_gsprims = max_gsprims;
2120
2121
max_esverts = align(max_esverts, wavesize);
2122
max_esverts = MIN2(max_esverts, max_esverts_base);
2123
if (esvert_lds_size)
2124
max_esverts =
2125
MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
2126
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2127
2128
/* Hardware restriction: minimum value of max_esverts */
2129
if (pipeline->device->physical_device->rad_info.chip_class == GFX10)
2130
max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
2131
else
2132
max_esverts = MAX2(max_esverts, min_esverts);
2133
2134
max_gsprims = align(max_gsprims, wavesize);
2135
max_gsprims = MIN2(max_gsprims, max_gsprims_base);
2136
if (gsprim_lds_size) {
2137
/* Don't count unusable vertices to the LDS
2138
* size. Those are vertices above the maximum
2139
* number of vertices that can occur in the
2140
* workgroup, which is e.g. max_gsprims * 3
2141
* for triangles.
2142
*/
2143
unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2144
max_gsprims = MIN2(max_gsprims,
2145
(max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
2146
}
2147
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
2148
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2149
} while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
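/* The loop above iterates to a fixed point: aligning one limit up to the
 * wave size can violate an LDS or vertex-count clamp that depends on the
 * other limit, so both are recomputed until neither changes.
 */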
2150
2151
/* Verify the restriction. */
2152
if (pipeline->device->physical_device->rad_info.chip_class == GFX10)
2153
assert(max_esverts >= min_esverts - 1 + max_verts_per_prim);
2154
else
2155
assert(max_esverts >= min_esverts);
2156
} else {
2157
/* Hardware restriction: minimum value of max_esverts */
2158
if (pipeline->device->physical_device->rad_info.chip_class == GFX10)
2159
max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
2160
else
2161
max_esverts = MAX2(max_esverts, min_esverts);
2162
}
2163
2164
unsigned max_out_vertices = max_vert_out_per_gs_instance ? gs_info->gs.vertices_out
2165
: gs_type == MESA_SHADER_GEOMETRY
2166
? max_gsprims * gs_num_invocations * gs_info->gs.vertices_out
2167
: max_esverts;
2168
assert(max_out_vertices <= 256);
2169
2170
unsigned prim_amp_factor = 1;
2171
if (gs_type == MESA_SHADER_GEOMETRY) {
2172
/* Number of output primitives per GS input primitive after
2173
* GS instancing. */
2174
prim_amp_factor = gs_info->gs.vertices_out;
2175
}
2176
2177
/* On Gfx10, the GE only checks against the maximum number of ES verts
2178
* after allocating a full GS primitive. So we need to ensure that
2179
* whenever this check passes, there is enough space for a full
2180
* primitive without vertex reuse.
2181
*/
2182
if (pipeline->device->physical_device->rad_info.chip_class == GFX10)
2183
ngg->hw_max_esverts = max_esverts - max_verts_per_prim + 1;
2184
else
2185
ngg->hw_max_esverts = max_esverts;
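/* E.g. with triangles (max_verts_per_prim = 3) and max_esverts = 206,
 * GFX10 programs hw_max_esverts = 206 - 3 + 1 = 204.
 */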
2186
2187
ngg->max_gsprims = max_gsprims;
2188
ngg->max_out_verts = max_out_vertices;
2189
ngg->prim_amp_factor = prim_amp_factor;
2190
ngg->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
2191
ngg->ngg_emit_size = max_gsprims * gsprim_lds_size;
2192
ngg->enable_vertex_grouping = false;
2193
2194
/* Don't count unusable vertices. */
2195
ngg->esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) * esvert_lds_size * 4;
2196
2197
if (gs_type == MESA_SHADER_GEOMETRY) {
2198
ngg->vgt_esgs_ring_itemsize = es_info->esgs_itemsize / 4;
2199
} else {
2200
ngg->vgt_esgs_ring_itemsize = 1;
2201
}
2202
2203
assert(ngg->hw_max_esverts >= min_esverts); /* HW limitation */
2204
}
2205
2206
static void
2207
radv_pipeline_init_gs_ring_state(struct radv_pipeline *pipeline, const struct gfx9_gs_info *gs)
2208
{
2209
struct radv_device *device = pipeline->device;
2210
unsigned num_se = device->physical_device->rad_info.max_se;
2211
unsigned wave_size = 64;
2212
unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
2213
/* On GFX6-GFX7, the value comes from VGT_GS_VERTEX_REUSE = 16.
2214
* On GFX8+, the value comes from VGT_VERTEX_REUSE_BLOCK_CNTL = 30 (+2).
2215
*/
2216
unsigned gs_vertex_reuse =
2217
(device->physical_device->rad_info.chip_class >= GFX8 ? 32 : 16) * num_se;
2218
unsigned alignment = 256 * num_se;
2219
/* The maximum size is 63.999 MB per SE. */
2220
unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
2221
struct radv_shader_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info;
2222
2223
/* Calculate the minimum size. */
2224
unsigned min_esgs_ring_size =
2225
align(gs->vgt_esgs_ring_itemsize * 4 * gs_vertex_reuse * wave_size, alignment);
2226
/* These are recommended sizes, not minimum sizes. */
2227
unsigned esgs_ring_size =
2228
max_gs_waves * 2 * wave_size * gs->vgt_esgs_ring_itemsize * 4 * gs_info->gs.vertices_in;
2229
unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gs_info->gs.max_gsvs_emit_size;
2230
2231
min_esgs_ring_size = align(min_esgs_ring_size, alignment);
2232
esgs_ring_size = align(esgs_ring_size, alignment);
2233
gsvs_ring_size = align(gsvs_ring_size, alignment);
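/* Only GFX6-GFX8 need an off-chip ESGS ring; on GFX9+ the merged ES/GS
 * stage passes ES outputs through LDS, so only the GSVS ring size is
 * computed for those chips.
 */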
2234
2235
if (pipeline->device->physical_device->rad_info.chip_class <= GFX8)
2236
pipeline->graphics.esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
2237
2238
pipeline->graphics.gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
2239
}
2240
2241
struct radv_shader_variant *
2242
radv_get_shader(const struct radv_pipeline *pipeline, gl_shader_stage stage)
2243
{
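/* On GFX9+ some stages are merged, so the requested variant may live in
 * another slot: a vertex shader can be compiled into the TESS_CTRL (LS+HS)
 * or GEOMETRY (ES+GS) variant, and a tess eval shader into the GEOMETRY
 * variant. Fall through to those slots when the direct one is empty.
 */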
2244
if (stage == MESA_SHADER_VERTEX) {
2245
if (pipeline->shaders[MESA_SHADER_VERTEX])
2246
return pipeline->shaders[MESA_SHADER_VERTEX];
2247
if (pipeline->shaders[MESA_SHADER_TESS_CTRL])
2248
return pipeline->shaders[MESA_SHADER_TESS_CTRL];
2249
if (pipeline->shaders[MESA_SHADER_GEOMETRY])
2250
return pipeline->shaders[MESA_SHADER_GEOMETRY];
2251
} else if (stage == MESA_SHADER_TESS_EVAL) {
2252
if (!radv_pipeline_has_tess(pipeline))
2253
return NULL;
2254
if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
2255
return pipeline->shaders[MESA_SHADER_TESS_EVAL];
2256
if (pipeline->shaders[MESA_SHADER_GEOMETRY])
2257
return pipeline->shaders[MESA_SHADER_GEOMETRY];
2258
}
2259
return pipeline->shaders[stage];
2260
}
2261
2262
static const struct radv_vs_output_info *
2263
get_vs_output_info(const struct radv_pipeline *pipeline)
2264
{
2265
if (radv_pipeline_has_gs(pipeline))
2266
if (radv_pipeline_has_ngg(pipeline))
2267
return &pipeline->shaders[MESA_SHADER_GEOMETRY]->info.vs.outinfo;
2268
else
2269
return &pipeline->gs_copy_shader->info.vs.outinfo;
2270
else if (radv_pipeline_has_tess(pipeline))
2271
return &pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.outinfo;
2272
else
2273
return &pipeline->shaders[MESA_SHADER_VERTEX]->info.vs.outinfo;
2274
}
2275
2276
static void
2277
radv_link_shaders(struct radv_pipeline *pipeline, nir_shader **shaders,
2278
bool optimize_conservatively)
2279
{
2280
nir_shader *ordered_shaders[MESA_SHADER_STAGES];
2281
int shader_count = 0;
2282
2283
if (shaders[MESA_SHADER_FRAGMENT]) {
2284
ordered_shaders[shader_count++] = shaders[MESA_SHADER_FRAGMENT];
2285
}
2286
if (shaders[MESA_SHADER_GEOMETRY]) {
2287
ordered_shaders[shader_count++] = shaders[MESA_SHADER_GEOMETRY];
2288
}
2289
if (shaders[MESA_SHADER_TESS_EVAL]) {
2290
ordered_shaders[shader_count++] = shaders[MESA_SHADER_TESS_EVAL];
2291
}
2292
if (shaders[MESA_SHADER_TESS_CTRL]) {
2293
ordered_shaders[shader_count++] = shaders[MESA_SHADER_TESS_CTRL];
2294
}
2295
if (shaders[MESA_SHADER_VERTEX]) {
2296
ordered_shaders[shader_count++] = shaders[MESA_SHADER_VERTEX];
2297
}
2298
if (shaders[MESA_SHADER_COMPUTE]) {
2299
ordered_shaders[shader_count++] = shaders[MESA_SHADER_COMPUTE];
2300
}
2301
2302
bool has_geom_tess = shaders[MESA_SHADER_GEOMETRY] || shaders[MESA_SHADER_TESS_CTRL];
2303
bool merged_gs = shaders[MESA_SHADER_GEOMETRY] &&
2304
pipeline->device->physical_device->rad_info.chip_class >= GFX9;
2305
2306
if (!optimize_conservatively && shader_count > 1) {
2307
unsigned first = ordered_shaders[shader_count - 1]->info.stage;
2308
unsigned last = ordered_shaders[0]->info.stage;
2309
2310
if (ordered_shaders[0]->info.stage == MESA_SHADER_FRAGMENT &&
2311
ordered_shaders[1]->info.has_transform_feedback_varyings)
2312
nir_link_xfb_varyings(ordered_shaders[1], ordered_shaders[0]);
2313
2314
for (int i = 1; i < shader_count; ++i) {
2315
nir_lower_io_arrays_to_elements(ordered_shaders[i], ordered_shaders[i - 1]);
2316
}
2317
2318
for (int i = 0; i < shader_count; ++i) {
2319
nir_variable_mode mask = 0;
2320
2321
if (ordered_shaders[i]->info.stage != first)
2322
mask = mask | nir_var_shader_in;
2323
2324
if (ordered_shaders[i]->info.stage != last)
2325
mask = mask | nir_var_shader_out;
2326
2327
if (nir_lower_io_to_scalar_early(ordered_shaders[i], mask)) {
2328
/* Optimize the new vector code and then remove dead vars */
2329
nir_copy_prop(ordered_shaders[i]);
2330
nir_opt_shrink_vectors(ordered_shaders[i],
2331
!pipeline->device->instance->disable_shrink_image_store);
2332
2333
if (ordered_shaders[i]->info.stage != last) {
2334
/* Optimize swizzled movs of load_const for
2335
* nir_link_opt_varyings's constant propagation
2336
*/
2337
nir_opt_constant_folding(ordered_shaders[i]);
2338
/* For nir_link_opt_varyings's duplicate input opt */
2339
nir_opt_cse(ordered_shaders[i]);
2340
}
2341
2342
/* Run copy-propagation to help remove dead
2343
* output variables (some shaders have useless
2344
* copies to/from an output), so compaction
2345
* later will be more effective.
2346
*
2347
* This will have been done earlier but it might
2348
* not have worked because the outputs were vector.
2349
*/
2350
if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL)
2351
nir_opt_copy_prop_vars(ordered_shaders[i]);
2352
2353
nir_opt_dce(ordered_shaders[i]);
2354
nir_remove_dead_variables(
2355
ordered_shaders[i], nir_var_function_temp | nir_var_shader_in | nir_var_shader_out,
2356
NULL);
2357
}
2358
}
2359
}
2360
2361
for (int i = 1; !optimize_conservatively && (i < shader_count); ++i) {
2362
if (nir_link_opt_varyings(ordered_shaders[i], ordered_shaders[i - 1])) {
2363
nir_opt_constant_folding(ordered_shaders[i - 1]);
2364
nir_opt_algebraic(ordered_shaders[i - 1]);
2365
nir_opt_dce(ordered_shaders[i - 1]);
2366
}
2367
2368
nir_remove_dead_variables(ordered_shaders[i], nir_var_shader_out, NULL);
2369
nir_remove_dead_variables(ordered_shaders[i - 1], nir_var_shader_in, NULL);
2370
2371
bool progress = nir_remove_unused_varyings(ordered_shaders[i], ordered_shaders[i - 1]);
2372
2373
nir_compact_varyings(ordered_shaders[i], ordered_shaders[i - 1], true);
2374
2375
if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL ||
2376
(ordered_shaders[i]->info.stage == MESA_SHADER_VERTEX && has_geom_tess) ||
2377
(ordered_shaders[i]->info.stage == MESA_SHADER_TESS_EVAL && merged_gs)) {
2378
nir_lower_io_to_vector(ordered_shaders[i], nir_var_shader_out);
2379
if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL)
2380
nir_vectorize_tess_levels(ordered_shaders[i]);
2381
nir_opt_combine_stores(ordered_shaders[i], nir_var_shader_out);
2382
}
2383
if (ordered_shaders[i - 1]->info.stage == MESA_SHADER_GEOMETRY ||
2384
ordered_shaders[i - 1]->info.stage == MESA_SHADER_TESS_CTRL ||
2385
ordered_shaders[i - 1]->info.stage == MESA_SHADER_TESS_EVAL) {
2386
nir_lower_io_to_vector(ordered_shaders[i - 1], nir_var_shader_in);
2387
}
2388
2389
if (progress) {
2390
if (nir_lower_global_vars_to_local(ordered_shaders[i])) {
2391
ac_nir_lower_indirect_derefs(ordered_shaders[i],
2392
pipeline->device->physical_device->rad_info.chip_class);
2393
/* remove dead writes, which can remove input loads */
2394
nir_lower_vars_to_ssa(ordered_shaders[i]);
2395
nir_opt_dce(ordered_shaders[i]);
2396
}
2397
2398
if (nir_lower_global_vars_to_local(ordered_shaders[i - 1])) {
2399
ac_nir_lower_indirect_derefs(ordered_shaders[i - 1],
2400
pipeline->device->physical_device->rad_info.chip_class);
2401
}
2402
}
2403
}
2404
}
2405
2406
static void
2407
radv_set_driver_locations(struct radv_pipeline *pipeline, nir_shader **shaders,
2408
struct radv_shader_info infos[MESA_SHADER_STAGES])
2409
{
2410
if (shaders[MESA_SHADER_FRAGMENT]) {
2411
nir_foreach_shader_out_variable(var, shaders[MESA_SHADER_FRAGMENT])
2412
{
2413
var->data.driver_location = var->data.location + var->data.index;
2414
}
2415
}
2416
2417
if (!shaders[MESA_SHADER_VERTEX])
2418
return;
2419
2420
bool has_tess = shaders[MESA_SHADER_TESS_CTRL];
2421
bool has_gs = shaders[MESA_SHADER_GEOMETRY];
2422
2423
/* Merged stage for VS and TES */
2424
unsigned vs_info_idx = MESA_SHADER_VERTEX;
2425
unsigned tes_info_idx = MESA_SHADER_TESS_EVAL;
2426
2427
if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) {
2428
/* These are merged into the next stage */
2429
vs_info_idx = has_tess ? MESA_SHADER_TESS_CTRL : MESA_SHADER_GEOMETRY;
2430
tes_info_idx = has_gs ? MESA_SHADER_GEOMETRY : MESA_SHADER_TESS_EVAL;
2431
}
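/* E.g. a GFX9+ pipeline with VS+TCS+TES+GS copies the VS linked-output
 * counts into infos[MESA_SHADER_TESS_CTRL] and the TES counts into
 * infos[MESA_SHADER_GEOMETRY] below, matching where those stages end up
 * after merging.
 */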
2432
2433
nir_foreach_shader_in_variable (var, shaders[MESA_SHADER_VERTEX]) {
2434
var->data.driver_location = var->data.location;
2435
}
2436
2437
if (has_tess) {
2438
nir_linked_io_var_info vs2tcs = nir_assign_linked_io_var_locations(
2439
shaders[MESA_SHADER_VERTEX], shaders[MESA_SHADER_TESS_CTRL]);
2440
nir_linked_io_var_info tcs2tes = nir_assign_linked_io_var_locations(
2441
shaders[MESA_SHADER_TESS_CTRL], shaders[MESA_SHADER_TESS_EVAL]);
2442
2443
infos[MESA_SHADER_VERTEX].vs.num_linked_outputs = vs2tcs.num_linked_io_vars;
2444
infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs = vs2tcs.num_linked_io_vars;
2445
infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs = tcs2tes.num_linked_io_vars;
2446
infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs = tcs2tes.num_linked_patch_io_vars;
2447
infos[MESA_SHADER_TESS_EVAL].tes.num_linked_inputs = tcs2tes.num_linked_io_vars;
2448
infos[MESA_SHADER_TESS_EVAL].tes.num_linked_patch_inputs = tcs2tes.num_linked_patch_io_vars;
2449
2450
/* Copy data to merged stage */
2451
infos[vs_info_idx].vs.num_linked_outputs = vs2tcs.num_linked_io_vars;
2452
infos[tes_info_idx].tes.num_linked_inputs = tcs2tes.num_linked_io_vars;
2453
infos[tes_info_idx].tes.num_linked_patch_inputs = tcs2tes.num_linked_patch_io_vars;
2454
2455
if (has_gs) {
2456
nir_linked_io_var_info tes2gs = nir_assign_linked_io_var_locations(
2457
shaders[MESA_SHADER_TESS_EVAL], shaders[MESA_SHADER_GEOMETRY]);
2458
2459
infos[MESA_SHADER_TESS_EVAL].tes.num_linked_outputs = tes2gs.num_linked_io_vars;
2460
infos[MESA_SHADER_GEOMETRY].gs.num_linked_inputs = tes2gs.num_linked_io_vars;
2461
2462
/* Copy data to merged stage */
2463
infos[tes_info_idx].tes.num_linked_outputs = tes2gs.num_linked_io_vars;
2464
}
2465
} else if (has_gs) {
2466
nir_linked_io_var_info vs2gs = nir_assign_linked_io_var_locations(
2467
shaders[MESA_SHADER_VERTEX], shaders[MESA_SHADER_GEOMETRY]);
2468
2469
infos[MESA_SHADER_VERTEX].vs.num_linked_outputs = vs2gs.num_linked_io_vars;
2470
infos[MESA_SHADER_GEOMETRY].gs.num_linked_inputs = vs2gs.num_linked_io_vars;
2471
2472
/* Copy data to merged stage */
2473
infos[vs_info_idx].vs.num_linked_outputs = vs2gs.num_linked_io_vars;
2474
}
2475
2476
assert(pipeline->graphics.last_vgt_api_stage != MESA_SHADER_NONE);
2477
nir_foreach_shader_out_variable(var, shaders[pipeline->graphics.last_vgt_api_stage])
2478
{
2479
var->data.driver_location = var->data.location;
2480
}
2481
}
2482
2483
static uint32_t
2484
radv_get_attrib_stride(const VkPipelineVertexInputStateCreateInfo *input_state,
2485
uint32_t attrib_binding)
2486
{
2487
for (uint32_t i = 0; i < input_state->vertexBindingDescriptionCount; i++) {
2488
const VkVertexInputBindingDescription *input_binding =
2489
&input_state->pVertexBindingDescriptions[i];
2490
2491
if (input_binding->binding == attrib_binding)
2492
return input_binding->stride;
2493
}
2494
2495
return 0;
2496
}
2497
2498
static struct radv_pipeline_key
2499
radv_generate_graphics_pipeline_key(const struct radv_pipeline *pipeline,
2500
const VkGraphicsPipelineCreateInfo *pCreateInfo,
2501
const struct radv_blend_state *blend)
2502
{
2503
RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
2504
struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
2505
const VkPipelineVertexInputStateCreateInfo *input_state = pCreateInfo->pVertexInputState;
2506
const VkPipelineVertexInputDivisorStateCreateInfoEXT *divisor_state =
2507
vk_find_struct_const(input_state->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
2508
bool uses_dynamic_stride = false;
2509
2510
struct radv_pipeline_key key;
2511
memset(&key, 0, sizeof(key));
2512
2513
if (pCreateInfo->flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)
2514
key.optimisations_disabled = 1;
2515
2516
key.has_multiview_view_index = !!subpass->view_mask;
2517
2518
uint32_t binding_input_rate = 0;
2519
uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS];
2520
for (unsigned i = 0; i < input_state->vertexBindingDescriptionCount; ++i) {
2521
if (input_state->pVertexBindingDescriptions[i].inputRate) {
2522
unsigned binding = input_state->pVertexBindingDescriptions[i].binding;
2523
binding_input_rate |= 1u << binding;
2524
instance_rate_divisors[binding] = 1;
2525
}
2526
}
2527
if (divisor_state) {
2528
for (unsigned i = 0; i < divisor_state->vertexBindingDivisorCount; ++i) {
2529
instance_rate_divisors[divisor_state->pVertexBindingDivisors[i].binding] =
2530
divisor_state->pVertexBindingDivisors[i].divisor;
2531
}
2532
}
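/* Above, inputRate is non-zero only for VK_VERTEX_INPUT_RATE_INSTANCE.
 * The divisor defaults to 1 (advance once per instance) and can be
 * overridden per binding, e.g. a divisor of 4 advances the attribute once
 * every 4 instances.
 */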
2533
2534
if (pCreateInfo->pDynamicState) {
2535
uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
2536
for (uint32_t i = 0; i < count; i++) {
2537
if (pCreateInfo->pDynamicState->pDynamicStates[i] ==
2538
VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT) {
2539
uses_dynamic_stride = true;
2540
break;
2541
}
2542
}
2543
}
2544
2545
for (unsigned i = 0; i < input_state->vertexAttributeDescriptionCount; ++i) {
2546
const VkVertexInputAttributeDescription *desc = &input_state->pVertexAttributeDescriptions[i];
2547
const struct util_format_description *format_desc;
2548
unsigned location = desc->location;
2549
unsigned binding = desc->binding;
2550
unsigned num_format, data_format;
2551
int first_non_void;
2552
2553
if (binding_input_rate & (1u << binding)) {
2554
key.instance_rate_inputs |= 1u << location;
2555
key.instance_rate_divisors[location] = instance_rate_divisors[binding];
2556
}
2557
2558
format_desc = vk_format_description(desc->format);
2559
first_non_void = vk_format_get_first_non_void_channel(desc->format);
2560
2561
num_format = radv_translate_buffer_numformat(format_desc, first_non_void);
2562
data_format = radv_translate_buffer_dataformat(format_desc, first_non_void);
2563
2564
key.vertex_attribute_formats[location] = data_format | (num_format << 4);
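/* The hardware buffer data format fits in the low 4 bits; the numeric
 * format (unorm/snorm/uint/...) is packed above it.
 */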
2565
key.vertex_attribute_bindings[location] = desc->binding;
2566
key.vertex_attribute_offsets[location] = desc->offset;
2567
2568
const struct ac_data_format_info *dfmt_info = ac_get_data_format_info(data_format);
2569
unsigned attrib_align =
2570
dfmt_info->chan_byte_size ? dfmt_info->chan_byte_size : dfmt_info->element_size;
2571
2572
/* If desc->offset is misaligned, then the buffer offset must be too. Just
2573
* skip updating vertex_binding_align in this case.
2574
*/
2575
if (desc->offset % attrib_align == 0)
2576
key.vertex_binding_align[desc->binding] =
2577
MAX2(key.vertex_binding_align[desc->binding], attrib_align);
2578
2579
if (!uses_dynamic_stride) {
2580
/* From the Vulkan spec 1.2.157:
2581
*
2582
* "If the bound pipeline state object was created
2583
* with the
2584
* VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT
2585
* dynamic state enabled then pStrides[i] specifies
2586
* the distance in bytes between two consecutive
2587
* elements within the corresponding buffer. In this
2588
* case the VkVertexInputBindingDescription::stride
2589
* state from the pipeline state object is ignored."
2590
*
2591
* Make sure the vertex attribute stride is zero to
2592
* avoid computing a wrong offset if it's initialized
2593
* to something else than zero.
2594
*/
2595
key.vertex_attribute_strides[location] =
2596
radv_get_attrib_stride(input_state, desc->binding);
2597
}
2598
2599
enum ac_fetch_format adjust = AC_FETCH_FORMAT_NONE;
2600
if (pipeline->device->physical_device->rad_info.chip_class <= GFX8 &&
2601
pipeline->device->physical_device->rad_info.family != CHIP_STONEY) {
2602
VkFormat format = input_state->pVertexAttributeDescriptions[i].format;
2603
switch (format) {
2604
case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
2605
case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
2606
adjust = AC_FETCH_FORMAT_SNORM;
2607
break;
2608
case VK_FORMAT_A2R10G10B10_SSCALED_PACK32:
2609
case VK_FORMAT_A2B10G10R10_SSCALED_PACK32:
2610
adjust = AC_FETCH_FORMAT_SSCALED;
2611
break;
2612
case VK_FORMAT_A2R10G10B10_SINT_PACK32:
2613
case VK_FORMAT_A2B10G10R10_SINT_PACK32:
2614
adjust = AC_FETCH_FORMAT_SINT;
2615
break;
2616
default:
2617
break;
2618
}
2619
}
2620
key.vertex_alpha_adjust[location] = adjust;
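/* GFX6-GFX8 (except Stoney) don't apply the sign conversion to the 2-bit
 * alpha channel of these 2_10_10_10 formats in the fetch hardware, so the
 * shader patches the alpha component instead.
 */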
2621
2622
switch (desc->format) {
2623
case VK_FORMAT_B8G8R8A8_UNORM:
2624
case VK_FORMAT_B8G8R8A8_SNORM:
2625
case VK_FORMAT_B8G8R8A8_USCALED:
2626
case VK_FORMAT_B8G8R8A8_SSCALED:
2627
case VK_FORMAT_B8G8R8A8_UINT:
2628
case VK_FORMAT_B8G8R8A8_SINT:
2629
case VK_FORMAT_B8G8R8A8_SRGB:
2630
case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
2631
case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
2632
case VK_FORMAT_A2R10G10B10_USCALED_PACK32:
2633
case VK_FORMAT_A2R10G10B10_SSCALED_PACK32:
2634
case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2635
case VK_FORMAT_A2R10G10B10_SINT_PACK32:
2636
key.vertex_post_shuffle |= 1 << location;
2637
break;
2638
default:
2639
break;
2640
}
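/* These BGRA-style and 2_10_10_10 ARGB formats are fetched as plain RGBA,
 * so the vertex shader swizzles the components back into place
 * ("post shuffle").
 */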
2641
}
2642
2643
const VkPipelineTessellationStateCreateInfo *tess =
2644
radv_pipeline_get_tessellation_state(pCreateInfo);
2645
if (tess)
2646
key.tess_input_vertices = tess->patchControlPoints;
2647
2648
const VkPipelineMultisampleStateCreateInfo *vkms =
2649
radv_pipeline_get_multisample_state(pCreateInfo);
2650
if (vkms && vkms->rasterizationSamples > 1) {
2651
uint32_t num_samples = vkms->rasterizationSamples;
2652
uint32_t ps_iter_samples = radv_pipeline_get_ps_iter_samples(pCreateInfo);
2653
key.num_samples = num_samples;
2654
key.log2_ps_iter_samples = util_logbase2(ps_iter_samples);
2655
}
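/* E.g. with per-sample shading at 4 shading samples,
 * log2_ps_iter_samples = util_logbase2(4) = 2.
 */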
2656
2657
key.col_format = blend->spi_shader_col_format;
2658
if (pipeline->device->physical_device->rad_info.chip_class < GFX8) {
2659
key.is_int8 = blend->col_format_is_int8;
2660
key.is_int10 = blend->col_format_is_int10;
2661
}
2662
2663
if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
2664
key.topology = pCreateInfo->pInputAssemblyState->topology;
2665
2666
const VkPipelineRasterizationStateCreateInfo *raster_info = pCreateInfo->pRasterizationState;
2667
const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_info =
2668
vk_find_struct_const(raster_info->pNext,
2669
PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
2670
if (provoking_vtx_info &&
2671
provoking_vtx_info->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) {
2672
key.provoking_vtx_last = true;
2673
}
2674
}
2675
return key;
2676
}
2677
2678
static bool
2679
radv_nir_stage_uses_xfb(const nir_shader *nir)
2680
{
2681
nir_xfb_info *xfb = nir_gather_xfb_info(nir, NULL);
2682
bool uses_xfb = !!xfb;
2683
2684
ralloc_free(xfb);
2685
return uses_xfb;
2686
}
2687
2688
static void
2689
radv_fill_shader_keys(struct radv_device *device, struct radv_shader_variant_key *keys,
2690
const struct radv_pipeline_key *key, nir_shader **nir)
2691
{
2692
keys[MESA_SHADER_VERTEX].vs.instance_rate_inputs = key->instance_rate_inputs;
2693
keys[MESA_SHADER_VERTEX].vs.post_shuffle = key->vertex_post_shuffle;
2694
for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; ++i) {
2695
keys[MESA_SHADER_VERTEX].vs.instance_rate_divisors[i] = key->instance_rate_divisors[i];
2696
keys[MESA_SHADER_VERTEX].vs.vertex_attribute_formats[i] = key->vertex_attribute_formats[i];
2697
keys[MESA_SHADER_VERTEX].vs.vertex_attribute_bindings[i] = key->vertex_attribute_bindings[i];
2698
keys[MESA_SHADER_VERTEX].vs.vertex_attribute_offsets[i] = key->vertex_attribute_offsets[i];
2699
keys[MESA_SHADER_VERTEX].vs.vertex_attribute_strides[i] = key->vertex_attribute_strides[i];
2700
keys[MESA_SHADER_VERTEX].vs.alpha_adjust[i] = key->vertex_alpha_adjust[i];
2701
}
2702
for (unsigned i = 0; i < MAX_VBS; ++i)
2703
keys[MESA_SHADER_VERTEX].vs.vertex_binding_align[i] = key->vertex_binding_align[i];
2704
keys[MESA_SHADER_VERTEX].vs.outprim = si_conv_prim_to_gs_out(key->topology);
2705
keys[MESA_SHADER_VERTEX].vs.provoking_vtx_last = key->provoking_vtx_last;
2706
2707
if (nir[MESA_SHADER_TESS_CTRL]) {
2708
keys[MESA_SHADER_VERTEX].vs_common_out.as_ls = true;
2709
keys[MESA_SHADER_TESS_CTRL].tcs.input_vertices = key->tess_input_vertices;
2710
keys[MESA_SHADER_TESS_CTRL].tcs.primitive_mode =
2711
nir[MESA_SHADER_TESS_EVAL]->info.tess.primitive_mode;
2712
}
2713
2714
if (nir[MESA_SHADER_GEOMETRY]) {
2715
if (nir[MESA_SHADER_TESS_CTRL])
2716
keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_es = true;
2717
else
2718
keys[MESA_SHADER_VERTEX].vs_common_out.as_es = true;
2719
}
2720
2721
if (device->physical_device->use_ngg) {
2722
if (nir[MESA_SHADER_TESS_CTRL]) {
2723
keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg = true;
2724
} else {
2725
keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg = true;
2726
}
2727
2728
if (nir[MESA_SHADER_TESS_CTRL] && nir[MESA_SHADER_GEOMETRY] &&
2729
nir[MESA_SHADER_GEOMETRY]->info.gs.invocations *
2730
nir[MESA_SHADER_GEOMETRY]->info.gs.vertices_out >
2731
256) {
2732
/* Fallback to the legacy path if tessellation is
2733
* enabled with extreme geometry because
2734
* EN_MAX_VERT_OUT_PER_GS_INSTANCE doesn't work and it
2735
* might hang.
2736
*/
2737
keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg = false;
2738
}
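/* E.g. 2 GS invocations emitting 160 vertices each gives 320 > 256, so a
 * TES+GS combination like that falls back to the legacy (non-NGG) path.
 */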
2739
2740
gl_shader_stage last_xfb_stage = MESA_SHADER_VERTEX;
2741
2742
for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
2743
if (nir[i])
2744
last_xfb_stage = i;
2745
}
2746
2747
bool uses_xfb = nir[last_xfb_stage] && radv_nir_stage_uses_xfb(nir[last_xfb_stage]);
2748
2749
if (!device->physical_device->use_ngg_streamout && uses_xfb) {
2750
if (nir[MESA_SHADER_TESS_CTRL])
2751
keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg = false;
2752
else
2753
keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg = false;
2754
}
2755
2756
/* Determine if the pipeline is eligible for the NGG passthrough
2757
* mode. It can't be enabled for geometry shaders, for NGG
2758
* streamout or for vertex shaders that export the primitive ID
2759
* (this is checked later because we don't have the info here.)
2760
*/
2761
if (!nir[MESA_SHADER_GEOMETRY] && !uses_xfb) {
2762
if (nir[MESA_SHADER_TESS_CTRL] && keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg) {
2763
keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg_passthrough = true;
2764
} else if (nir[MESA_SHADER_VERTEX] && keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg) {
2765
keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg_passthrough = true;
2766
}
2767
}
2768
}
2769
2770
for (int i = 0; i < MESA_SHADER_STAGES; ++i)
2771
keys[i].has_multiview_view_index = key->has_multiview_view_index;
2772
2773
keys[MESA_SHADER_FRAGMENT].fs.col_format = key->col_format;
2774
keys[MESA_SHADER_FRAGMENT].fs.is_int8 = key->is_int8;
2775
keys[MESA_SHADER_FRAGMENT].fs.is_int10 = key->is_int10;
2776
keys[MESA_SHADER_FRAGMENT].fs.log2_ps_iter_samples = key->log2_ps_iter_samples;
2777
keys[MESA_SHADER_FRAGMENT].fs.num_samples = key->num_samples;
2778
2779
if (nir[MESA_SHADER_COMPUTE]) {
2780
unsigned subgroup_size = key->compute_subgroup_size;
2781
unsigned req_subgroup_size = subgroup_size;
2782
bool require_full_subgroups = key->require_full_subgroups;
2783
2784
if (!subgroup_size)
2785
subgroup_size = device->physical_device->cs_wave_size;
2786
2787
unsigned local_size = nir[MESA_SHADER_COMPUTE]->info.workgroup_size[0] *
2788
nir[MESA_SHADER_COMPUTE]->info.workgroup_size[1] *
2789
nir[MESA_SHADER_COMPUTE]->info.workgroup_size[2];
2790
2791
/* Games don't always request full subgroups when they should,
2792
* which can cause bugs if cswave32 is enabled.
2793
*/
2794
if (device->physical_device->cs_wave_size == 32 &&
2795
nir[MESA_SHADER_COMPUTE]->info.cs.uses_wide_subgroup_intrinsics && !req_subgroup_size &&
2796
local_size % RADV_SUBGROUP_SIZE == 0)
2797
require_full_subgroups = true;
2798
2799
if (require_full_subgroups && !req_subgroup_size) {
2800
/* don't use wave32 pretending to be wave64 */
2801
subgroup_size = RADV_SUBGROUP_SIZE;
2802
}
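/* E.g. on a cswave32 device, a shader with an 8x8x1 workgroup (64
 * invocations) that uses wide subgroup intrinsics and requests no specific
 * size gets require_full_subgroups set above, so it runs with the full
 * RADV_SUBGROUP_SIZE (wave64) instead of wave32.
 */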
2803
2804
keys[MESA_SHADER_COMPUTE].cs.subgroup_size = subgroup_size;
2805
}
2806
}
2807
2808
static uint8_t
2809
radv_get_wave_size(struct radv_device *device, const VkPipelineShaderStageCreateInfo *pStage,
2810
gl_shader_stage stage, const struct radv_shader_variant_key *key,
2811
const struct radv_shader_info *info)
2812
{
2813
if (stage == MESA_SHADER_GEOMETRY && !info->is_ngg)
2814
return 64;
2815
else if (stage == MESA_SHADER_COMPUTE) {
2816
return key->cs.subgroup_size;
2817
} else if (stage == MESA_SHADER_FRAGMENT)
2818
return device->physical_device->ps_wave_size;
2819
else
2820
return device->physical_device->ge_wave_size;
2821
}
2822
2823
static uint8_t
2824
radv_get_ballot_bit_size(struct radv_device *device, const VkPipelineShaderStageCreateInfo *pStage,
2825
gl_shader_stage stage, const struct radv_shader_variant_key *key)
2826
{
2827
if (stage == MESA_SHADER_COMPUTE && key->cs.subgroup_size)
2828
return key->cs.subgroup_size;
2829
return 64;
2830
}
2831
2832
static void
2833
radv_fill_shader_info(struct radv_pipeline *pipeline,
2834
const VkPipelineShaderStageCreateInfo **pStages,
2835
struct radv_shader_variant_key *keys, struct radv_shader_info *infos,
2836
nir_shader **nir)
2837
{
2838
unsigned active_stages = 0;
2839
unsigned filled_stages = 0;
2840
2841
for (int i = 0; i < MESA_SHADER_STAGES; i++) {
2842
if (nir[i])
2843
active_stages |= (1 << i);
2844
}
2845
2846
if (nir[MESA_SHADER_FRAGMENT]) {
2847
radv_nir_shader_info_init(&infos[MESA_SHADER_FRAGMENT]);
2848
radv_nir_shader_info_pass(pipeline->device, nir[MESA_SHADER_FRAGMENT], pipeline->layout,
2849
&keys[MESA_SHADER_FRAGMENT], &infos[MESA_SHADER_FRAGMENT]);
2850
2851
/* TODO: These are no longer used as keys; we should refactor this. */
2852
keys[MESA_SHADER_VERTEX].vs_common_out.export_prim_id =
2853
infos[MESA_SHADER_FRAGMENT].ps.prim_id_input;
2854
keys[MESA_SHADER_VERTEX].vs_common_out.export_layer_id =
2855
infos[MESA_SHADER_FRAGMENT].ps.layer_input;
2856
keys[MESA_SHADER_VERTEX].vs_common_out.export_clip_dists =
2857
!!infos[MESA_SHADER_FRAGMENT].ps.num_input_clips_culls;
2858
keys[MESA_SHADER_VERTEX].vs_common_out.export_viewport_index =
2859
infos[MESA_SHADER_FRAGMENT].ps.viewport_index_input;
2860
keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_prim_id =
2861
infos[MESA_SHADER_FRAGMENT].ps.prim_id_input;
2862
keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_layer_id =
2863
infos[MESA_SHADER_FRAGMENT].ps.layer_input;
2864
keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_clip_dists =
2865
!!infos[MESA_SHADER_FRAGMENT].ps.num_input_clips_culls;
2866
keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_viewport_index =
2867
infos[MESA_SHADER_FRAGMENT].ps.viewport_index_input;
2868
2869
/* NGG passthrough mode can't be enabled for vertex shaders
2870
* that export the primitive ID.
2871
*
2872
* TODO: I should really refactor the keys logic.
2873
*/
2874
if (nir[MESA_SHADER_VERTEX] && keys[MESA_SHADER_VERTEX].vs_common_out.export_prim_id) {
2875
keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg_passthrough = false;
2876
}
2877
2878
filled_stages |= (1 << MESA_SHADER_FRAGMENT);
2879
}
2880
2881
if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 &&
2882
nir[MESA_SHADER_TESS_CTRL]) {
2883
struct nir_shader *combined_nir[] = {nir[MESA_SHADER_VERTEX], nir[MESA_SHADER_TESS_CTRL]};
2884
struct radv_shader_variant_key key = keys[MESA_SHADER_TESS_CTRL];
2885
key.tcs.vs_key = keys[MESA_SHADER_VERTEX].vs;
2886
2887
radv_nir_shader_info_init(&infos[MESA_SHADER_TESS_CTRL]);
2888
2889
for (int i = 0; i < 2; i++) {
2890
radv_nir_shader_info_pass(pipeline->device, combined_nir[i], pipeline->layout, &key,
2891
&infos[MESA_SHADER_TESS_CTRL]);
2892
}
2893
2894
filled_stages |= (1 << MESA_SHADER_VERTEX);
2895
filled_stages |= (1 << MESA_SHADER_TESS_CTRL);
2896
}
2897
2898
if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 &&
2899
nir[MESA_SHADER_GEOMETRY]) {
2900
gl_shader_stage pre_stage =
2901
nir[MESA_SHADER_TESS_EVAL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
2902
struct nir_shader *combined_nir[] = {nir[pre_stage], nir[MESA_SHADER_GEOMETRY]};
2903
2904
radv_nir_shader_info_init(&infos[MESA_SHADER_GEOMETRY]);
2905
2906
for (int i = 0; i < 2; i++) {
2907
radv_nir_shader_info_pass(pipeline->device, combined_nir[i], pipeline->layout,
2908
&keys[pre_stage], &infos[MESA_SHADER_GEOMETRY]);
2909
}
2910
2911
filled_stages |= (1 << pre_stage);
2912
filled_stages |= (1 << MESA_SHADER_GEOMETRY);
2913
}
2914
2915
active_stages ^= filled_stages;
2916
while (active_stages) {
2917
int i = u_bit_scan(&active_stages);
2918
radv_nir_shader_info_init(&infos[i]);
2919
radv_nir_shader_info_pass(pipeline->device, nir[i], pipeline->layout, &keys[i], &infos[i]);
2920
}
2921
2922
for (int i = 0; i < MESA_SHADER_STAGES; i++) {
2923
if (nir[i]) {
2924
infos[i].wave_size = radv_get_wave_size(pipeline->device, pStages[i], i, &keys[i], &infos[i]);
2925
infos[i].ballot_bit_size =
2926
radv_get_ballot_bit_size(pipeline->device, pStages[i], i, &keys[i]);
2927
}
2928
}
2929
}
2930
2931
static void
2932
merge_tess_info(struct shader_info *tes_info, struct shader_info *tcs_info)
2933
{
2934
/* The Vulkan 1.0.38 spec, section 21.1 Tessellator says:
2935
*
2936
* "PointMode. Controls generation of points rather than triangles
2937
* or lines. This functionality defaults to disabled, and is
2938
* enabled if either shader stage includes the execution mode.
2939
*
2940
* and about Triangles, Quads, IsoLines, VertexOrderCw, VertexOrderCcw,
2941
* PointMode, SpacingEqual, SpacingFractionalEven, SpacingFractionalOdd,
2942
* and OutputVertices, it says:
2943
*
2944
* "One mode must be set in at least one of the tessellation
2945
* shader stages."
2946
*
2947
* So, the fields can be set in either the TCS or TES, but they must
2948
* agree if set in both. Our backend looks at TES, so bitwise-or in
2949
* the values from the TCS.
2950
*/
2951
assert(tcs_info->tess.tcs_vertices_out == 0 || tes_info->tess.tcs_vertices_out == 0 ||
2952
tcs_info->tess.tcs_vertices_out == tes_info->tess.tcs_vertices_out);
2953
tes_info->tess.tcs_vertices_out |= tcs_info->tess.tcs_vertices_out;
2954
2955
assert(tcs_info->tess.spacing == TESS_SPACING_UNSPECIFIED ||
2956
tes_info->tess.spacing == TESS_SPACING_UNSPECIFIED ||
2957
tcs_info->tess.spacing == tes_info->tess.spacing);
2958
tes_info->tess.spacing |= tcs_info->tess.spacing;
2959
2960
assert(tcs_info->tess.primitive_mode == 0 || tes_info->tess.primitive_mode == 0 ||
2961
tcs_info->tess.primitive_mode == tes_info->tess.primitive_mode);
2962
tes_info->tess.primitive_mode |= tcs_info->tess.primitive_mode;
2963
tes_info->tess.ccw |= tcs_info->tess.ccw;
2964
tes_info->tess.point_mode |= tcs_info->tess.point_mode;
2965
2966
/* Copy the merged info back to the TCS */
2967
tcs_info->tess.tcs_vertices_out = tes_info->tess.tcs_vertices_out;
2968
tcs_info->tess.spacing = tes_info->tess.spacing;
2969
tcs_info->tess.primitive_mode = tes_info->tess.primitive_mode;
2970
tcs_info->tess.ccw = tes_info->tess.ccw;
2971
tcs_info->tess.point_mode = tes_info->tess.point_mode;
2972
}
2973
2974
static void
2975
gather_tess_info(struct radv_device *device, nir_shader **nir, struct radv_shader_info *infos,
2976
const struct radv_pipeline_key *pipeline_key)
2977
{
2978
merge_tess_info(&nir[MESA_SHADER_TESS_EVAL]->info, &nir[MESA_SHADER_TESS_CTRL]->info);
2979
2980
/* Number of tessellation patches per workgroup processed by the current pipeline. */
2981
unsigned num_patches = get_tcs_num_patches(
2982
pipeline_key->tess_input_vertices, nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out,
2983
infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs,
2984
infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs,
2985
infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs, device->tess_offchip_block_dw_size,
2986
device->physical_device->rad_info.chip_class, device->physical_device->rad_info.family);
2987
2988
/* LDS size used by VS+TCS for storing TCS inputs and outputs. */
2989
unsigned tcs_lds_size = calculate_tess_lds_size(
2990
device->physical_device->rad_info.chip_class, pipeline_key->tess_input_vertices,
2991
nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out,
2992
infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs, num_patches,
2993
infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs,
2994
infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs);
2995
2996
infos[MESA_SHADER_TESS_CTRL].num_tess_patches = num_patches;
2997
infos[MESA_SHADER_TESS_CTRL].tcs.num_lds_blocks = tcs_lds_size;
2998
infos[MESA_SHADER_TESS_CTRL].tcs.tes_reads_tess_factors =
2999
!!(nir[MESA_SHADER_TESS_EVAL]->info.inputs_read &
3000
(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER));
3001
infos[MESA_SHADER_TESS_CTRL].tcs.tes_inputs_read = nir[MESA_SHADER_TESS_EVAL]->info.inputs_read;
3002
infos[MESA_SHADER_TESS_CTRL].tcs.tes_patch_inputs_read =
3003
nir[MESA_SHADER_TESS_EVAL]->info.patch_inputs_read;
3004
3005
infos[MESA_SHADER_TESS_EVAL].num_tess_patches = num_patches;
3006
infos[MESA_SHADER_GEOMETRY].num_tess_patches = num_patches;
3007
3008
if (!radv_use_llvm_for_stage(device, MESA_SHADER_VERTEX)) {
3009
/* When the number of TCS input and output vertices are the same (typically 3):
3010
* - There is an equal amount of LS and HS invocations
3011
* - In case of merged LSHS shaders, the LS and HS halves of the shader
3012
* always process the exact same vertex. We can use this knowledge to optimize them.
3013
*
3014
* We don't set tcs_in_out_eq if the float controls differ because that might
3015
* involve different float modes for the same block and our optimizer
3016
* doesn't handle an instruction dominating another with a different mode.
3017
*/
3018
infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq =
3019
device->physical_device->rad_info.chip_class >= GFX9 &&
3020
pipeline_key->tess_input_vertices ==
3021
nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out &&
3022
nir[MESA_SHADER_VERTEX]->info.float_controls_execution_mode ==
3023
nir[MESA_SHADER_TESS_CTRL]->info.float_controls_execution_mode;
3024
3025
if (infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq)
3026
infos[MESA_SHADER_VERTEX].vs.tcs_temp_only_input_mask =
3027
nir[MESA_SHADER_TESS_CTRL]->info.inputs_read &
3028
nir[MESA_SHADER_VERTEX]->info.outputs_written &
3029
~nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_cross_invocation_inputs_read &
3030
~nir[MESA_SHADER_TESS_CTRL]->info.inputs_read_indirectly &
3031
~nir[MESA_SHADER_VERTEX]->info.outputs_accessed_indirectly;
3032
3033
/* Copy data to TCS so it can be accessed by the backend if they are merged. */
3034
infos[MESA_SHADER_TESS_CTRL].vs.tcs_in_out_eq = infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq;
3035
infos[MESA_SHADER_TESS_CTRL].vs.tcs_temp_only_input_mask =
3036
infos[MESA_SHADER_VERTEX].vs.tcs_temp_only_input_mask;
3037
}
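/* Example (a sketch): with triangle patches, 3 TCS input and 3 TCS output control
 * points and matching float controls, tcs_in_out_eq is set; tcs_temp_only_input_mask
 * then marks VS outputs that only the matching HS invocation reads directly, which
 * the backend can likely keep in registers of the merged LS+HS shader.
 */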
3038
}
3039
3040
static void
radv_init_feedback(const VkPipelineCreationFeedbackCreateInfoEXT *ext)
{
   if (!ext)
      return;

   if (ext->pPipelineCreationFeedback) {
      ext->pPipelineCreationFeedback->flags = 0;
      ext->pPipelineCreationFeedback->duration = 0;
   }

   for (unsigned i = 0; i < ext->pipelineStageCreationFeedbackCount; ++i) {
      ext->pPipelineStageCreationFeedbacks[i].flags = 0;
      ext->pPipelineStageCreationFeedbacks[i].duration = 0;
   }
}

static void
radv_start_feedback(VkPipelineCreationFeedbackEXT *feedback)
{
   if (!feedback)
      return;

   feedback->duration -= radv_get_current_time();
   feedback->flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT;
}

static void
radv_stop_feedback(VkPipelineCreationFeedbackEXT *feedback, bool cache_hit)
{
   if (!feedback)
      return;

   feedback->duration += radv_get_current_time();
   feedback->flags =
      VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT |
      (cache_hit ? VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT : 0);
}
3078
3079
static bool
3080
mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size,
3081
unsigned num_components, nir_intrinsic_instr *low, nir_intrinsic_instr *high,
3082
void *data)
3083
{
3084
if (num_components > 4)
3085
return false;
3086
3087
/* >128 bit loads are split except with SMEM */
3088
if (bit_size * num_components > 128)
3089
return false;
3090
3091
uint32_t align;
3092
if (align_offset)
3093
align = 1 << (ffs(align_offset) - 1);
3094
else
3095
align = align_mul;
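/* e.g. align_mul = 16 with align_offset = 4 gives align = 4: the access is only
 * guaranteed to be 4-byte aligned regardless of the base alignment. */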
3096
3097
switch (low->intrinsic) {
3098
case nir_intrinsic_load_global:
3099
case nir_intrinsic_store_global:
3100
case nir_intrinsic_store_ssbo:
3101
case nir_intrinsic_load_ssbo:
3102
case nir_intrinsic_load_ubo:
3103
case nir_intrinsic_load_push_constant: {
3104
unsigned max_components;
3105
if (align % 4 == 0)
3106
max_components = NIR_MAX_VEC_COMPONENTS;
3107
else if (align % 2 == 0)
3108
max_components = 16u / bit_size;
3109
else
3110
max_components = 8u / bit_size;
3111
return (align % (bit_size / 8u)) == 0 && num_components <= max_components;
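/* e.g. a 4-byte aligned 32-bit SSBO access can vectorize up to vec4 (the 128-bit cap
 * above), while a 2-byte aligned 16-bit access gets 16 / 16 = 1 component, i.e. no
 * vectorization at all. */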
3112
}
3113
case nir_intrinsic_load_deref:
3114
case nir_intrinsic_store_deref:
3115
assert(nir_deref_mode_is(nir_src_as_deref(low->src[0]), nir_var_mem_shared));
3116
FALLTHROUGH;
3117
case nir_intrinsic_load_shared:
3118
case nir_intrinsic_store_shared:
3119
if (bit_size * num_components ==
3120
96) { /* 96 bit loads require 128 bit alignment and are split otherwise */
3121
return align % 16 == 0;
3122
} else if (bit_size == 16 && (align % 4)) {
3123
/* AMD hardware can't do 2-byte aligned f16vec2 loads, but they are useful for ALU
3124
* vectorization, because our vectorizer requires the scalar IR to already contain vectors.
3125
*/
3126
return (align % 2 == 0) && num_components <= 2;
3127
} else {
3128
if (num_components == 3) {
3129
/* AMD hardware can't do 3-component loads except for 96-bit loads, handled above. */
3130
return false;
3131
}
3132
unsigned req = bit_size * num_components;
3133
if (req == 64 || req == 128) /* 64-bit and 128-bit loads can use ds_read2_b{32,64} */
3134
req /= 2u;
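/* e.g. a 2x32-bit shared access (req = 64) maps to ds_read2_b32 and only needs 4-byte
 * alignment; a 128-bit access maps to ds_read2_b64 and needs 8-byte alignment. */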
3135
return align % (req / 8u) == 0;
3136
}
3137
default:
3138
return false;
3139
}
3140
return false;
3141
}
3142
3143
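/* Callback for nir_lower_bit_size: returns the bit size an 8/16-bit ALU instruction
 * should be widened to (0 keeps it as-is). Roughly, 8-bit versions of these ops are
 * always widened to 32 bits, and 16-bit versions are kept only when they are likely
 * to run on the VALU (divergent on GFX8+), since the scalar unit has no 16-bit forms.
 */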
static unsigned
3144
lower_bit_size_callback(const nir_instr *instr, void *_)
3145
{
3146
struct radv_device *device = _;
3147
enum chip_class chip = device->physical_device->rad_info.chip_class;
3148
3149
if (instr->type != nir_instr_type_alu)
3150
return 0;
3151
nir_alu_instr *alu = nir_instr_as_alu(instr);
3152
3153
if (alu->dest.dest.ssa.bit_size & (8 | 16)) {
3154
unsigned bit_size = alu->dest.dest.ssa.bit_size;
3155
switch (alu->op) {
3156
case nir_op_iabs:
3157
case nir_op_bitfield_select:
3158
case nir_op_imul_high:
3159
case nir_op_umul_high:
3160
case nir_op_ineg:
3161
case nir_op_isign:
3162
return 32;
3163
case nir_op_imax:
3164
case nir_op_umax:
3165
case nir_op_imin:
3166
case nir_op_umin:
3167
case nir_op_ishr:
3168
case nir_op_ushr:
3169
case nir_op_ishl:
3170
case nir_op_uadd_sat:
3171
return (bit_size == 8 || !(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32
3172
: 0;
3173
default:
3174
return 0;
3175
}
3176
}
3177
3178
if (nir_src_bit_size(alu->src[0].src) & (8 | 16)) {
3179
unsigned bit_size = nir_src_bit_size(alu->src[0].src);
3180
switch (alu->op) {
3181
case nir_op_bit_count:
3182
case nir_op_find_lsb:
3183
case nir_op_ufind_msb:
3184
case nir_op_i2b1:
3185
return 32;
3186
case nir_op_ilt:
3187
case nir_op_ige:
3188
case nir_op_ieq:
3189
case nir_op_ine:
3190
case nir_op_ult:
3191
case nir_op_uge:
3192
return (bit_size == 8 || !(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32
3193
: 0;
3194
default:
3195
return 0;
3196
}
3197
}
3198
3199
return 0;
3200
}
3201
3202
static bool
opt_vectorize_callback(const nir_instr *instr, void *_)
{
   assert(instr->type == nir_instr_type_alu);
   nir_alu_instr *alu = nir_instr_as_alu(instr);
   unsigned bit_size = alu->dest.dest.ssa.bit_size;
   if (bit_size != 16)
      return false;

   switch (alu->op) {
   case nir_op_fadd:
   case nir_op_fsub:
   case nir_op_fmul:
   case nir_op_fneg:
   case nir_op_fsat:
   case nir_op_fmin:
   case nir_op_fmax:
   case nir_op_iadd:
   case nir_op_isub:
   case nir_op_imul:
   case nir_op_imin:
   case nir_op_imax:
   case nir_op_umin:
   case nir_op_umax:
      return true;
   case nir_op_ishl: /* TODO: in NIR, these have 32bit shift operands */
   case nir_op_ishr: /* while Radeon needs 16bit operands when vectorized */
   case nir_op_ushr:
   default:
      return false;
   }
}
3234
3235
static nir_component_mask_t
non_uniform_access_callback(const nir_src *src, void *_)
{
   if (src->ssa->num_components == 1)
      return 0x1;
   return nir_chase_binding(*src).success ? 0x2 : 0x3;
}
3242
3243
VkResult
3244
radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
3245
struct radv_pipeline_cache *cache, const struct radv_pipeline_key *pipeline_key,
3246
const VkPipelineShaderStageCreateInfo **pStages,
3247
const VkPipelineCreateFlags flags,
3248
VkPipelineCreationFeedbackEXT *pipeline_feedback,
3249
VkPipelineCreationFeedbackEXT **stage_feedbacks)
3250
{
3251
struct vk_shader_module fs_m = {0};
3252
struct vk_shader_module *modules[MESA_SHADER_STAGES] = {
3253
0,
3254
};
3255
nir_shader *nir[MESA_SHADER_STAGES] = {0};
3256
struct radv_shader_binary *binaries[MESA_SHADER_STAGES] = {NULL};
3257
struct radv_shader_variant_key keys[MESA_SHADER_STAGES] = {{{{{0}}}}};
3258
struct radv_shader_info infos[MESA_SHADER_STAGES] = {0};
3259
unsigned char hash[20], gs_copy_hash[20];
3260
bool keep_executable_info =
3261
(flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR) ||
3262
device->keep_shader_info;
3263
bool keep_statistic_info = (flags & VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR) ||
3264
(device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) ||
3265
device->keep_shader_info;
3266
bool disable_optimizations = flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT;
3267
3268
radv_start_feedback(pipeline_feedback);
3269
3270
for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) {
3271
if (pStages[i]) {
3272
modules[i] = vk_shader_module_from_handle(pStages[i]->module);
3273
if (modules[i]->nir)
3274
_mesa_sha1_compute(modules[i]->nir->info.name, strlen(modules[i]->nir->info.name),
3275
modules[i]->sha1);
3276
3277
pipeline->active_stages |= mesa_to_vk_shader_stage(i);
3278
if (i < MESA_SHADER_FRAGMENT)
3279
pipeline->graphics.last_vgt_api_stage = i;
3280
}
3281
}
3282
3283
radv_hash_shaders(hash, pStages, pipeline->layout, pipeline_key,
3284
get_hash_flags(device, keep_statistic_info));
3285
memcpy(gs_copy_hash, hash, 20);
3286
gs_copy_hash[0] ^= 1;
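/* The GS copy shader is cached under a key that differs from the pipeline hash only
 * in its first byte, so it can live next to the main shaders in the same cache. */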
3287
3288
pipeline->pipeline_hash = *(uint64_t *)hash;
3289
3290
bool found_in_application_cache = true;
3291
if (modules[MESA_SHADER_GEOMETRY] && !keep_executable_info) {
3292
struct radv_shader_variant *variants[MESA_SHADER_STAGES] = {0};
3293
radv_create_shader_variants_from_pipeline_cache(device, cache, gs_copy_hash, variants,
3294
&found_in_application_cache);
3295
pipeline->gs_copy_shader = variants[MESA_SHADER_GEOMETRY];
3296
}
3297
3298
if (!keep_executable_info &&
3299
radv_create_shader_variants_from_pipeline_cache(device, cache, hash, pipeline->shaders,
3300
&found_in_application_cache) &&
3301
(!modules[MESA_SHADER_GEOMETRY] || pipeline->gs_copy_shader)) {
3302
radv_stop_feedback(pipeline_feedback, found_in_application_cache);
3303
return VK_SUCCESS;
3304
}
3305
3306
if (flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT) {
3307
radv_stop_feedback(pipeline_feedback, found_in_application_cache);
3308
return VK_PIPELINE_COMPILE_REQUIRED_EXT;
3309
}
3310
3311
if (!modules[MESA_SHADER_FRAGMENT] && !modules[MESA_SHADER_COMPUTE]) {
3312
nir_builder fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL, "noop_fs");
3313
fs_m = vk_shader_module_from_nir(fs_b.shader);
3314
modules[MESA_SHADER_FRAGMENT] = &fs_m;
3315
}
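/* Graphics pipelines created without a fragment shader get a trivial "noop_fs"
 * module here, so the rest of the compile path can assume a PS module exists. */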
3316
3317
for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) {
3318
const VkPipelineShaderStageCreateInfo *stage = pStages[i];
3319
3320
if (!modules[i])
3321
continue;
3322
3323
radv_start_feedback(stage_feedbacks[i]);
3324
3325
nir[i] = radv_shader_compile_to_nir(device, modules[i], stage ? stage->pName : "main", i,
3326
stage ? stage->pSpecializationInfo : NULL, flags,
3327
pipeline->layout, pipeline_key);
3328
3329
/* We don't want to alter the meta shaders' IR directly, so clone it
3330
* first.
3331
*/
3332
if (nir[i]->info.name) {
3333
nir[i] = nir_shader_clone(NULL, nir[i]);
3334
}
3335
3336
radv_stop_feedback(stage_feedbacks[i], false);
3337
}
3338
3339
bool optimize_conservatively = flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT;
3340
3341
radv_link_shaders(pipeline, nir, optimize_conservatively);
3342
radv_set_driver_locations(pipeline, nir, infos);
3343
3344
for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
3345
if (nir[i]) {
3346
radv_start_feedback(stage_feedbacks[i]);
3347
radv_optimize_nir(device, nir[i], optimize_conservatively, false);
3348
3349
/* Gather info again, information such as outputs_read can be out-of-date. */
3350
nir_shader_gather_info(nir[i], nir_shader_get_entrypoint(nir[i]));
3351
radv_lower_io(device, nir[i]);
3352
3353
radv_stop_feedback(stage_feedbacks[i], false);
3354
}
3355
}
3356
3357
infos[MESA_SHADER_VERTEX].vs.as_ls = !!nir[MESA_SHADER_TESS_CTRL];
3358
infos[MESA_SHADER_VERTEX].vs.as_es = !!nir[MESA_SHADER_GEOMETRY] && !nir[MESA_SHADER_TESS_CTRL];
3359
infos[MESA_SHADER_TESS_EVAL].tes.as_es =
3360
!!nir[MESA_SHADER_GEOMETRY] && !!nir[MESA_SHADER_TESS_CTRL];
3361
3362
if (nir[MESA_SHADER_TESS_CTRL]) {
3363
nir_lower_patch_vertices(nir[MESA_SHADER_TESS_EVAL],
3364
nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out, NULL);
3365
gather_tess_info(device, nir, infos, pipeline_key);
3366
}
3367
3368
radv_fill_shader_keys(device, keys, pipeline_key, nir);
3369
radv_fill_shader_info(pipeline, pStages, keys, infos, nir);
3370
3371
bool pipeline_has_ngg = (nir[MESA_SHADER_VERTEX] && keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg) ||
3372
(nir[MESA_SHADER_TESS_EVAL] && keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg);
3373
3374
if (pipeline_has_ngg) {
3375
struct gfx10_ngg_info *ngg_info;
3376
3377
if (nir[MESA_SHADER_GEOMETRY])
3378
ngg_info = &infos[MESA_SHADER_GEOMETRY].ngg_info;
3379
else if (nir[MESA_SHADER_TESS_CTRL])
3380
ngg_info = &infos[MESA_SHADER_TESS_EVAL].ngg_info;
3381
else
3382
ngg_info = &infos[MESA_SHADER_VERTEX].ngg_info;
3383
3384
gfx10_get_ngg_info(pipeline_key, pipeline, nir, infos, ngg_info);
3385
} else if (nir[MESA_SHADER_GEOMETRY]) {
3386
struct gfx9_gs_info *gs_info = &infos[MESA_SHADER_GEOMETRY].gs_ring_info;
3387
3388
gfx9_get_gs_info(pipeline_key, pipeline, nir, infos, gs_info);
3389
}
3390
3391
for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
3392
if (nir[i]) {
3393
radv_start_feedback(stage_feedbacks[i]);
3394
3395
if (!radv_use_llvm_for_stage(device, i)) {
3396
nir_lower_non_uniform_access_options options = {
3397
.types = nir_lower_non_uniform_ubo_access | nir_lower_non_uniform_ssbo_access |
3398
nir_lower_non_uniform_texture_access | nir_lower_non_uniform_image_access,
3399
.callback = &non_uniform_access_callback,
3400
.callback_data = NULL,
3401
};
3402
NIR_PASS_V(nir[i], nir_lower_non_uniform_access, &options);
3403
}
3404
NIR_PASS_V(nir[i], nir_lower_memory_model);
3405
3406
bool lower_to_scalar = false;
3407
3408
nir_load_store_vectorize_options vectorize_opts = {
3409
.modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_push_const |
3410
nir_var_mem_shared | nir_var_mem_global,
3411
.callback = mem_vectorize_callback,
3412
.robust_modes = 0,
3413
};
3414
3415
if (device->robust_buffer_access2) {
3416
vectorize_opts.robust_modes =
3417
nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global | nir_var_mem_push_const;
3418
}
3419
3420
if (nir_opt_load_store_vectorize(nir[i], &vectorize_opts)) {
3421
NIR_PASS_V(nir[i], nir_copy_prop);
3422
lower_to_scalar = true;
3423
3424
/* Gather info again, to update whether 8/16-bit types are used. */
3425
nir_shader_gather_info(nir[i], nir_shader_get_entrypoint(nir[i]));
3426
}
3427
3428
lower_to_scalar |=
3429
nir_opt_shrink_vectors(nir[i], !device->instance->disable_shrink_image_store);
3430
3431
if (lower_to_scalar)
3432
nir_lower_alu_to_scalar(nir[i], NULL, NULL);
3433
3434
/* lower ALU operations */
3435
/* TODO: Some 64-bit tests crash inside LLVM. */
3436
if (!radv_use_llvm_for_stage(device, i))
3437
nir_lower_int64(nir[i]);
3438
3439
/* TODO: Implement nir_op_uadd_sat with LLVM. */
3440
if (!radv_use_llvm_for_stage(device, i))
3441
nir_opt_idiv_const(nir[i], 8);
3442
3443
nir_lower_idiv(nir[i],
3444
&(nir_lower_idiv_options){
3445
.imprecise_32bit_lowering = false,
3446
.allow_fp16 = device->physical_device->rad_info.chip_class >= GFX9,
3447
});
3448
3449
nir_opt_sink(nir[i], nir_move_load_input | nir_move_const_undef | nir_move_copies);
3450
nir_opt_move(nir[i], nir_move_load_input | nir_move_const_undef | nir_move_copies);
3451
3452
/* Lower I/O intrinsics to memory instructions. */
3453
bool io_to_mem = radv_lower_io_to_mem(device, nir[i], &infos[i], pipeline_key);
3454
bool lowered_ngg = pipeline_has_ngg && i == pipeline->graphics.last_vgt_api_stage &&
3455
!radv_use_llvm_for_stage(device, i);
3456
if (lowered_ngg) {
3457
uint64_t ps_inputs_read = nir[MESA_SHADER_FRAGMENT] ? nir[MESA_SHADER_FRAGMENT]->info.inputs_read : 0;
3458
bool consider_culling = radv_consider_culling(device, nir[i], ps_inputs_read);
3459
radv_lower_ngg(device, nir[i], &infos[i], pipeline_key, &keys[i], consider_culling);
3460
}
3461
3462
radv_optimize_nir_algebraic(nir[i], io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE);
3463
3464
if (nir[i]->info.bit_sizes_int & (8 | 16)) {
3465
if (device->physical_device->rad_info.chip_class >= GFX8) {
3466
nir_convert_to_lcssa(nir[i], true, true);
3467
nir_divergence_analysis(nir[i]);
3468
}
3469
3470
if (nir_lower_bit_size(nir[i], lower_bit_size_callback, device)) {
3471
NIR_PASS_V(nir[i], nir_opt_constant_folding);
3472
NIR_PASS_V(nir[i], nir_opt_dce);
3473
}
3474
3475
if (device->physical_device->rad_info.chip_class >= GFX8)
3476
nir_opt_remove_phis(nir[i]); /* cleanup LCSSA phis */
3477
}
3478
if (((nir[i]->info.bit_sizes_int | nir[i]->info.bit_sizes_float) & 16) &&
3479
device->physical_device->rad_info.chip_class >= GFX9)
3480
NIR_PASS_V(nir[i], nir_opt_vectorize, opt_vectorize_callback, NULL);
3481
3482
/* cleanup passes */
3483
nir_lower_load_const_to_scalar(nir[i]);
3484
nir_move_options move_opts = nir_move_const_undef | nir_move_load_ubo |
3485
nir_move_load_input | nir_move_comparisons | nir_move_copies;
3486
nir_opt_sink(nir[i], move_opts | nir_move_load_ssbo);
3487
nir_opt_move(nir[i], move_opts);
3488
3489
radv_stop_feedback(stage_feedbacks[i], false);
3490
}
3491
}
3492
3493
for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
3494
if (radv_can_dump_shader(device, modules[i], false))
3495
nir_print_shader(nir[i], stderr);
3496
}
3497
3498
if (modules[MESA_SHADER_GEOMETRY]) {
3499
struct radv_shader_binary *gs_copy_binary = NULL;
3500
if (!pipeline_has_ngg) {
3501
struct radv_shader_info info = {0};
3502
struct radv_shader_variant_key key = {0};
3503
3504
key.has_multiview_view_index = keys[MESA_SHADER_GEOMETRY].has_multiview_view_index;
3505
3506
radv_nir_shader_info_pass(device, nir[MESA_SHADER_GEOMETRY], pipeline->layout, &key,
3507
&info);
3508
info.wave_size = 64; /* Wave32 not supported. */
3509
info.ballot_bit_size = 64;
3510
3511
pipeline->gs_copy_shader = radv_create_gs_copy_shader(
3512
device, nir[MESA_SHADER_GEOMETRY], &info, &gs_copy_binary, keep_executable_info,
3513
keep_statistic_info, keys[MESA_SHADER_GEOMETRY].has_multiview_view_index,
3514
disable_optimizations);
3515
}
3516
3517
if (!keep_executable_info && pipeline->gs_copy_shader) {
3518
struct radv_shader_binary *gs_binaries[MESA_SHADER_STAGES] = {NULL};
3519
struct radv_shader_variant *gs_variants[MESA_SHADER_STAGES] = {0};
3520
3521
gs_binaries[MESA_SHADER_GEOMETRY] = gs_copy_binary;
3522
gs_variants[MESA_SHADER_GEOMETRY] = pipeline->gs_copy_shader;
3523
3524
radv_pipeline_cache_insert_shaders(device, cache, gs_copy_hash, gs_variants, gs_binaries);
3525
3526
pipeline->gs_copy_shader = gs_variants[MESA_SHADER_GEOMETRY];
3527
}
3528
free(gs_copy_binary);
3529
}
3530
3531
if (nir[MESA_SHADER_FRAGMENT]) {
3532
if (!pipeline->shaders[MESA_SHADER_FRAGMENT]) {
3533
radv_start_feedback(stage_feedbacks[MESA_SHADER_FRAGMENT]);
3534
3535
pipeline->shaders[MESA_SHADER_FRAGMENT] = radv_shader_variant_compile(
3536
device, modules[MESA_SHADER_FRAGMENT], &nir[MESA_SHADER_FRAGMENT], 1, pipeline->layout,
3537
keys + MESA_SHADER_FRAGMENT, infos + MESA_SHADER_FRAGMENT, keep_executable_info,
3538
keep_statistic_info, disable_optimizations, &binaries[MESA_SHADER_FRAGMENT]);
3539
3540
radv_stop_feedback(stage_feedbacks[MESA_SHADER_FRAGMENT], false);
3541
}
3542
}
3543
3544
if (device->physical_device->rad_info.chip_class >= GFX9 && modules[MESA_SHADER_TESS_CTRL]) {
3545
if (!pipeline->shaders[MESA_SHADER_TESS_CTRL]) {
3546
struct nir_shader *combined_nir[] = {nir[MESA_SHADER_VERTEX], nir[MESA_SHADER_TESS_CTRL]};
3547
struct radv_shader_variant_key key = keys[MESA_SHADER_TESS_CTRL];
3548
key.tcs.vs_key = keys[MESA_SHADER_VERTEX].vs;
3549
3550
radv_start_feedback(stage_feedbacks[MESA_SHADER_TESS_CTRL]);
3551
3552
pipeline->shaders[MESA_SHADER_TESS_CTRL] = radv_shader_variant_compile(
3553
device, modules[MESA_SHADER_TESS_CTRL], combined_nir, 2, pipeline->layout, &key,
3554
&infos[MESA_SHADER_TESS_CTRL], keep_executable_info, keep_statistic_info,
3555
disable_optimizations, &binaries[MESA_SHADER_TESS_CTRL]);
3556
3557
radv_stop_feedback(stage_feedbacks[MESA_SHADER_TESS_CTRL], false);
3558
}
3559
modules[MESA_SHADER_VERTEX] = NULL;
3560
}
3561
3562
if (device->physical_device->rad_info.chip_class >= GFX9 && modules[MESA_SHADER_GEOMETRY]) {
3563
gl_shader_stage pre_stage =
3564
modules[MESA_SHADER_TESS_EVAL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
3565
if (!pipeline->shaders[MESA_SHADER_GEOMETRY]) {
3566
struct nir_shader *combined_nir[] = {nir[pre_stage], nir[MESA_SHADER_GEOMETRY]};
3567
3568
radv_start_feedback(stage_feedbacks[MESA_SHADER_GEOMETRY]);
3569
3570
pipeline->shaders[MESA_SHADER_GEOMETRY] = radv_shader_variant_compile(
3571
device, modules[MESA_SHADER_GEOMETRY], combined_nir, 2, pipeline->layout,
3572
&keys[pre_stage], &infos[MESA_SHADER_GEOMETRY], keep_executable_info,
3573
keep_statistic_info, disable_optimizations, &binaries[MESA_SHADER_GEOMETRY]);
3574
3575
radv_stop_feedback(stage_feedbacks[MESA_SHADER_GEOMETRY], false);
3576
}
3577
modules[pre_stage] = NULL;
3578
}
3579
3580
for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
3581
if (modules[i] && !pipeline->shaders[i]) {
3582
radv_start_feedback(stage_feedbacks[i]);
3583
3584
pipeline->shaders[i] = radv_shader_variant_compile(
3585
device, modules[i], &nir[i], 1, pipeline->layout, keys + i, infos + i,
3586
keep_executable_info, keep_statistic_info, disable_optimizations, &binaries[i]);
3587
3588
radv_stop_feedback(stage_feedbacks[i], false);
3589
}
3590
}
3591
3592
if (!keep_executable_info) {
3593
radv_pipeline_cache_insert_shaders(device, cache, hash, pipeline->shaders, binaries);
3594
}
3595
3596
for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
3597
free(binaries[i]);
3598
if (nir[i]) {
3599
ralloc_free(nir[i]);
3600
3601
if (radv_can_dump_shader_stats(device, modules[i])) {
3602
radv_dump_shader_stats(device, pipeline, i, stderr);
3603
}
3604
}
3605
}
3606
3607
if (fs_m.nir)
3608
ralloc_free(fs_m.nir);
3609
3610
radv_stop_feedback(pipeline_feedback, false);
3611
return VK_SUCCESS;
3612
}
3613
3614
static uint32_t
3615
radv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline, gl_shader_stage stage,
3616
enum chip_class chip_class)
3617
{
3618
bool has_gs = radv_pipeline_has_gs(pipeline);
3619
bool has_tess = radv_pipeline_has_tess(pipeline);
3620
bool has_ngg = radv_pipeline_has_ngg(pipeline);
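/* Example: with tessellation the API vertex shader runs as the hardware LS/HS stage,
 * so its user SGPRs are programmed at the LS/HS register offsets (which moved between
 * GFX8, GFX9 and GFX10); a plain VS with no tess/GS/NGG keeps the VS offsets. */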
3621
3622
switch (stage) {
3623
case MESA_SHADER_FRAGMENT:
3624
return R_00B030_SPI_SHADER_USER_DATA_PS_0;
3625
case MESA_SHADER_VERTEX:
3626
if (has_tess) {
3627
if (chip_class >= GFX10) {
3628
return R_00B430_SPI_SHADER_USER_DATA_HS_0;
3629
} else if (chip_class == GFX9) {
3630
return R_00B430_SPI_SHADER_USER_DATA_LS_0;
3631
} else {
3632
return R_00B530_SPI_SHADER_USER_DATA_LS_0;
3633
}
3634
}
3635
3636
if (has_gs) {
3637
if (chip_class >= GFX10) {
3638
return R_00B230_SPI_SHADER_USER_DATA_GS_0;
3639
} else {
3640
return R_00B330_SPI_SHADER_USER_DATA_ES_0;
3641
}
3642
}
3643
3644
if (has_ngg)
3645
return R_00B230_SPI_SHADER_USER_DATA_GS_0;
3646
3647
return R_00B130_SPI_SHADER_USER_DATA_VS_0;
3648
case MESA_SHADER_GEOMETRY:
3649
return chip_class == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0
3650
: R_00B230_SPI_SHADER_USER_DATA_GS_0;
3651
case MESA_SHADER_COMPUTE:
3652
return R_00B900_COMPUTE_USER_DATA_0;
3653
case MESA_SHADER_TESS_CTRL:
3654
return chip_class == GFX9 ? R_00B430_SPI_SHADER_USER_DATA_LS_0
3655
: R_00B430_SPI_SHADER_USER_DATA_HS_0;
3656
case MESA_SHADER_TESS_EVAL:
3657
if (has_gs) {
3658
return chip_class >= GFX10 ? R_00B230_SPI_SHADER_USER_DATA_GS_0
3659
: R_00B330_SPI_SHADER_USER_DATA_ES_0;
3660
} else if (has_ngg) {
3661
return R_00B230_SPI_SHADER_USER_DATA_GS_0;
3662
} else {
3663
return R_00B130_SPI_SHADER_USER_DATA_VS_0;
3664
}
3665
default:
3666
unreachable("unknown shader");
3667
}
3668
}
3669
3670
struct radv_bin_size_entry {
3671
unsigned bpp;
3672
VkExtent2D extent;
3673
};
3674
3675
static VkExtent2D
3676
radv_gfx9_compute_bin_size(const struct radv_pipeline *pipeline,
3677
const VkGraphicsPipelineCreateInfo *pCreateInfo)
3678
{
3679
static const struct radv_bin_size_entry color_size_table[][3][9] = {
3680
{
3681
/* One RB / SE */
3682
{
3683
/* One shader engine */
3684
{0, {128, 128}},
3685
{1, {64, 128}},
3686
{2, {32, 128}},
3687
{3, {16, 128}},
3688
{17, {0, 0}},
3689
{UINT_MAX, {0, 0}},
3690
},
3691
{
3692
/* Two shader engines */
3693
{0, {128, 128}},
3694
{2, {64, 128}},
3695
{3, {32, 128}},
3696
{5, {16, 128}},
3697
{17, {0, 0}},
3698
{UINT_MAX, {0, 0}},
3699
},
3700
{
3701
/* Four shader engines */
3702
{0, {128, 128}},
3703
{3, {64, 128}},
3704
{5, {16, 128}},
3705
{17, {0, 0}},
3706
{UINT_MAX, {0, 0}},
3707
},
3708
},
3709
{
3710
/* Two RB / SE */
3711
{
3712
/* One shader engine */
3713
{0, {128, 128}},
3714
{2, {64, 128}},
3715
{3, {32, 128}},
3716
{5, {16, 128}},
3717
{33, {0, 0}},
3718
{UINT_MAX, {0, 0}},
3719
},
3720
{
3721
/* Two shader engines */
3722
{0, {128, 128}},
3723
{3, {64, 128}},
3724
{5, {32, 128}},
3725
{9, {16, 128}},
3726
{33, {0, 0}},
3727
{UINT_MAX, {0, 0}},
3728
},
3729
{
3730
/* Four shader engines */
3731
{0, {256, 256}},
3732
{2, {128, 256}},
3733
{3, {128, 128}},
3734
{5, {64, 128}},
3735
{9, {16, 128}},
3736
{33, {0, 0}},
3737
{UINT_MAX, {0, 0}},
3738
},
3739
},
3740
{
3741
/* Four RB / SE */
3742
{
3743
/* One shader engine */
3744
{0, {128, 256}},
3745
{2, {128, 128}},
3746
{3, {64, 128}},
3747
{5, {32, 128}},
3748
{9, {16, 128}},
3749
{33, {0, 0}},
3750
{UINT_MAX, {0, 0}},
3751
},
3752
{
3753
/* Two shader engines */
3754
{0, {256, 256}},
3755
{2, {128, 256}},
3756
{3, {128, 128}},
3757
{5, {64, 128}},
3758
{9, {32, 128}},
3759
{17, {16, 128}},
3760
{33, {0, 0}},
3761
{UINT_MAX, {0, 0}},
3762
},
3763
{
3764
/* Four shader engines */
3765
{0, {256, 512}},
3766
{2, {256, 256}},
3767
{3, {128, 256}},
3768
{5, {128, 128}},
3769
{9, {64, 128}},
3770
{17, {16, 128}},
3771
{33, {0, 0}},
3772
{UINT_MAX, {0, 0}},
3773
},
3774
},
3775
};
3776
static const struct radv_bin_size_entry ds_size_table[][3][9] = {
3777
{
3778
// One RB / SE
3779
{
3780
// One shader engine
3781
{0, {128, 256}},
3782
{2, {128, 128}},
3783
{4, {64, 128}},
3784
{7, {32, 128}},
3785
{13, {16, 128}},
3786
{49, {0, 0}},
3787
{UINT_MAX, {0, 0}},
3788
},
3789
{
3790
// Two shader engines
3791
{0, {256, 256}},
3792
{2, {128, 256}},
3793
{4, {128, 128}},
3794
{7, {64, 128}},
3795
{13, {32, 128}},
3796
{25, {16, 128}},
3797
{49, {0, 0}},
3798
{UINT_MAX, {0, 0}},
3799
},
3800
{
3801
// Four shader engines
3802
{0, {256, 512}},
3803
{2, {256, 256}},
3804
{4, {128, 256}},
3805
{7, {128, 128}},
3806
{13, {64, 128}},
3807
{25, {16, 128}},
3808
{49, {0, 0}},
3809
{UINT_MAX, {0, 0}},
3810
},
3811
},
3812
{
3813
// Two RB / SE
3814
{
3815
// One shader engine
3816
{0, {256, 256}},
3817
{2, {128, 256}},
3818
{4, {128, 128}},
3819
{7, {64, 128}},
3820
{13, {32, 128}},
3821
{25, {16, 128}},
3822
{97, {0, 0}},
3823
{UINT_MAX, {0, 0}},
3824
},
3825
{
3826
// Two shader engines
3827
{0, {256, 512}},
3828
{2, {256, 256}},
3829
{4, {128, 256}},
3830
{7, {128, 128}},
3831
{13, {64, 128}},
3832
{25, {32, 128}},
3833
{49, {16, 128}},
3834
{97, {0, 0}},
3835
{UINT_MAX, {0, 0}},
3836
},
3837
{
3838
// Four shader engines
3839
{0, {512, 512}},
3840
{2, {256, 512}},
3841
{4, {256, 256}},
3842
{7, {128, 256}},
3843
{13, {128, 128}},
3844
{25, {64, 128}},
3845
{49, {16, 128}},
3846
{97, {0, 0}},
3847
{UINT_MAX, {0, 0}},
3848
},
3849
},
3850
{
3851
// Four RB / SE
3852
{
3853
// One shader engine
3854
{0, {256, 512}},
3855
{2, {256, 256}},
3856
{4, {128, 256}},
3857
{7, {128, 128}},
3858
{13, {64, 128}},
3859
{25, {32, 128}},
3860
{49, {16, 128}},
3861
{UINT_MAX, {0, 0}},
3862
},
3863
{
3864
// Two shader engines
3865
{0, {512, 512}},
3866
{2, {256, 512}},
3867
{4, {256, 256}},
3868
{7, {128, 256}},
3869
{13, {128, 128}},
3870
{25, {64, 128}},
3871
{49, {32, 128}},
3872
{97, {16, 128}},
3873
{UINT_MAX, {0, 0}},
3874
},
3875
{
3876
// Four shader engines
3877
{0, {512, 512}},
3878
{4, {256, 512}},
3879
{7, {256, 256}},
3880
{13, {128, 256}},
3881
{25, {128, 128}},
3882
{49, {64, 128}},
3883
{97, {16, 128}},
3884
{UINT_MAX, {0, 0}},
3885
},
3886
},
3887
};
3888
3889
RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
3890
struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
3891
VkExtent2D extent = {512, 512};
3892
3893
unsigned log_num_rb_per_se =
3894
util_logbase2_ceil(pipeline->device->physical_device->rad_info.max_render_backends /
3895
pipeline->device->physical_device->rad_info.max_se);
3896
unsigned log_num_se = util_logbase2_ceil(pipeline->device->physical_device->rad_info.max_se);
3897
3898
unsigned total_samples = 1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->graphics.ms.pa_sc_aa_config);
3899
unsigned ps_iter_samples = 1u << G_028804_PS_ITER_SAMPLES(pipeline->graphics.ms.db_eqaa);
3900
unsigned effective_samples = total_samples;
3901
unsigned color_bytes_per_pixel = 0;
3902
3903
const VkPipelineColorBlendStateCreateInfo *vkblend =
3904
radv_pipeline_get_color_blend_state(pCreateInfo);
3905
if (vkblend) {
3906
for (unsigned i = 0; i < subpass->color_count; i++) {
3907
if (!vkblend->pAttachments[i].colorWriteMask)
3908
continue;
3909
3910
if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED)
3911
continue;
3912
3913
VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format;
3914
color_bytes_per_pixel += vk_format_get_blocksize(format);
3915
}
3916
3917
/* MSAA images typically don't use all samples all the time. */
3918
if (effective_samples >= 2 && ps_iter_samples <= 1)
3919
effective_samples = 2;
3920
color_bytes_per_pixel *= effective_samples;
3921
}
3922
3923
const struct radv_bin_size_entry *color_entry = color_size_table[log_num_rb_per_se][log_num_se];
3924
while (color_entry[1].bpp <= color_bytes_per_pixel)
3925
++color_entry;
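/* Each table entry gives the bin size to use once the bytes-per-pixel value reaches
 * its bpp threshold; the loop selects the last entry whose threshold was reached. */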
3926
3927
extent = color_entry->extent;
3928
3929
if (subpass->depth_stencil_attachment) {
3930
struct radv_render_pass_attachment *attachment =
3931
pass->attachments + subpass->depth_stencil_attachment->attachment;
3932
3933
/* Coefficients taken from AMDVLK */
3934
unsigned depth_coeff = vk_format_has_depth(attachment->format) ? 5 : 0;
3935
unsigned stencil_coeff = vk_format_has_stencil(attachment->format) ? 1 : 0;
3936
unsigned ds_bytes_per_pixel = 4 * (depth_coeff + stencil_coeff) * total_samples;
3937
3938
const struct radv_bin_size_entry *ds_entry = ds_size_table[log_num_rb_per_se][log_num_se];
3939
while (ds_entry[1].bpp <= ds_bytes_per_pixel)
3940
++ds_entry;
3941
3942
if (ds_entry->extent.width * ds_entry->extent.height < extent.width * extent.height)
3943
extent = ds_entry->extent;
3944
}
3945
3946
return extent;
3947
}
3948
3949
static VkExtent2D
3950
radv_gfx10_compute_bin_size(const struct radv_pipeline *pipeline,
3951
const VkGraphicsPipelineCreateInfo *pCreateInfo)
3952
{
3953
RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
3954
struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
3955
VkExtent2D extent = {512, 512};
3956
3957
const unsigned db_tag_size = 64;
3958
const unsigned db_tag_count = 312;
3959
const unsigned color_tag_size = 1024;
3960
const unsigned color_tag_count = 31;
3961
const unsigned fmask_tag_size = 256;
3962
const unsigned fmask_tag_count = 44;
3963
3964
const unsigned rb_count = pipeline->device->physical_device->rad_info.max_render_backends;
3965
const unsigned pipe_count =
3966
MAX2(rb_count, pipeline->device->physical_device->rad_info.num_tcc_blocks);
3967
3968
const unsigned db_tag_part = (db_tag_count * rb_count / pipe_count) * db_tag_size * pipe_count;
3969
const unsigned color_tag_part =
3970
(color_tag_count * rb_count / pipe_count) * color_tag_size * pipe_count;
3971
const unsigned fmask_tag_part =
3972
(fmask_tag_count * rb_count / pipe_count) * fmask_tag_size * pipe_count;
3973
3974
const unsigned total_samples =
3975
1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->graphics.ms.pa_sc_aa_config);
3976
const unsigned samples_log = util_logbase2_ceil(total_samples);
3977
3978
unsigned color_bytes_per_pixel = 0;
3979
unsigned fmask_bytes_per_pixel = 0;
3980
3981
const VkPipelineColorBlendStateCreateInfo *vkblend =
3982
radv_pipeline_get_color_blend_state(pCreateInfo);
3983
if (vkblend) {
3984
for (unsigned i = 0; i < subpass->color_count; i++) {
3985
if (!vkblend->pAttachments[i].colorWriteMask)
3986
continue;
3987
3988
if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED)
3989
continue;
3990
3991
VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format;
3992
color_bytes_per_pixel += vk_format_get_blocksize(format);
3993
3994
if (total_samples > 1) {
3995
assert(samples_log <= 3);
3996
const unsigned fmask_array[] = {0, 1, 1, 4};
3997
fmask_bytes_per_pixel += fmask_array[samples_log];
3998
}
3999
}
4000
4001
color_bytes_per_pixel *= total_samples;
4002
}
4003
color_bytes_per_pixel = MAX2(color_bytes_per_pixel, 1);
4004
4005
const unsigned color_pixel_count_log = util_logbase2(color_tag_part / color_bytes_per_pixel);
4006
extent.width = 1ull << ((color_pixel_count_log + 1) / 2);
4007
extent.height = 1ull << (color_pixel_count_log / 2);
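/* e.g. if color_tag_part / color_bytes_per_pixel is about 2^16 pixels,
 * color_pixel_count_log = 16 and the starting bin is 256x256 before the
 * fmask/depth limits below may shrink it. */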
4008
4009
if (fmask_bytes_per_pixel) {
4010
const unsigned fmask_pixel_count_log = util_logbase2(fmask_tag_part / fmask_bytes_per_pixel);
4011
4012
const VkExtent2D fmask_extent =
4013
(VkExtent2D){.width = 1ull << ((fmask_pixel_count_log + 1) / 2),
4014
.height = 1ull << (color_pixel_count_log / 2)};
4015
4016
if (fmask_extent.width * fmask_extent.height < extent.width * extent.height)
4017
extent = fmask_extent;
4018
}
4019
4020
if (subpass->depth_stencil_attachment) {
4021
struct radv_render_pass_attachment *attachment =
4022
pass->attachments + subpass->depth_stencil_attachment->attachment;
4023
4024
/* Coefficients taken from AMDVLK */
4025
unsigned depth_coeff = vk_format_has_depth(attachment->format) ? 5 : 0;
4026
unsigned stencil_coeff = vk_format_has_stencil(attachment->format) ? 1 : 0;
4027
unsigned db_bytes_per_pixel = (depth_coeff + stencil_coeff) * total_samples;
4028
4029
const unsigned db_pixel_count_log = util_logbase2(db_tag_part / db_bytes_per_pixel);
4030
4031
const VkExtent2D db_extent = (VkExtent2D){.width = 1ull << ((db_pixel_count_log + 1) / 2),
4032
.height = 1ull << (color_pixel_count_log / 2)};
4033
4034
if (db_extent.width * db_extent.height < extent.width * extent.height)
4035
extent = db_extent;
4036
}
4037
4038
extent.width = MAX2(extent.width, 128);
4039
extent.height = MAX2(extent.height, 64);
4040
4041
return extent;
4042
}
4043
4044
static void
4045
radv_pipeline_init_disabled_binning_state(struct radv_pipeline *pipeline,
4046
const VkGraphicsPipelineCreateInfo *pCreateInfo)
4047
{
4048
uint32_t pa_sc_binner_cntl_0 = S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
4049
S_028C44_DISABLE_START_OF_PRIM(1);
4050
4051
if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
4052
RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
4053
struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
4054
const VkPipelineColorBlendStateCreateInfo *vkblend =
4055
radv_pipeline_get_color_blend_state(pCreateInfo);
4056
unsigned min_bytes_per_pixel = 0;
4057
4058
if (vkblend) {
4059
for (unsigned i = 0; i < subpass->color_count; i++) {
4060
if (!vkblend->pAttachments[i].colorWriteMask)
4061
continue;
4062
4063
if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED)
4064
continue;
4065
4066
VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format;
4067
unsigned bytes = vk_format_get_blocksize(format);
4068
if (!min_bytes_per_pixel || bytes < min_bytes_per_pixel)
4069
min_bytes_per_pixel = bytes;
4070
}
4071
}
4072
4073
pa_sc_binner_cntl_0 =
4074
S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC) | S_028C44_BIN_SIZE_X(0) |
4075
S_028C44_BIN_SIZE_Y(0) | S_028C44_BIN_SIZE_X_EXTEND(2) | /* 128 */
4076
S_028C44_BIN_SIZE_Y_EXTEND(min_bytes_per_pixel <= 4 ? 2 : 1) | /* 128 or 64 */
4077
S_028C44_DISABLE_START_OF_PRIM(1);
4078
}
4079
4080
pipeline->graphics.binning.pa_sc_binner_cntl_0 = pa_sc_binner_cntl_0;
4081
}
4082
4083
struct radv_binning_settings
4084
radv_get_binning_settings(const struct radv_physical_device *pdev)
4085
{
4086
struct radv_binning_settings settings;
4087
if (pdev->rad_info.has_dedicated_vram) {
4088
if (pdev->rad_info.max_render_backends > 4) {
4089
settings.context_states_per_bin = 1;
4090
settings.persistent_states_per_bin = 1;
4091
} else {
4092
settings.context_states_per_bin = 3;
4093
settings.persistent_states_per_bin = 8;
4094
}
4095
settings.fpovs_per_batch = 63;
4096
} else {
4097
/* The context states are affected by the scissor bug. */
4098
settings.context_states_per_bin = 6;
4099
/* 32 causes hangs for RAVEN. */
4100
settings.persistent_states_per_bin = 16;
4101
settings.fpovs_per_batch = 63;
4102
}
4103
4104
if (pdev->rad_info.has_gfx9_scissor_bug)
4105
settings.context_states_per_bin = 1;
4106
4107
return settings;
4108
}
4109
4110
static void
4111
radv_pipeline_init_binning_state(struct radv_pipeline *pipeline,
4112
const VkGraphicsPipelineCreateInfo *pCreateInfo,
4113
const struct radv_blend_state *blend)
4114
{
4115
if (pipeline->device->physical_device->rad_info.chip_class < GFX9)
4116
return;
4117
4118
VkExtent2D bin_size;
4119
if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
4120
bin_size = radv_gfx10_compute_bin_size(pipeline, pCreateInfo);
4121
} else if (pipeline->device->physical_device->rad_info.chip_class == GFX9) {
4122
bin_size = radv_gfx9_compute_bin_size(pipeline, pCreateInfo);
4123
} else
4124
unreachable("Unhandled generation for binning bin size calculation");
4125
4126
if (pipeline->device->pbb_allowed && bin_size.width && bin_size.height) {
4127
struct radv_binning_settings settings =
4128
radv_get_binning_settings(pipeline->device->physical_device);
4129
4130
const uint32_t pa_sc_binner_cntl_0 =
4131
S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
4132
S_028C44_BIN_SIZE_X(bin_size.width == 16) | S_028C44_BIN_SIZE_Y(bin_size.height == 16) |
4133
S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(MAX2(bin_size.width, 32)) - 5) |
4134
S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(MAX2(bin_size.height, 32)) - 5) |
4135
S_028C44_CONTEXT_STATES_PER_BIN(settings.context_states_per_bin - 1) |
4136
S_028C44_PERSISTENT_STATES_PER_BIN(settings.persistent_states_per_bin - 1) |
4137
S_028C44_DISABLE_START_OF_PRIM(1) |
4138
S_028C44_FPOVS_PER_BATCH(settings.fpovs_per_batch) | S_028C44_OPTIMAL_BIN_SELECTION(1);
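/* BIN_SIZE_X/Y == 1 encodes a 16-pixel bin directly; otherwise the size comes from
 * the _EXTEND fields as log2(size) - 5, e.g. a 64-pixel-wide bin gives
 * BIN_SIZE_X_EXTEND = 1 and a 128-pixel bin gives 2. */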
4139
4140
pipeline->graphics.binning.pa_sc_binner_cntl_0 = pa_sc_binner_cntl_0;
4141
} else
4142
radv_pipeline_init_disabled_binning_state(pipeline, pCreateInfo);
4143
}
4144
4145
static void
4146
radv_pipeline_generate_depth_stencil_state(struct radeon_cmdbuf *ctx_cs,
4147
const struct radv_pipeline *pipeline,
4148
const VkGraphicsPipelineCreateInfo *pCreateInfo,
4149
const struct radv_graphics_pipeline_create_info *extra)
4150
{
4151
const VkPipelineDepthStencilStateCreateInfo *vkds =
4152
radv_pipeline_get_depth_stencil_state(pCreateInfo);
4153
RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
4154
struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
4155
struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
4156
struct radv_render_pass_attachment *attachment = NULL;
4157
uint32_t db_render_control = 0, db_render_override2 = 0;
4158
uint32_t db_render_override = 0;
4159
4160
if (subpass->depth_stencil_attachment)
4161
attachment = pass->attachments + subpass->depth_stencil_attachment->attachment;
4162
4163
bool has_depth_attachment = attachment && vk_format_has_depth(attachment->format);
4164
4165
if (vkds && has_depth_attachment) {
4166
/* from amdvlk: For 4xAA and 8xAA need to decompress on flush for better performance */
4167
db_render_override2 |= S_028010_DECOMPRESS_Z_ON_FLUSH(attachment->samples > 2);
4168
4169
if (pipeline->device->physical_device->rad_info.chip_class >= GFX10_3)
4170
db_render_override2 |= S_028010_CENTROID_COMPUTATION_MODE(1);
4171
}
4172
4173
if (attachment && extra) {
4174
db_render_control |= S_028000_DEPTH_CLEAR_ENABLE(extra->db_depth_clear);
4175
db_render_control |= S_028000_STENCIL_CLEAR_ENABLE(extra->db_stencil_clear);
4176
4177
db_render_control |= S_028000_RESUMMARIZE_ENABLE(extra->resummarize_enable);
4178
db_render_control |= S_028000_DEPTH_COMPRESS_DISABLE(extra->depth_compress_disable);
4179
db_render_control |= S_028000_STENCIL_COMPRESS_DISABLE(extra->stencil_compress_disable);
4180
db_render_override2 |=
4181
S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(extra->db_depth_disable_expclear);
4182
db_render_override2 |=
4183
S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(extra->db_stencil_disable_expclear);
4184
}
4185
4186
db_render_override |= S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
4187
S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE);
4188
4189
if (!pCreateInfo->pRasterizationState->depthClampEnable && ps->info.ps.writes_z) {
4190
/* From VK_EXT_depth_range_unrestricted spec:
4191
*
4192
* "The behavior described in Primitive Clipping still applies.
4193
* If depth clamping is disabled the depth values are still
4194
* clipped to 0 ≤ zc ≤ wc before the viewport transform. If
4195
* depth clamping is enabled the above equation is ignored and
4196
* the depth values are instead clamped to the VkViewport
4197
* minDepth and maxDepth values, which in the case of this
4198
* extension can be outside of the 0.0 to 1.0 range."
4199
*/
4200
db_render_override |= S_02800C_DISABLE_VIEWPORT_CLAMP(1);
4201
}
4202
4203
radeon_set_context_reg(ctx_cs, R_028000_DB_RENDER_CONTROL, db_render_control);
4204
radeon_set_context_reg(ctx_cs, R_02800C_DB_RENDER_OVERRIDE, db_render_override);
4205
radeon_set_context_reg(ctx_cs, R_028010_DB_RENDER_OVERRIDE2, db_render_override2);
4206
}
4207
4208
static void
4209
radv_pipeline_generate_blend_state(struct radeon_cmdbuf *ctx_cs,
4210
const struct radv_pipeline *pipeline,
4211
const struct radv_blend_state *blend)
4212
{
4213
radeon_set_context_reg_seq(ctx_cs, R_028780_CB_BLEND0_CONTROL, 8);
4214
radeon_emit_array(ctx_cs, blend->cb_blend_control, 8);
4215
radeon_set_context_reg(ctx_cs, R_028B70_DB_ALPHA_TO_MASK, blend->db_alpha_to_mask);
4216
4217
if (pipeline->device->physical_device->rad_info.has_rbplus) {
4218
4219
radeon_set_context_reg_seq(ctx_cs, R_028760_SX_MRT0_BLEND_OPT, 8);
4220
radeon_emit_array(ctx_cs, blend->sx_mrt_blend_opt, 8);
4221
}
4222
4223
radeon_set_context_reg(ctx_cs, R_028714_SPI_SHADER_COL_FORMAT, blend->spi_shader_col_format);
4224
4225
radeon_set_context_reg(ctx_cs, R_02823C_CB_SHADER_MASK, blend->cb_shader_mask);
4226
}
4227
4228
static void
4229
radv_pipeline_generate_raster_state(struct radeon_cmdbuf *ctx_cs,
4230
const struct radv_pipeline *pipeline,
4231
const VkGraphicsPipelineCreateInfo *pCreateInfo)
4232
{
4233
const VkPipelineRasterizationStateCreateInfo *vkraster = pCreateInfo->pRasterizationState;
4234
const VkConservativeRasterizationModeEXT mode = radv_get_conservative_raster_mode(vkraster);
4235
uint32_t pa_sc_conservative_rast = S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1);
4236
4237
radeon_set_context_reg(ctx_cs, R_028BDC_PA_SC_LINE_CNTL, S_028BDC_DX10_DIAMOND_TEST_ENA(1));
4238
4239
if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) {
4240
/* Conservative rasterization. */
4241
if (mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
4242
pa_sc_conservative_rast = S_028C4C_PREZ_AA_MASK_ENABLE(1) | S_028C4C_POSTZ_AA_MASK_ENABLE(1) |
4243
S_028C4C_CENTROID_SAMPLE_OVERRIDE(1);
4244
4245
if (mode == VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT) {
4246
pa_sc_conservative_rast |=
4247
S_028C4C_OVER_RAST_ENABLE(1) | S_028C4C_OVER_RAST_SAMPLE_SELECT(0) |
4248
S_028C4C_UNDER_RAST_ENABLE(0) | S_028C4C_UNDER_RAST_SAMPLE_SELECT(1) |
4249
S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(1);
4250
} else {
4251
assert(mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT);
4252
pa_sc_conservative_rast |=
4253
S_028C4C_OVER_RAST_ENABLE(0) | S_028C4C_OVER_RAST_SAMPLE_SELECT(1) |
4254
S_028C4C_UNDER_RAST_ENABLE(1) | S_028C4C_UNDER_RAST_SAMPLE_SELECT(0) |
4255
S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(0);
4256
}
4257
}
4258
4259
radeon_set_context_reg(ctx_cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
4260
pa_sc_conservative_rast);
4261
}
4262
}
4263
4264
static void
4265
radv_pipeline_generate_multisample_state(struct radeon_cmdbuf *ctx_cs,
4266
const struct radv_pipeline *pipeline)
4267
{
4268
const struct radv_multisample_state *ms = &pipeline->graphics.ms;
4269
4270
radeon_set_context_reg_seq(ctx_cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
4271
radeon_emit(ctx_cs, ms->pa_sc_aa_mask[0]);
4272
radeon_emit(ctx_cs, ms->pa_sc_aa_mask[1]);
4273
4274
radeon_set_context_reg(ctx_cs, R_028804_DB_EQAA, ms->db_eqaa);
4275
radeon_set_context_reg(ctx_cs, R_028A48_PA_SC_MODE_CNTL_0, ms->pa_sc_mode_cntl_0);
4276
radeon_set_context_reg(ctx_cs, R_028A4C_PA_SC_MODE_CNTL_1, ms->pa_sc_mode_cntl_1);
4277
radeon_set_context_reg(ctx_cs, R_028BE0_PA_SC_AA_CONFIG, ms->pa_sc_aa_config);
4278
4279
/* The exclusion bits can be set to improve rasterization efficiency
4280
* if no sample lies on the pixel boundary (-8 sample offset). It's
4281
* currently always TRUE because the driver doesn't support 16 samples.
4282
*/
4283
bool exclusion = pipeline->device->physical_device->rad_info.chip_class >= GFX7;
4284
radeon_set_context_reg(
4285
ctx_cs, R_02882C_PA_SU_PRIM_FILTER_CNTL,
4286
S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion));
4287
}
4288
4289
static void
4290
radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs,
4291
const struct radv_pipeline *pipeline)
4292
{
4293
const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
4294
const struct radv_shader_variant *vs = pipeline->shaders[MESA_SHADER_TESS_EVAL]
4295
? pipeline->shaders[MESA_SHADER_TESS_EVAL]
4296
: pipeline->shaders[MESA_SHADER_VERTEX];
4297
unsigned vgt_primitiveid_en = 0;
4298
uint32_t vgt_gs_mode = 0;
4299
4300
if (radv_pipeline_has_ngg(pipeline))
4301
return;
4302
4303
if (radv_pipeline_has_gs(pipeline)) {
4304
const struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
4305
4306
vgt_gs_mode = ac_vgt_gs_mode(gs->info.gs.vertices_out,
4307
pipeline->device->physical_device->rad_info.chip_class);
4308
} else if (outinfo->export_prim_id || vs->info.uses_prim_id) {
4309
vgt_gs_mode = S_028A40_MODE(V_028A40_GS_SCENARIO_A);
4310
vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(1);
4311
}
4312
4313
radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN, vgt_primitiveid_en);
4314
radeon_set_context_reg(ctx_cs, R_028A40_VGT_GS_MODE, vgt_gs_mode);
4315
}
4316
4317
static void
4318
radv_pipeline_generate_hw_vs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
4319
const struct radv_pipeline *pipeline,
4320
const struct radv_shader_variant *shader)
4321
{
4322
uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
4323
4324
radeon_set_sh_reg_seq(cs, R_00B120_SPI_SHADER_PGM_LO_VS, 4);
4325
radeon_emit(cs, va >> 8);
4326
radeon_emit(cs, S_00B124_MEM_BASE(va >> 40));
4327
radeon_emit(cs, shader->config.rsrc1);
4328
radeon_emit(cs, shader->config.rsrc2);
4329
4330
const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
4331
unsigned clip_dist_mask, cull_dist_mask, total_mask;
4332
clip_dist_mask = outinfo->clip_dist_mask;
4333
cull_dist_mask = outinfo->cull_dist_mask;
4334
total_mask = clip_dist_mask | cull_dist_mask;
4335
4336
bool writes_primitive_shading_rate =
4337
outinfo->writes_primitive_shading_rate || pipeline->device->force_vrs != RADV_FORCE_VRS_NONE;
4338
bool misc_vec_ena = outinfo->writes_pointsize || outinfo->writes_layer ||
4339
outinfo->writes_viewport_index || writes_primitive_shading_rate;
4340
unsigned spi_vs_out_config, nparams;
4341
4342
/* VS is required to export at least one param. */
4343
nparams = MAX2(outinfo->param_exports, 1);
4344
spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1);
4345
4346
if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
4347
spi_vs_out_config |= S_0286C4_NO_PC_EXPORT(outinfo->param_exports == 0);
4348
}
4349
4350
radeon_set_context_reg(ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG, spi_vs_out_config);
4351
4352
radeon_set_context_reg(
4353
ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
4354
S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
4355
S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
4356
: V_02870C_SPI_SHADER_NONE) |
4357
S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
4358
: V_02870C_SPI_SHADER_NONE) |
4359
S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
4360
: V_02870C_SPI_SHADER_NONE));
4361
4362
radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
4363
S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
4364
S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
4365
S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
4366
S_02881C_USE_VTX_VRS_RATE(writes_primitive_shading_rate) |
4367
S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
4368
S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
4369
S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
4370
S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) |
4371
cull_dist_mask << 8 | clip_dist_mask);
4372
4373
if (pipeline->device->physical_device->rad_info.chip_class <= GFX8)
4374
radeon_set_context_reg(ctx_cs, R_028AB4_VGT_REUSE_OFF, outinfo->writes_viewport_index);
4375
}
4376
4377
static void
4378
radv_pipeline_generate_hw_es(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline,
4379
const struct radv_shader_variant *shader)
4380
{
4381
uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
4382
4383
radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 4);
4384
radeon_emit(cs, va >> 8);
4385
radeon_emit(cs, S_00B324_MEM_BASE(va >> 40));
4386
radeon_emit(cs, shader->config.rsrc1);
4387
radeon_emit(cs, shader->config.rsrc2);
4388
}
4389
4390
static void
4391
radv_pipeline_generate_hw_ls(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline,
4392
const struct radv_shader_variant *shader)
4393
{
4394
unsigned num_lds_blocks = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_lds_blocks;
4395
uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
4396
uint32_t rsrc2 = shader->config.rsrc2;
4397
4398
radeon_set_sh_reg_seq(cs, R_00B520_SPI_SHADER_PGM_LO_LS, 2);
4399
radeon_emit(cs, va >> 8);
4400
radeon_emit(cs, S_00B524_MEM_BASE(va >> 40));
4401
4402
rsrc2 |= S_00B52C_LDS_SIZE(num_lds_blocks);
4403
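/* On GFX7 (except Hawaii), RSRC2_LS is also written on its own before the
 * RSRC1/RSRC2 pair below; presumably this works around a hardware quirk that
 * requires the register to be written twice (assumption about the rationale).
 */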
if (pipeline->device->physical_device->rad_info.chip_class == GFX7 &&
4404
pipeline->device->physical_device->rad_info.family != CHIP_HAWAII)
4405
radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, rsrc2);
4406
4407
radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
4408
radeon_emit(cs, shader->config.rsrc1);
4409
radeon_emit(cs, rsrc2);
4410
}
4411
4412
static void
4413
radv_pipeline_generate_hw_ngg(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
4414
const struct radv_pipeline *pipeline,
4415
const struct radv_shader_variant *shader)
4416
{
4417
uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
4418
gl_shader_stage es_type =
4419
radv_pipeline_has_tess(pipeline) ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
4420
struct radv_shader_variant *es = es_type == MESA_SHADER_TESS_EVAL
4421
? pipeline->shaders[MESA_SHADER_TESS_EVAL]
4422
: pipeline->shaders[MESA_SHADER_VERTEX];
4423
const struct gfx10_ngg_info *ngg_state = &shader->info.ngg_info;
4424
4425
radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 2);
4426
radeon_emit(cs, va >> 8);
4427
radeon_emit(cs, S_00B324_MEM_BASE(va >> 40));
4428
radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
4429
radeon_emit(cs, shader->config.rsrc1);
4430
radeon_emit(cs, shader->config.rsrc2);
4431
4432
const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
4433
unsigned clip_dist_mask, cull_dist_mask, total_mask;
4434
clip_dist_mask = outinfo->clip_dist_mask;
4435
cull_dist_mask = outinfo->cull_dist_mask;
4436
total_mask = clip_dist_mask | cull_dist_mask;
4437
4438
bool writes_primitive_shading_rate =
4439
outinfo->writes_primitive_shading_rate || pipeline->device->force_vrs != RADV_FORCE_VRS_NONE;
4440
bool misc_vec_ena = outinfo->writes_pointsize || outinfo->writes_layer ||
4441
outinfo->writes_viewport_index || writes_primitive_shading_rate;
4442
bool es_enable_prim_id = outinfo->export_prim_id || (es && es->info.uses_prim_id);
4443
bool break_wave_at_eoi = false;
4444
unsigned ge_cntl;
4445
unsigned nparams;
4446
4447
if (es_type == MESA_SHADER_TESS_EVAL) {
4448
struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
4449
4450
if (es_enable_prim_id || (gs && gs->info.uses_prim_id))
4451
break_wave_at_eoi = true;
4452
}
4453
4454
nparams = MAX2(outinfo->param_exports, 1);
4455
radeon_set_context_reg(
4456
ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG,
4457
S_0286C4_VS_EXPORT_COUNT(nparams - 1) | S_0286C4_NO_PC_EXPORT(outinfo->param_exports == 0));
4458
4459
radeon_set_context_reg(ctx_cs, R_028708_SPI_SHADER_IDX_FORMAT,
4460
S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP));
4461
radeon_set_context_reg(
4462
ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
4463
S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
4464
S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
4465
: V_02870C_SPI_SHADER_NONE) |
4466
S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
4467
: V_02870C_SPI_SHADER_NONE) |
4468
S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
4469
: V_02870C_SPI_SHADER_NONE));
4470
4471
radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
4472
S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
4473
S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
4474
S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
4475
S_02881C_USE_VTX_VRS_RATE(writes_primitive_shading_rate) |
4476
S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
4477
S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
4478
S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
4479
S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) |
4480
cull_dist_mask << 8 | clip_dist_mask);
4481
4482
radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN,
4483
S_028A84_PRIMITIVEID_EN(es_enable_prim_id) |
4484
S_028A84_NGG_DISABLE_PROVOK_REUSE(outinfo->export_prim_id));
4485
4486
radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
4487
ngg_state->vgt_esgs_ring_itemsize);
4488
4489
/* NGG specific registers. */
4490
struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
4491
uint32_t gs_num_invocations = gs ? gs->info.gs.invocations : 1;
4492
4493
radeon_set_context_reg(
4494
ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
4495
S_028A44_ES_VERTS_PER_SUBGRP(ngg_state->hw_max_esverts) |
4496
S_028A44_GS_PRIMS_PER_SUBGRP(ngg_state->max_gsprims) |
4497
S_028A44_GS_INST_PRIMS_IN_SUBGRP(ngg_state->max_gsprims * gs_num_invocations));
4498
radeon_set_context_reg(ctx_cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
4499
S_0287FC_MAX_VERTS_PER_SUBGROUP(ngg_state->max_out_verts));
4500
radeon_set_context_reg(ctx_cs, R_028B4C_GE_NGG_SUBGRP_CNTL,
4501
S_028B4C_PRIM_AMP_FACTOR(ngg_state->prim_amp_factor) |
4502
S_028B4C_THDS_PER_SUBGRP(0)); /* for fast launch */
4503
radeon_set_context_reg(
4504
ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
4505
S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) |
4506
S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(ngg_state->max_vert_out_per_gs_instance));
4507
4508
/* User edge flags are set by the pos exports. If user edge flags are
4509
* not used, we must use hw-generated edge flags and pass them via
4510
* the prim export to prevent drawing lines on internal edges of
4511
* decomposed primitives (such as quads) with polygon mode = lines.
4512
*
4513
* TODO: We should combine hw-generated edge flags with user edge
4514
* flags in the shader.
4515
*/
4516
radeon_set_context_reg(
4517
ctx_cs, R_028838_PA_CL_NGG_CNTL,
4518
S_028838_INDEX_BUF_EDGE_FLAG_ENA(!radv_pipeline_has_tess(pipeline) &&
4519
!radv_pipeline_has_gs(pipeline)) |
4520
/* Reuse for NGG. */
4521
S_028838_VERTEX_REUSE_DEPTH(
4522
pipeline->device->physical_device->rad_info.chip_class >= GFX10_3 ? 30 : 0));
4523
4524
ge_cntl = S_03096C_PRIM_GRP_SIZE(ngg_state->max_gsprims) |
4525
S_03096C_VERT_GRP_SIZE(ngg_state->enable_vertex_grouping ? ngg_state->hw_max_esverts : 256) | /* 256 = disable vertex grouping */
4526
S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
4527
4528
/* Bug workaround for a possible hang with non-tessellation cases.
4529
* Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
4530
*
4531
* Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
4532
*/
4533
if (pipeline->device->physical_device->rad_info.chip_class == GFX10 &&
4534
!radv_pipeline_has_tess(pipeline) && ngg_state->hw_max_esverts != 256) {
4535
ge_cntl &= C_03096C_VERT_GRP_SIZE;
4536
4537
if (ngg_state->hw_max_esverts > 5) {
4538
ge_cntl |= S_03096C_VERT_GRP_SIZE(ngg_state->hw_max_esverts - 5);
4539
}
4540
}
4541
4542
radeon_set_uconfig_reg(ctx_cs, R_03096C_GE_CNTL, ge_cntl);
4543
}
4544
4545
static void
4546
radv_pipeline_generate_hw_hs(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline,
4547
const struct radv_shader_variant *shader)
4548
{
4549
uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
4550
4551
if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) {
4552
if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
4553
radeon_set_sh_reg_seq(cs, R_00B520_SPI_SHADER_PGM_LO_LS, 2);
4554
radeon_emit(cs, va >> 8);
4555
radeon_emit(cs, S_00B524_MEM_BASE(va >> 40));
4556
} else {
4557
radeon_set_sh_reg_seq(cs, R_00B410_SPI_SHADER_PGM_LO_LS, 2);
4558
radeon_emit(cs, va >> 8);
4559
radeon_emit(cs, S_00B414_MEM_BASE(va >> 40));
4560
}
4561
4562
radeon_set_sh_reg_seq(cs, R_00B428_SPI_SHADER_PGM_RSRC1_HS, 2);
4563
radeon_emit(cs, shader->config.rsrc1);
4564
radeon_emit(cs, shader->config.rsrc2);
4565
} else {
4566
radeon_set_sh_reg_seq(cs, R_00B420_SPI_SHADER_PGM_LO_HS, 4);
4567
radeon_emit(cs, va >> 8);
4568
radeon_emit(cs, S_00B424_MEM_BASE(va >> 40));
4569
radeon_emit(cs, shader->config.rsrc1);
4570
radeon_emit(cs, shader->config.rsrc2);
4571
}
4572
}
4573
4574
static void
4575
radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
4576
const struct radv_pipeline *pipeline)
4577
{
4578
struct radv_shader_variant *vs;
4579
4580
/* Skip shaders merged into HS/GS */
4581
vs = pipeline->shaders[MESA_SHADER_VERTEX];
4582
if (!vs)
4583
return;
4584
4585
if (vs->info.vs.as_ls)
4586
radv_pipeline_generate_hw_ls(cs, pipeline, vs);
4587
else if (vs->info.vs.as_es)
4588
radv_pipeline_generate_hw_es(cs, pipeline, vs);
4589
else if (vs->info.is_ngg)
4590
radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, vs);
4591
else
4592
radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, vs);
4593
}
4594
4595
static void
4596
radv_pipeline_generate_tess_shaders(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
4597
const struct radv_pipeline *pipeline)
4598
{
4599
struct radv_shader_variant *tes, *tcs;
4600
4601
tcs = pipeline->shaders[MESA_SHADER_TESS_CTRL];
4602
tes = pipeline->shaders[MESA_SHADER_TESS_EVAL];
4603
4604
if (tes) {
4605
if (tes->info.is_ngg) {
4606
radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, tes);
4607
} else if (tes->info.tes.as_es)
4608
radv_pipeline_generate_hw_es(cs, pipeline, tes);
4609
else
4610
radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, tes);
4611
}
4612
4613
radv_pipeline_generate_hw_hs(cs, pipeline, tcs);
4614
4615
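/* On GFX10, the legacy tessellation path without a GS still appears to want
 * VGT_GS_ONCHIP_CNTL programmed; these look like recommended default subgroup
 * sizes (assumption about the rationale).
 */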
if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 &&
4616
!radv_pipeline_has_gs(pipeline) && !radv_pipeline_has_ngg(pipeline)) {
4617
radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
4618
S_028A44_ES_VERTS_PER_SUBGRP(250) | S_028A44_GS_PRIMS_PER_SUBGRP(126) |
4619
S_028A44_GS_INST_PRIMS_IN_SUBGRP(126));
4620
}
4621
}
4622
4623
static void
4624
radv_pipeline_generate_tess_state(struct radeon_cmdbuf *ctx_cs,
4625
const struct radv_pipeline *pipeline,
4626
const VkGraphicsPipelineCreateInfo *pCreateInfo)
4627
{
4628
struct radv_shader_variant *tes = radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL);
4629
unsigned type = 0, partitioning = 0, topology = 0, distribution_mode = 0;
4630
unsigned num_tcs_input_cp, num_tcs_output_cp, num_patches;
4631
unsigned ls_hs_config;
4632
4633
num_tcs_input_cp = pCreateInfo->pTessellationState->patchControlPoints;
4634
num_tcs_output_cp =
4635
pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.tcs_vertices_out; // TCS VERTICES OUT
4636
num_patches = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
4637
4638
ls_hs_config = S_028B58_NUM_PATCHES(num_patches) | S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
4639
S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
4640
4641
if (pipeline->device->physical_device->rad_info.chip_class >= GFX7) {
4642
radeon_set_context_reg_idx(ctx_cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config);
4643
} else {
4644
radeon_set_context_reg(ctx_cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);
4645
}
4646
4647
switch (tes->info.tes.primitive_mode) {
4648
case GL_TRIANGLES:
4649
type = V_028B6C_TESS_TRIANGLE;
4650
break;
4651
case GL_QUADS:
4652
type = V_028B6C_TESS_QUAD;
4653
break;
4654
case GL_ISOLINES:
4655
type = V_028B6C_TESS_ISOLINE;
4656
break;
4657
}
4658
4659
switch (tes->info.tes.spacing) {
4660
case TESS_SPACING_EQUAL:
4661
partitioning = V_028B6C_PART_INTEGER;
4662
break;
4663
case TESS_SPACING_FRACTIONAL_ODD:
4664
partitioning = V_028B6C_PART_FRAC_ODD;
4665
break;
4666
case TESS_SPACING_FRACTIONAL_EVEN:
4667
partitioning = V_028B6C_PART_FRAC_EVEN;
4668
break;
4669
default:
4670
break;
4671
}
4672
4673
bool ccw = tes->info.tes.ccw;
4674
const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state =
4675
vk_find_struct_const(pCreateInfo->pTessellationState,
4676
PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO);
4677
4678
if (domain_origin_state &&
4679
domain_origin_state->domainOrigin != VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT)
4680
ccw = !ccw;
4681
4682
if (tes->info.tes.point_mode)
4683
topology = V_028B6C_OUTPUT_POINT;
4684
else if (tes->info.tes.primitive_mode == GL_ISOLINES)
4685
topology = V_028B6C_OUTPUT_LINE;
4686
else if (ccw)
4687
topology = V_028B6C_OUTPUT_TRIANGLE_CCW;
4688
else
4689
topology = V_028B6C_OUTPUT_TRIANGLE_CW;
4690
4691
if (pipeline->device->physical_device->rad_info.has_distributed_tess) {
4692
if (pipeline->device->physical_device->rad_info.family == CHIP_FIJI ||
4693
pipeline->device->physical_device->rad_info.family >= CHIP_POLARIS10)
4694
distribution_mode = V_028B6C_TRAPEZOIDS;
4695
else
4696
distribution_mode = V_028B6C_DONUTS;
4697
} else
4698
distribution_mode = V_028B6C_NO_DIST;
4699
4700
radeon_set_context_reg(ctx_cs, R_028B6C_VGT_TF_PARAM,
4701
S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) |
4702
S_028B6C_TOPOLOGY(topology) |
4703
S_028B6C_DISTRIBUTION_MODE(distribution_mode));
4704
}
4705
4706
static void
4707
radv_pipeline_generate_hw_gs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
4708
const struct radv_pipeline *pipeline,
4709
const struct radv_shader_variant *gs)
4710
{
4711
const struct gfx9_gs_info *gs_state = &gs->info.gs_ring_info;
4712
unsigned gs_max_out_vertices;
4713
const uint8_t *num_components;
4714
uint8_t max_stream;
4715
unsigned offset;
4716
uint64_t va;
4717
4718
gs_max_out_vertices = gs->info.gs.vertices_out;
4719
max_stream = gs->info.gs.max_stream;
4720
num_components = gs->info.gs.num_stream_output_components;
4721
4722
offset = num_components[0] * gs_max_out_vertices;
4723
4724
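/* GSVS ring layout: each stream's data follows the previous one, so the
 * per-stream ring offsets accumulate (components * max vertices) for every
 * active stream.
 */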
radeon_set_context_reg_seq(ctx_cs, R_028A60_VGT_GSVS_RING_OFFSET_1, 3);
4725
radeon_emit(ctx_cs, offset);
4726
if (max_stream >= 1)
4727
offset += num_components[1] * gs_max_out_vertices;
4728
radeon_emit(ctx_cs, offset);
4729
if (max_stream >= 2)
4730
offset += num_components[2] * gs_max_out_vertices;
4731
radeon_emit(ctx_cs, offset);
4732
if (max_stream >= 3)
4733
offset += num_components[3] * gs_max_out_vertices;
4734
radeon_set_context_reg(ctx_cs, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
4735
4736
radeon_set_context_reg_seq(ctx_cs, R_028B5C_VGT_GS_VERT_ITEMSIZE, 4);
4737
radeon_emit(ctx_cs, num_components[0]);
4738
radeon_emit(ctx_cs, (max_stream >= 1) ? num_components[1] : 0);
4739
radeon_emit(ctx_cs, (max_stream >= 2) ? num_components[2] : 0);
4740
radeon_emit(ctx_cs, (max_stream >= 3) ? num_components[3] : 0);
4741
4742
uint32_t gs_num_invocations = gs->info.gs.invocations;
4743
radeon_set_context_reg(
4744
ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
4745
S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0));
4746
4747
radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
4748
gs_state->vgt_esgs_ring_itemsize);
4749
4750
va = radv_buffer_get_va(gs->bo) + gs->bo_offset;
4751
4752
if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) {
4753
if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
4754
radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 2);
4755
radeon_emit(cs, va >> 8);
4756
radeon_emit(cs, S_00B324_MEM_BASE(va >> 40));
4757
} else {
4758
radeon_set_sh_reg_seq(cs, R_00B210_SPI_SHADER_PGM_LO_ES, 2);
4759
radeon_emit(cs, va >> 8);
4760
radeon_emit(cs, S_00B214_MEM_BASE(va >> 40));
4761
}
4762
4763
radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
4764
radeon_emit(cs, gs->config.rsrc1);
4765
radeon_emit(cs, gs->config.rsrc2 | S_00B22C_LDS_SIZE(gs_state->lds_size));
4766
4767
radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL, gs_state->vgt_gs_onchip_cntl);
4768
radeon_set_context_reg(ctx_cs, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
4769
gs_state->vgt_gs_max_prims_per_subgroup);
4770
} else {
4771
radeon_set_sh_reg_seq(cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4);
4772
radeon_emit(cs, va >> 8);
4773
radeon_emit(cs, S_00B224_MEM_BASE(va >> 40));
4774
radeon_emit(cs, gs->config.rsrc1);
4775
radeon_emit(cs, gs->config.rsrc2);
4776
}
4777
4778
radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, pipeline->gs_copy_shader);
4779
}
4780
4781
static void
4782
radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
4783
const struct radv_pipeline *pipeline)
4784
{
4785
struct radv_shader_variant *gs;
4786
4787
gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
4788
if (!gs)
4789
return;
4790
4791
if (gs->info.is_ngg)
4792
radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, gs);
4793
else
4794
radv_pipeline_generate_hw_gs(ctx_cs, cs, pipeline, gs);
4795
4796
radeon_set_context_reg(ctx_cs, R_028B38_VGT_GS_MAX_VERT_OUT, gs->info.gs.vertices_out);
4797
}
4798
4799
static uint32_t
4800
offset_to_ps_input(uint32_t offset, bool flat_shade, bool explicit, bool float16)
4801
{
4802
uint32_t ps_input_cntl;
4803
if (offset <= AC_EXP_PARAM_OFFSET_31) {
4804
ps_input_cntl = S_028644_OFFSET(offset);
4805
if (flat_shade || explicit)
4806
ps_input_cntl |= S_028644_FLAT_SHADE(1);
4807
if (explicit) {
4808
/* Force parameter cache to be read in passthrough
4809
* mode.
4810
*/
4811
ps_input_cntl |= S_028644_OFFSET(1 << 5);
4812
}
4813
if (float16) {
4814
ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | S_028644_ATTR0_VALID(1);
4815
}
4816
} else {
4817
/* The input is a DEFAULT_VAL constant. */
4818
assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
4819
offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
4820
ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset);
4821
}
4822
return ps_input_cntl;
4823
}
4824
4825
static void
4826
radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, const struct radv_pipeline *pipeline)
4827
{
4828
struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
4829
const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
4830
uint32_t ps_input_cntl[32];
4831
4832
unsigned ps_offset = 0;
4833
4834
if (ps->info.ps.prim_id_input) {
4835
unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID];
4836
if (vs_offset != AC_EXP_PARAM_UNDEFINED) {
4837
ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false);
4838
++ps_offset;
4839
}
4840
}
4841
4842
if (ps->info.ps.layer_input || ps->info.needs_multiview_view_index) {
4843
unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_LAYER];
4844
if (vs_offset != AC_EXP_PARAM_UNDEFINED)
4845
ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false);
4846
else
4847
ps_input_cntl[ps_offset] =
4848
offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false, false);
4849
++ps_offset;
4850
}
4851
4852
if (ps->info.ps.viewport_index_input) {
4853
unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VIEWPORT];
4854
if (vs_offset != AC_EXP_PARAM_UNDEFINED)
4855
ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false);
4856
else
4857
ps_input_cntl[ps_offset] =
4858
offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false, false);
4859
++ps_offset;
4860
}
4861
4862
if (ps->info.ps.has_pcoord) {
4863
unsigned val;
4864
val = S_028644_PT_SPRITE_TEX(1) | S_028644_OFFSET(0x20);
4865
ps_input_cntl[ps_offset] = val;
4866
ps_offset++;
4867
}
4868
4869
if (ps->info.ps.num_input_clips_culls) {
4870
unsigned vs_offset;
4871
4872
vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST0];
4873
if (vs_offset != AC_EXP_PARAM_UNDEFINED) {
4874
ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false, false);
4875
++ps_offset;
4876
}
4877
4878
vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST1];
4879
if (vs_offset != AC_EXP_PARAM_UNDEFINED && ps->info.ps.num_input_clips_culls > 4) {
4880
ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false, false);
4881
++ps_offset;
4882
}
4883
}
4884
4885
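/* Remaining generic varyings: map each PS input to the corresponding VS
 * export slot, or to a constant default value if the VS never exports it.
 */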
for (unsigned i = 0; i < 32 && (1u << i) <= ps->info.ps.input_mask; ++i) {
4886
unsigned vs_offset;
4887
bool flat_shade;
4888
bool explicit;
4889
bool float16;
4890
if (!(ps->info.ps.input_mask & (1u << i)))
4891
continue;
4892
4893
vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VAR0 + i];
4894
if (vs_offset == AC_EXP_PARAM_UNDEFINED) {
4895
ps_input_cntl[ps_offset] = S_028644_OFFSET(0x20);
4896
++ps_offset;
4897
continue;
4898
}
4899
4900
flat_shade = !!(ps->info.ps.flat_shaded_mask & (1u << ps_offset));
4901
explicit = !!(ps->info.ps.explicit_shaded_mask & (1u << ps_offset));
4902
float16 = !!(ps->info.ps.float16_shaded_mask & (1u << ps_offset));
4903
4904
ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade, explicit, float16);
4905
++ps_offset;
4906
}
4907
4908
if (ps_offset) {
4909
radeon_set_context_reg_seq(ctx_cs, R_028644_SPI_PS_INPUT_CNTL_0, ps_offset);
4910
for (unsigned i = 0; i < ps_offset; i++) {
4911
radeon_emit(ctx_cs, ps_input_cntl[i]);
4912
}
4913
}
4914
}
4915
4916
static uint32_t
4917
radv_compute_db_shader_control(const struct radv_device *device,
4918
const struct radv_pipeline *pipeline,
4919
const struct radv_shader_variant *ps)
4920
{
4921
unsigned conservative_z_export = V_02880C_EXPORT_ANY_Z;
4922
unsigned z_order;
4923
if (ps->info.ps.early_fragment_test || !ps->info.ps.writes_memory)
4924
z_order = V_02880C_EARLY_Z_THEN_LATE_Z;
4925
else
4926
z_order = V_02880C_LATE_Z;
4927
4928
if (ps->info.ps.depth_layout == FRAG_DEPTH_LAYOUT_GREATER)
4929
conservative_z_export = V_02880C_EXPORT_GREATER_THAN_Z;
4930
else if (ps->info.ps.depth_layout == FRAG_DEPTH_LAYOUT_LESS)
4931
conservative_z_export = V_02880C_EXPORT_LESS_THAN_Z;
4932
4933
bool disable_rbplus = device->physical_device->rad_info.has_rbplus &&
4934
!device->physical_device->rad_info.rbplus_allowed;
4935
4936
/* Exporting gl_SampleMask shouldn't be needed when MSAA is disabled,
4937
* but skipping the export appears to break Project Cars (DXVK). See
4938
* https://bugs.freedesktop.org/show_bug.cgi?id=109401
4939
*/
4940
bool mask_export_enable = ps->info.ps.writes_sample_mask;
4941
4942
return S_02880C_Z_EXPORT_ENABLE(ps->info.ps.writes_z) |
4943
S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(ps->info.ps.writes_stencil) |
4944
S_02880C_KILL_ENABLE(!!ps->info.ps.can_discard) |
4945
S_02880C_MASK_EXPORT_ENABLE(mask_export_enable) |
4946
S_02880C_CONSERVATIVE_Z_EXPORT(conservative_z_export) | S_02880C_Z_ORDER(z_order) |
4947
S_02880C_DEPTH_BEFORE_SHADER(ps->info.ps.early_fragment_test) |
4948
S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(ps->info.ps.post_depth_coverage) |
4949
S_02880C_EXEC_ON_HIER_FAIL(ps->info.ps.writes_memory) |
4950
S_02880C_EXEC_ON_NOOP(ps->info.ps.writes_memory) |
4951
S_02880C_DUAL_QUAD_DISABLE(disable_rbplus);
4952
}
4953
4954
static void
4955
radv_pipeline_generate_fragment_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
4956
struct radv_pipeline *pipeline)
4957
{
4958
struct radv_shader_variant *ps;
4959
uint64_t va;
4960
assert(pipeline->shaders[MESA_SHADER_FRAGMENT]);
4961
4962
ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
4963
va = radv_buffer_get_va(ps->bo) + ps->bo_offset;
4964
4965
radeon_set_sh_reg_seq(cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4);
4966
radeon_emit(cs, va >> 8);
4967
radeon_emit(cs, S_00B024_MEM_BASE(va >> 40));
4968
radeon_emit(cs, ps->config.rsrc1);
4969
radeon_emit(cs, ps->config.rsrc2);
4970
4971
radeon_set_context_reg(ctx_cs, R_02880C_DB_SHADER_CONTROL,
4972
radv_compute_db_shader_control(pipeline->device, pipeline, ps));
4973
4974
radeon_set_context_reg(ctx_cs, R_0286CC_SPI_PS_INPUT_ENA, ps->config.spi_ps_input_ena);
4975
4976
radeon_set_context_reg(ctx_cs, R_0286D0_SPI_PS_INPUT_ADDR, ps->config.spi_ps_input_addr);
4977
4978
radeon_set_context_reg(
4979
ctx_cs, R_0286D8_SPI_PS_IN_CONTROL,
4980
S_0286D8_NUM_INTERP(ps->info.ps.num_interp) | S_0286D8_PS_W32_EN(ps->info.wave_size == 32));
4981
4982
radeon_set_context_reg(ctx_cs, R_0286E0_SPI_BARYC_CNTL, pipeline->graphics.spi_baryc_cntl);
4983
4984
radeon_set_context_reg(
4985
ctx_cs, R_028710_SPI_SHADER_Z_FORMAT,
4986
ac_get_spi_shader_z_format(ps->info.ps.writes_z, ps->info.ps.writes_stencil,
4987
ps->info.ps.writes_sample_mask));
4988
}
4989
4990
static void
4991
radv_pipeline_generate_vgt_vertex_reuse(struct radeon_cmdbuf *ctx_cs,
4992
const struct radv_pipeline *pipeline)
4993
{
4994
if (pipeline->device->physical_device->rad_info.family < CHIP_POLARIS10 ||
4995
pipeline->device->physical_device->rad_info.chip_class >= GFX10)
4996
return;
4997
4998
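/* The default vertex reuse depth is 30; with tessellation and fractional-odd
 * spacing, Polaris-class parts reportedly need a reduced depth of 14,
 * presumably for correctness (assumption about the exact reason).
 */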
unsigned vtx_reuse_depth = 30;
4999
if (radv_pipeline_has_tess(pipeline) &&
5000
radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.tes.spacing ==
5001
TESS_SPACING_FRACTIONAL_ODD) {
5002
vtx_reuse_depth = 14;
5003
}
5004
radeon_set_context_reg(ctx_cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
5005
S_028C58_VTX_REUSE_DEPTH(vtx_reuse_depth));
5006
}
5007
5008
static void
5009
radv_pipeline_generate_vgt_shader_config(struct radeon_cmdbuf *ctx_cs,
5010
const struct radv_pipeline *pipeline)
5011
{
5012
uint32_t stages = 0;
5013
if (radv_pipeline_has_tess(pipeline)) {
5014
stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1);
5015
5016
if (radv_pipeline_has_gs(pipeline))
5017
stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) | S_028B54_GS_EN(1);
5018
else if (radv_pipeline_has_ngg(pipeline))
5019
stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS);
5020
else
5021
stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
5022
} else if (radv_pipeline_has_gs(pipeline)) {
5023
stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | S_028B54_GS_EN(1);
5024
} else if (radv_pipeline_has_ngg(pipeline)) {
5025
stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL);
5026
}
5027
5028
if (radv_pipeline_has_ngg(pipeline)) {
5029
stages |= S_028B54_PRIMGEN_EN(1);
5030
if (pipeline->streamout_shader)
5031
stages |= S_028B54_NGG_WAVE_ID_EN(1);
5032
if (radv_pipeline_has_ngg_passthrough(pipeline))
5033
stages |= S_028B54_PRIMGEN_PASSTHRU_EN(1);
5034
} else if (radv_pipeline_has_gs(pipeline)) {
5035
stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
5036
}
5037
5038
if (pipeline->device->physical_device->rad_info.chip_class >= GFX9)
5039
stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
5040
5041
if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
5042
uint8_t hs_size = 64, gs_size = 64, vs_size = 64;
5043
5044
if (radv_pipeline_has_tess(pipeline))
5045
hs_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.wave_size;
5046
5047
if (pipeline->shaders[MESA_SHADER_GEOMETRY]) {
5048
vs_size = gs_size = pipeline->shaders[MESA_SHADER_GEOMETRY]->info.wave_size;
5049
if (radv_pipeline_has_gs_copy_shader(pipeline))
5050
vs_size = pipeline->gs_copy_shader->info.wave_size;
5051
} else if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
5052
vs_size = pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.wave_size;
5053
else if (pipeline->shaders[MESA_SHADER_VERTEX])
5054
vs_size = pipeline->shaders[MESA_SHADER_VERTEX]->info.wave_size;
5055
5056
if (radv_pipeline_has_ngg(pipeline)) {
5057
assert(!radv_pipeline_has_gs_copy_shader(pipeline));
5058
gs_size = vs_size;
5059
}
5060
5061
/* legacy GS only supports Wave64 */
5062
stages |= S_028B54_HS_W32_EN(hs_size == 32 ? 1 : 0) |
5063
S_028B54_GS_W32_EN(gs_size == 32 ? 1 : 0) |
5064
S_028B54_VS_W32_EN(vs_size == 32 ? 1 : 0);
5065
}
5066
5067
radeon_set_context_reg(ctx_cs, R_028B54_VGT_SHADER_STAGES_EN, stages);
5068
}
5069
5070
static void
5071
radv_pipeline_generate_cliprect_rule(struct radeon_cmdbuf *ctx_cs,
5072
const VkGraphicsPipelineCreateInfo *pCreateInfo)
5073
{
5074
const VkPipelineDiscardRectangleStateCreateInfoEXT *discard_rectangle_info =
5075
vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT);
5076
uint32_t cliprect_rule = 0;
5077
5078
if (!discard_rectangle_info) {
5079
cliprect_rule = 0xffff;
5080
} else {
5081
for (unsigned i = 0; i < (1u << MAX_DISCARD_RECTANGLES); ++i) {
5082
/* Interpret i as the set of discard rectangles that contain the
5083
* pixel, and set bit i in the rule if a pixel covered by exactly
5084
* that combination of rectangles should pass the cliprect
5085
* test.
5086
*/
5087
unsigned relevant_subset = i & ((1u << discard_rectangle_info->discardRectangleCount) - 1);
5088
5089
if (discard_rectangle_info->discardRectangleMode ==
5090
VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT &&
5091
!relevant_subset)
5092
continue;
5093
5094
if (discard_rectangle_info->discardRectangleMode ==
5095
VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT &&
5096
relevant_subset)
5097
continue;
5098
5099
cliprect_rule |= 1u << i;
5100
}
5101
}
5102
5103
radeon_set_context_reg(ctx_cs, R_02820C_PA_SC_CLIPRECT_RULE, cliprect_rule);
5104
}
5105
5106
static void
5107
gfx10_pipeline_generate_ge_cntl(struct radeon_cmdbuf *ctx_cs, struct radv_pipeline *pipeline)
5108
{
5109
bool break_wave_at_eoi = false;
5110
unsigned primgroup_size;
5111
unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */
5112
5113
if (radv_pipeline_has_tess(pipeline)) {
5114
primgroup_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
5115
} else if (radv_pipeline_has_gs(pipeline)) {
5116
const struct gfx9_gs_info *gs_state =
5117
&pipeline->shaders[MESA_SHADER_GEOMETRY]->info.gs_ring_info;
5118
unsigned vgt_gs_onchip_cntl = gs_state->vgt_gs_onchip_cntl;
5119
primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl);
5120
} else {
5121
primgroup_size = 128; /* recommended without a GS and tess */
5122
}
5123
5124
if (radv_pipeline_has_tess(pipeline)) {
5125
if (pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id ||
5126
radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.uses_prim_id)
5127
break_wave_at_eoi = true;
5128
}
5129
5130
radeon_set_uconfig_reg(ctx_cs, R_03096C_GE_CNTL,
5131
S_03096C_PRIM_GRP_SIZE(primgroup_size) |
5132
S_03096C_VERT_GRP_SIZE(vertgroup_size) |
5133
S_03096C_PACKET_TO_ONE_PA(0) /* line stipple */ |
5134
S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi));
5135
}
5136
5137
static void
5138
radv_pipeline_generate_vgt_gs_out(struct radeon_cmdbuf *ctx_cs,
5139
const struct radv_pipeline *pipeline,
5140
const VkGraphicsPipelineCreateInfo *pCreateInfo,
5141
const struct radv_graphics_pipeline_create_info *extra)
5142
{
5143
uint32_t gs_out;
5144
5145
if (radv_pipeline_has_gs(pipeline)) {
5146
gs_out =
5147
si_conv_gl_prim_to_gs_out(pipeline->shaders[MESA_SHADER_GEOMETRY]->info.gs.output_prim);
5148
} else if (radv_pipeline_has_tess(pipeline)) {
5149
if (pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.point_mode) {
5150
gs_out = V_028A6C_POINTLIST;
5151
} else {
5152
gs_out = si_conv_gl_prim_to_gs_out(
5153
pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.primitive_mode);
5154
}
5155
} else {
5156
gs_out = si_conv_prim_to_gs_out(pCreateInfo->pInputAssemblyState->topology);
5157
}
5158
5159
if (extra && extra->use_rectlist) {
5160
gs_out = V_028A6C_TRISTRIP;
5161
if (radv_pipeline_has_ngg(pipeline))
5162
gs_out = V_028A6C_RECTLIST;
5163
}
5164
5165
radeon_set_context_reg(ctx_cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out);
5166
}
5167
5168
static bool
5169
gfx103_pipeline_vrs_coarse_shading(const struct radv_pipeline *pipeline)
5170
{
5171
struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
5172
struct radv_device *device = pipeline->device;
5173
5174
if (device->instance->debug_flags & RADV_DEBUG_NO_VRS_FLAT_SHADING)
5175
return false;
5176
5177
if (!ps->info.ps.allow_flat_shading)
5178
return false;
5179
5180
return true;
5181
}
5182
5183
static void
5184
gfx103_pipeline_generate_vrs_state(struct radeon_cmdbuf *ctx_cs,
5185
const struct radv_pipeline *pipeline,
5186
const VkGraphicsPipelineCreateInfo *pCreateInfo)
5187
{
5188
uint32_t mode = V_028064_VRS_COMB_MODE_PASSTHRU;
5189
uint8_t rate_x = 0, rate_y = 0;
5190
bool enable_vrs = false;
5191
5192
if (vk_find_struct_const(pCreateInfo->pNext,
5193
PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR) ||
5194
radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR)) {
5195
/* Enable draw call VRS because it's explicitly requested. */
5196
enable_vrs = true;
5197
} else if (gfx103_pipeline_vrs_coarse_shading(pipeline)) {
5198
/* Enable VRS coarse shading 2x2 if the driver determined that
5199
* it's safe to enable.
5200
*/
5201
mode = V_028064_VRS_COMB_MODE_OVERRIDE;
5202
rate_x = rate_y = 1;
5203
} else if (pipeline->device->force_vrs != RADV_FORCE_VRS_NONE) {
5204
/* Force enable vertex VRS if requested by the user. */
5205
radeon_set_context_reg(
5206
ctx_cs, R_028848_PA_CL_VRS_CNTL,
5207
S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE) |
5208
S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE));
5209
5210
/* If the shader is using discard, turn off coarse shading
5211
* because discard at 2x2 pixel granularity degrades quality
5212
* too much. MIN allows sample shading but not coarse shading.
5213
*/
5214
struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
5215
5216
mode = ps->info.ps.can_discard ? V_028064_VRS_COMB_MODE_MIN : V_028064_VRS_COMB_MODE_PASSTHRU;
5217
}
5218
5219
radeon_set_context_reg(ctx_cs, R_028A98_VGT_DRAW_PAYLOAD_CNTL, S_028A98_EN_VRS_RATE(enable_vrs));
5220
5221
radeon_set_context_reg(ctx_cs, R_028064_DB_VRS_OVERRIDE_CNTL,
5222
S_028064_VRS_OVERRIDE_RATE_COMBINER_MODE(mode) |
5223
S_028064_VRS_OVERRIDE_RATE_X(rate_x) |
5224
S_028064_VRS_OVERRIDE_RATE_Y(rate_y));
5225
}
5226
5227
static void
5228
radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
5229
const VkGraphicsPipelineCreateInfo *pCreateInfo,
5230
const struct radv_graphics_pipeline_create_info *extra,
5231
const struct radv_blend_state *blend)
5232
{
5233
struct radeon_cmdbuf *ctx_cs = &pipeline->ctx_cs;
5234
struct radeon_cmdbuf *cs = &pipeline->cs;
5235
5236
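/* Both command streams share one allocation: the SH-register stream (cs)
 * occupies the first cs->max_dw dwords and the context-register stream
 * (ctx_cs) follows it.
 */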
cs->max_dw = 64;
5237
ctx_cs->max_dw = 256;
5238
cs->buf = malloc(4 * (cs->max_dw + ctx_cs->max_dw));
5239
ctx_cs->buf = cs->buf + cs->max_dw;
5240
5241
radv_pipeline_generate_depth_stencil_state(ctx_cs, pipeline, pCreateInfo, extra);
5242
radv_pipeline_generate_blend_state(ctx_cs, pipeline, blend);
5243
radv_pipeline_generate_raster_state(ctx_cs, pipeline, pCreateInfo);
5244
radv_pipeline_generate_multisample_state(ctx_cs, pipeline);
5245
radv_pipeline_generate_vgt_gs_mode(ctx_cs, pipeline);
5246
radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline);
5247
5248
if (radv_pipeline_has_tess(pipeline)) {
5249
radv_pipeline_generate_tess_shaders(ctx_cs, cs, pipeline);
5250
radv_pipeline_generate_tess_state(ctx_cs, pipeline, pCreateInfo);
5251
}
5252
5253
radv_pipeline_generate_geometry_shader(ctx_cs, cs, pipeline);
5254
radv_pipeline_generate_fragment_shader(ctx_cs, cs, pipeline);
5255
radv_pipeline_generate_ps_inputs(ctx_cs, pipeline);
5256
radv_pipeline_generate_vgt_vertex_reuse(ctx_cs, pipeline);
5257
radv_pipeline_generate_vgt_shader_config(ctx_cs, pipeline);
5258
radv_pipeline_generate_cliprect_rule(ctx_cs, pCreateInfo);
5259
radv_pipeline_generate_vgt_gs_out(ctx_cs, pipeline, pCreateInfo, extra);
5260
5261
if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 &&
5262
!radv_pipeline_has_ngg(pipeline))
5263
gfx10_pipeline_generate_ge_cntl(ctx_cs, pipeline);
5264
5265
if (pipeline->device->physical_device->rad_info.chip_class >= GFX10_3)
5266
gfx103_pipeline_generate_vrs_state(ctx_cs, pipeline, pCreateInfo);
5267
5268
pipeline->ctx_cs_hash = _mesa_hash_data(ctx_cs->buf, ctx_cs->cdw * 4);
5269
5270
assert(ctx_cs->cdw <= ctx_cs->max_dw);
5271
assert(cs->cdw <= cs->max_dw);
5272
}
5273
5274
static void
5275
radv_pipeline_init_vertex_input_state(struct radv_pipeline *pipeline,
5276
const VkGraphicsPipelineCreateInfo *pCreateInfo)
5277
{
5278
const struct radv_shader_info *info = &radv_get_shader(pipeline, MESA_SHADER_VERTEX)->info;
5279
const VkPipelineVertexInputStateCreateInfo *vi_info = pCreateInfo->pVertexInputState;
5280
5281
for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
5282
const VkVertexInputBindingDescription *desc = &vi_info->pVertexBindingDescriptions[i];
5283
5284
pipeline->binding_stride[desc->binding] = desc->stride;
5285
}
5286
5287
for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) {
5288
const VkVertexInputAttributeDescription *desc = &vi_info->pVertexAttributeDescriptions[i];
5289
5290
uint32_t end = desc->offset + vk_format_get_blocksize(desc->format);
5291
pipeline->attrib_ends[desc->location] = end;
5292
if (pipeline->binding_stride[desc->binding])
5293
pipeline->attrib_index_offset[desc->location] =
5294
desc->offset / pipeline->binding_stride[desc->binding];
5295
pipeline->attrib_bindings[desc->location] = desc->binding;
5296
}
5297
5298
pipeline->use_per_attribute_vb_descs = info->vs.use_per_attribute_vb_descs;
5299
pipeline->vb_desc_usage_mask = info->vs.vb_desc_usage_mask;
5300
pipeline->vb_desc_alloc_size = util_bitcount(pipeline->vb_desc_usage_mask) * 16;
5301
}
5302
5303
static struct radv_shader_variant *
5304
radv_pipeline_get_streamout_shader(struct radv_pipeline *pipeline)
5305
{
5306
int i;
5307
5308
for (i = MESA_SHADER_GEOMETRY; i >= MESA_SHADER_VERTEX; i--) {
5309
struct radv_shader_variant *shader = radv_get_shader(pipeline, i);
5310
5311
if (shader && shader->info.so.num_outputs > 0)
5312
return shader;
5313
}
5314
5315
return NULL;
5316
}
5317
5318
static void
5319
radv_pipeline_init_shader_stages_state(struct radv_pipeline *pipeline)
5320
{
5321
struct radv_device *device = pipeline->device;
5322
5323
for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
5324
pipeline->user_data_0[i] = radv_pipeline_stage_to_user_data_0(
5325
pipeline, i, device->physical_device->rad_info.chip_class);
5326
5327
if (pipeline->shaders[i]) {
5328
pipeline->need_indirect_descriptor_sets |=
5329
pipeline->shaders[i]->info.need_indirect_descriptor_sets;
5330
}
5331
}
5332
5333
struct radv_userdata_info *loc =
5334
radv_lookup_user_sgpr(pipeline, MESA_SHADER_VERTEX, AC_UD_VS_BASE_VERTEX_START_INSTANCE);
5335
if (loc->sgpr_idx != -1) {
5336
pipeline->graphics.vtx_base_sgpr = pipeline->user_data_0[MESA_SHADER_VERTEX];
5337
pipeline->graphics.vtx_base_sgpr += loc->sgpr_idx * 4;
5338
pipeline->graphics.vtx_emit_num = loc->num_sgprs;
5339
pipeline->graphics.uses_drawid =
5340
radv_get_shader(pipeline, MESA_SHADER_VERTEX)->info.vs.needs_draw_id;
5341
pipeline->graphics.uses_baseinstance =
5342
radv_get_shader(pipeline, MESA_SHADER_VERTEX)->info.vs.needs_base_instance;
5343
}
5344
}
5345
5346
static VkResult
5347
radv_pipeline_init(struct radv_pipeline *pipeline, struct radv_device *device,
5348
struct radv_pipeline_cache *cache,
5349
const VkGraphicsPipelineCreateInfo *pCreateInfo,
5350
const struct radv_graphics_pipeline_create_info *extra)
5351
{
5352
VkResult result;
5353
5354
pipeline->device = device;
5355
pipeline->layout = radv_pipeline_layout_from_handle(pCreateInfo->layout);
5356
pipeline->graphics.last_vgt_api_stage = MESA_SHADER_NONE;
5357
assert(pipeline->layout);
5358
5359
struct radv_blend_state blend = radv_pipeline_init_blend_state(pipeline, pCreateInfo, extra);
5360
5361
const VkPipelineCreationFeedbackCreateInfoEXT *creation_feedback =
5362
vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT);
5363
radv_init_feedback(creation_feedback);
5364
5365
VkPipelineCreationFeedbackEXT *pipeline_feedback =
5366
creation_feedback ? creation_feedback->pPipelineCreationFeedback : NULL;
5367
5368
const VkPipelineShaderStageCreateInfo *pStages[MESA_SHADER_STAGES] = {
5369
0,
5370
};
5371
VkPipelineCreationFeedbackEXT *stage_feedbacks[MESA_SHADER_STAGES] = {0};
5372
for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
5373
gl_shader_stage stage = ffs(pCreateInfo->pStages[i].stage) - 1;
5374
pStages[stage] = &pCreateInfo->pStages[i];
5375
if (creation_feedback)
5376
stage_feedbacks[stage] = &creation_feedback->pPipelineStageCreationFeedbacks[i];
5377
}
5378
5379
struct radv_pipeline_key key =
5380
radv_generate_graphics_pipeline_key(pipeline, pCreateInfo, &blend);
5381
5382
result = radv_create_shaders(pipeline, device, cache, &key, pStages, pCreateInfo->flags,
5383
pipeline_feedback, stage_feedbacks);
5384
if (result != VK_SUCCESS)
5385
return result;
5386
5387
pipeline->graphics.spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
5388
radv_pipeline_init_multisample_state(pipeline, &blend, pCreateInfo);
5389
radv_pipeline_init_input_assembly_state(pipeline, pCreateInfo, extra);
5390
radv_pipeline_init_dynamic_state(pipeline, pCreateInfo, extra);
5391
radv_pipeline_init_raster_state(pipeline, pCreateInfo);
5392
radv_pipeline_init_depth_stencil_state(pipeline, pCreateInfo);
5393
5394
if (pipeline->device->physical_device->rad_info.chip_class >= GFX10_3)
5395
gfx103_pipeline_init_vrs_state(pipeline, pCreateInfo);
5396
5397
/* Ensure that some export memory is always allocated, for two reasons:
5398
*
5399
* 1) Correctness: The hardware ignores the EXEC mask if no export
5400
* memory is allocated, so KILL and alpha test do not work correctly
5401
* without this.
5402
* 2) Performance: Every shader needs at least a NULL export, even when
5403
* it writes no color/depth output. The NULL export instruction
5404
* stalls without this setting.
5405
*
5406
* Don't add this to CB_SHADER_MASK.
5407
*
5408
* GFX10 supports pixel shaders without exports by setting both the
5409
* color and Z formats to SPI_SHADER_ZERO. The hw will skip export
5410
* instructions if any are present.
5411
*/
5412
struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
5413
if ((pipeline->device->physical_device->rad_info.chip_class <= GFX9 ||
5414
ps->info.ps.can_discard) &&
5415
!blend.spi_shader_col_format) {
5416
if (!ps->info.ps.writes_z && !ps->info.ps.writes_stencil && !ps->info.ps.writes_sample_mask)
5417
blend.spi_shader_col_format = V_028714_SPI_SHADER_32_R;
5418
}
5419
5420
if (extra && (extra->custom_blend_mode == V_028808_CB_ELIMINATE_FAST_CLEAR ||
5421
extra->custom_blend_mode == V_028808_CB_FMASK_DECOMPRESS ||
5422
extra->custom_blend_mode == V_028808_CB_DCC_DECOMPRESS ||
5423
extra->custom_blend_mode == V_028808_CB_RESOLVE)) {
5424
/* According to the CB spec, CB_SHADER_MASK should be
5425
* set to enable writes to all four channels of MRT0.
5426
*/
5427
blend.cb_shader_mask = 0xf;
5428
}
5429
5430
pipeline->graphics.col_format = blend.spi_shader_col_format;
5431
pipeline->graphics.cb_target_mask = blend.cb_target_mask;
5432
5433
if (radv_pipeline_has_gs(pipeline) && !radv_pipeline_has_ngg(pipeline)) {
5434
struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
5435
5436
radv_pipeline_init_gs_ring_state(pipeline, &gs->info.gs_ring_info);
5437
}
5438
5439
if (radv_pipeline_has_tess(pipeline)) {
5440
pipeline->graphics.tess_patch_control_points =
5441
pCreateInfo->pTessellationState->patchControlPoints;
5442
}
5443
5444
radv_pipeline_init_vertex_input_state(pipeline, pCreateInfo);
5445
radv_pipeline_init_binning_state(pipeline, pCreateInfo, &blend);
5446
radv_pipeline_init_shader_stages_state(pipeline);
5447
radv_pipeline_init_scratch(device, pipeline);
5448
5449
/* Find the last vertex shader stage that eventually uses streamout. */
5450
pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline);
5451
5452
pipeline->graphics.is_ngg = radv_pipeline_has_ngg(pipeline);
5453
pipeline->graphics.has_ngg_culling =
5454
pipeline->graphics.is_ngg &&
5455
pipeline->shaders[pipeline->graphics.last_vgt_api_stage]->info.has_ngg_culling;
5456
5457
radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend);
5458
5459
return result;
5460
}
5461
5462
VkResult
5463
radv_graphics_pipeline_create(VkDevice _device, VkPipelineCache _cache,
5464
const VkGraphicsPipelineCreateInfo *pCreateInfo,
5465
const struct radv_graphics_pipeline_create_info *extra,
5466
const VkAllocationCallbacks *pAllocator, VkPipeline *pPipeline)
5467
{
5468
RADV_FROM_HANDLE(radv_device, device, _device);
5469
RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
5470
struct radv_pipeline *pipeline;
5471
VkResult result;
5472
5473
pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
5474
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
5475
if (pipeline == NULL)
5476
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
5477
5478
vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
5479
5480
result = radv_pipeline_init(pipeline, device, cache, pCreateInfo, extra);
5481
if (result != VK_SUCCESS) {
5482
radv_pipeline_destroy(device, pipeline, pAllocator);
5483
return result;
5484
}
5485
5486
*pPipeline = radv_pipeline_to_handle(pipeline);
5487
5488
return VK_SUCCESS;
5489
}
5490
5491
VkResult
5492
radv_CreateGraphicsPipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t count,
5493
const VkGraphicsPipelineCreateInfo *pCreateInfos,
5494
const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)
5495
{
5496
VkResult result = VK_SUCCESS;
5497
unsigned i = 0;
5498
5499
for (; i < count; i++) {
5500
VkResult r;
5501
r = radv_graphics_pipeline_create(_device, pipelineCache, &pCreateInfos[i], NULL, pAllocator,
5502
&pPipelines[i]);
5503
if (r != VK_SUCCESS) {
5504
result = r;
5505
pPipelines[i] = VK_NULL_HANDLE;
5506
5507
if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
5508
break;
5509
}
5510
}
5511
5512
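/* Entries we never reached (e.g. after an early return on failure) must
 * still be written; the spec expects them to be VK_NULL_HANDLE.
 */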
for (; i < count; ++i)
5513
pPipelines[i] = VK_NULL_HANDLE;
5514
5515
return result;
5516
}
5517
5518
static void
5519
radv_pipeline_generate_hw_cs(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline)
5520
{
5521
struct radv_shader_variant *shader = pipeline->shaders[MESA_SHADER_COMPUTE];
5522
uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
5523
struct radv_device *device = pipeline->device;
5524
5525
radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
5526
radeon_emit(cs, va >> 8);
5527
radeon_emit(cs, S_00B834_DATA(va >> 40));
5528
5529
radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
5530
radeon_emit(cs, shader->config.rsrc1);
5531
radeon_emit(cs, shader->config.rsrc2);
5532
if (device->physical_device->rad_info.chip_class >= GFX10) {
5533
radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, shader->config.rsrc3);
5534
}
5535
}
5536
5537
static void
5538
radv_pipeline_generate_compute_state(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline)
5539
{
5540
struct radv_shader_variant *shader = pipeline->shaders[MESA_SHADER_COMPUTE];
5541
struct radv_device *device = pipeline->device;
5542
unsigned threads_per_threadgroup;
5543
unsigned threadgroups_per_cu = 1;
5544
unsigned waves_per_threadgroup;
5545
unsigned max_waves_per_sh = 0;
5546
5547
/* Calculate best compute resource limits. */
5548
threads_per_threadgroup =
5549
shader->info.cs.block_size[0] * shader->info.cs.block_size[1] * shader->info.cs.block_size[2];
5550
waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, shader->info.wave_size);
5551
5552
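/* On GFX10+, allow two threadgroups per CU when each group is only a single
 * wave, which presumably helps keep the CU busy (assumption about the
 * rationale; the limit is passed to ac_get_compute_resource_limits below).
 */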
if (device->physical_device->rad_info.chip_class >= GFX10 && waves_per_threadgroup == 1)
5553
threadgroups_per_cu = 2;
5554
5555
radeon_set_sh_reg(
5556
cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
5557
ac_get_compute_resource_limits(&device->physical_device->rad_info, waves_per_threadgroup,
5558
max_waves_per_sh, threadgroups_per_cu));
5559
5560
radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
5561
radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[0]));
5562
radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[1]));
5563
radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[2]));
5564
}
5565
5566
static void
5567
radv_compute_generate_pm4(struct radv_pipeline *pipeline)
5568
{
5569
struct radv_device *device = pipeline->device;
5570
struct radeon_cmdbuf *cs = &pipeline->cs;
5571
5572
cs->max_dw = device->physical_device->rad_info.chip_class >= GFX10 ? 19 : 16;
5573
cs->buf = malloc(cs->max_dw * 4);
5574
5575
radv_pipeline_generate_hw_cs(cs, pipeline);
5576
radv_pipeline_generate_compute_state(cs, pipeline);
5577
5578
assert(pipeline->cs.cdw <= pipeline->cs.max_dw);
5579
}
5580
5581
static struct radv_pipeline_key
5582
radv_generate_compute_pipeline_key(struct radv_pipeline *pipeline,
5583
const VkComputePipelineCreateInfo *pCreateInfo)
5584
{
5585
const VkPipelineShaderStageCreateInfo *stage = &pCreateInfo->stage;
5586
struct radv_pipeline_key key;
5587
memset(&key, 0, sizeof(key));
5588
5589
if (pCreateInfo->flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)
5590
key.optimisations_disabled = 1;
5591
5592
const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT *subgroup_size =
5593
vk_find_struct_const(stage->pNext,
5594
PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT);
5595
5596
if (subgroup_size) {
5597
assert(subgroup_size->requiredSubgroupSize == 32 ||
5598
subgroup_size->requiredSubgroupSize == 64);
5599
key.compute_subgroup_size = subgroup_size->requiredSubgroupSize;
5600
} else if (stage->flags & VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT) {
5601
key.require_full_subgroups = true;
5602
}
5603
5604
return key;
5605
}
5606
5607
static VkResult
5608
radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache,
5609
const VkComputePipelineCreateInfo *pCreateInfo,
5610
const VkAllocationCallbacks *pAllocator, VkPipeline *pPipeline)
5611
{
5612
RADV_FROM_HANDLE(radv_device, device, _device);
5613
RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
5614
const VkPipelineShaderStageCreateInfo *pStages[MESA_SHADER_STAGES] = {
5615
0,
5616
};
5617
VkPipelineCreationFeedbackEXT *stage_feedbacks[MESA_SHADER_STAGES] = {0};
5618
struct radv_pipeline *pipeline;
5619
VkResult result;
5620
5621
pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
5622
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
5623
if (pipeline == NULL)
5624
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
5625
5626
vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
5627
5628
pipeline->device = device;
5629
pipeline->graphics.last_vgt_api_stage = MESA_SHADER_NONE;
5630
pipeline->layout = radv_pipeline_layout_from_handle(pCreateInfo->layout);
5631
assert(pipeline->layout);
5632
5633
const VkPipelineCreationFeedbackCreateInfoEXT *creation_feedback =
5634
vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT);
5635
radv_init_feedback(creation_feedback);
5636
5637
VkPipelineCreationFeedbackEXT *pipeline_feedback =
5638
creation_feedback ? creation_feedback->pPipelineCreationFeedback : NULL;
5639
if (creation_feedback)
5640
stage_feedbacks[MESA_SHADER_COMPUTE] = &creation_feedback->pPipelineStageCreationFeedbacks[0];
5641
5642
pStages[MESA_SHADER_COMPUTE] = &pCreateInfo->stage;
5643
5644
struct radv_pipeline_key key = radv_generate_compute_pipeline_key(pipeline, pCreateInfo);
5645
5646
result = radv_create_shaders(pipeline, device, cache, &key, pStages, pCreateInfo->flags,
5647
pipeline_feedback, stage_feedbacks);
5648
if (result != VK_SUCCESS) {
5649
radv_pipeline_destroy(device, pipeline, pAllocator);
5650
return result;
5651
}
5652
5653
pipeline->user_data_0[MESA_SHADER_COMPUTE] = radv_pipeline_stage_to_user_data_0(
5654
pipeline, MESA_SHADER_COMPUTE, device->physical_device->rad_info.chip_class);
5655
pipeline->need_indirect_descriptor_sets |=
5656
pipeline->shaders[MESA_SHADER_COMPUTE]->info.need_indirect_descriptor_sets;
5657
radv_pipeline_init_scratch(device, pipeline);
5658
5659
radv_compute_generate_pm4(pipeline);
5660
5661
*pPipeline = radv_pipeline_to_handle(pipeline);
5662
5663
return VK_SUCCESS;
5664
}
5665
5666
VkResult
5667
radv_CreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t count,
5668
const VkComputePipelineCreateInfo *pCreateInfos,
5669
const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)
5670
{
5671
VkResult result = VK_SUCCESS;
5672
5673
unsigned i = 0;
5674
for (; i < count; i++) {
5675
VkResult r;
5676
r = radv_compute_pipeline_create(_device, pipelineCache, &pCreateInfos[i], pAllocator,
5677
&pPipelines[i]);
5678
if (r != VK_SUCCESS) {
5679
result = r;
5680
pPipelines[i] = VK_NULL_HANDLE;
5681
5682
if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
5683
break;
5684
}
5685
}
5686
5687
for (; i < count; ++i)
5688
pPipelines[i] = VK_NULL_HANDLE;
5689
5690
return result;
5691
}
5692
5693
static uint32_t
5694
radv_get_executable_count(const struct radv_pipeline *pipeline)
5695
{
5696
uint32_t ret = 0;
5697
for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
5698
if (!pipeline->shaders[i])
5699
continue;
5700
5701
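/* A legacy (non-NGG) GS is exposed as two executables: the GS itself and
 * the separate GS copy shader (see radv_get_shader_from_executable_index).
 */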
if (i == MESA_SHADER_GEOMETRY && !radv_pipeline_has_ngg(pipeline)) {
5702
ret += 2u;
5703
} else {
5704
ret += 1u;
5705
}
5706
}
5707
return ret;
5708
}
5709
5710
static struct radv_shader_variant *
5711
radv_get_shader_from_executable_index(const struct radv_pipeline *pipeline, int index,
5712
gl_shader_stage *stage)
5713
{
5714
for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
5715
if (!pipeline->shaders[i])
5716
continue;
5717
if (!index) {
5718
*stage = i;
5719
return pipeline->shaders[i];
5720
}
5721
5722
--index;
5723
5724
if (i == MESA_SHADER_GEOMETRY && !radv_pipeline_has_ngg(pipeline)) {
5725
if (!index) {
5726
*stage = i;
5727
return pipeline->gs_copy_shader;
5728
}
5729
--index;
5730
}
5731
}
5732
5733
*stage = -1;
5734
return NULL;
5735
}
5736
5737
/* Basically strlcpy (which does not exist on Linux), specialized for
5738
* descriptions. */
5739
static void
5740
desc_copy(char *desc, const char *src)
5741
{
5742
int len = strlen(src);
5743
assert(len < VK_MAX_DESCRIPTION_SIZE);
5744
memcpy(desc, src, len);
5745
memset(desc + len, 0, VK_MAX_DESCRIPTION_SIZE - len);
5746
}
5747
5748
VkResult
radv_GetPipelineExecutablePropertiesKHR(VkDevice _device, const VkPipelineInfoKHR *pPipelineInfo,
                                        uint32_t *pExecutableCount,
                                        VkPipelineExecutablePropertiesKHR *pProperties)
{
   RADV_FROM_HANDLE(radv_pipeline, pipeline, pPipelineInfo->pipeline);
   const uint32_t total_count = radv_get_executable_count(pipeline);

   if (!pProperties) {
      *pExecutableCount = total_count;
      return VK_SUCCESS;
   }

   const uint32_t count = MIN2(total_count, *pExecutableCount);
   for (unsigned i = 0, executable_idx = 0; i < MESA_SHADER_STAGES && executable_idx < count; ++i) {
      if (!pipeline->shaders[i])
         continue;
      pProperties[executable_idx].stages = mesa_to_vk_shader_stage(i);
      const char *name = NULL;
      const char *description = NULL;
      switch (i) {
      case MESA_SHADER_VERTEX:
         name = "Vertex Shader";
         description = "Vulkan Vertex Shader";
         break;
      case MESA_SHADER_TESS_CTRL:
         if (!pipeline->shaders[MESA_SHADER_VERTEX]) {
            pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT;
            name = "Vertex + Tessellation Control Shaders";
            description = "Combined Vulkan Vertex and Tessellation Control Shaders";
         } else {
            name = "Tessellation Control Shader";
            description = "Vulkan Tessellation Control Shader";
         }
         break;
      case MESA_SHADER_TESS_EVAL:
         name = "Tessellation Evaluation Shader";
         description = "Vulkan Tessellation Evaluation Shader";
         break;
      case MESA_SHADER_GEOMETRY:
         if (radv_pipeline_has_tess(pipeline) && !pipeline->shaders[MESA_SHADER_TESS_EVAL]) {
            pProperties[executable_idx].stages |= VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT;
            name = "Tessellation Evaluation + Geometry Shaders";
            description = "Combined Vulkan Tessellation Evaluation and Geometry Shaders";
         } else if (!radv_pipeline_has_tess(pipeline) && !pipeline->shaders[MESA_SHADER_VERTEX]) {
            pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT;
            name = "Vertex + Geometry Shader";
            description = "Combined Vulkan Vertex and Geometry Shaders";
         } else {
            name = "Geometry Shader";
            description = "Vulkan Geometry Shader";
         }
         break;
      case MESA_SHADER_FRAGMENT:
         name = "Fragment Shader";
         description = "Vulkan Fragment Shader";
         break;
      case MESA_SHADER_COMPUTE:
         name = "Compute Shader";
         description = "Vulkan Compute Shader";
         break;
      }

      pProperties[executable_idx].subgroupSize = pipeline->shaders[i]->info.wave_size;
      desc_copy(pProperties[executable_idx].name, name);
      desc_copy(pProperties[executable_idx].description, description);

      ++executable_idx;
      if (i == MESA_SHADER_GEOMETRY && !radv_pipeline_has_ngg(pipeline)) {
         assert(pipeline->gs_copy_shader);
         if (executable_idx >= count)
            break;

         pProperties[executable_idx].stages = VK_SHADER_STAGE_GEOMETRY_BIT;
         pProperties[executable_idx].subgroupSize = 64;
         desc_copy(pProperties[executable_idx].name, "GS Copy Shader");
         desc_copy(pProperties[executable_idx].description,
                   "Extra shader stage that loads the GS output ringbuffer into the rasterizer");

         ++executable_idx;
      }
   }

   VkResult result = *pExecutableCount < total_count ? VK_INCOMPLETE : VK_SUCCESS;
   *pExecutableCount = count;
   return result;
}

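/* Report per-executable compile statistics: register usage, spills, code and
 * LDS/scratch sizes, maximum subgroup occupancy, and any extra statistics the
 * ACO backend collected.
 */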
VkResult
radv_GetPipelineExecutableStatisticsKHR(VkDevice _device,
                                        const VkPipelineExecutableInfoKHR *pExecutableInfo,
                                        uint32_t *pStatisticCount,
                                        VkPipelineExecutableStatisticKHR *pStatistics)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
   gl_shader_stage stage;
   struct radv_shader_variant *shader =
      radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);

   enum chip_class chip_class = device->physical_device->rad_info.chip_class;
   unsigned lds_increment = chip_class >= GFX7 ? 512 : 256;
   unsigned max_waves = radv_get_max_waves(device, shader, stage);

   VkPipelineExecutableStatisticKHR *s = pStatistics;
   VkPipelineExecutableStatisticKHR *end = s + (pStatistics ? *pStatisticCount : 0);
   VkResult result = VK_SUCCESS;

   if (s < end) {
      desc_copy(s->name, "SGPRs");
      desc_copy(s->description, "Number of SGPR registers allocated per subgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.num_sgprs;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "VGPRs");
      desc_copy(s->description, "Number of VGPR registers allocated per subgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.num_vgprs;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "Spilled SGPRs");
      desc_copy(s->description, "Number of SGPR registers spilled per subgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.spilled_sgprs;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "Spilled VGPRs");
      desc_copy(s->description, "Number of VGPR registers spilled per subgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.spilled_vgprs;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "PrivMem VGPRs");
      desc_copy(s->description, "Number of VGPRs stored in private memory per subgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->info.private_mem_vgprs;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "Code size");
      desc_copy(s->description, "Code size in bytes");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->exec_size;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "LDS size");
      desc_copy(s->description, "LDS size in bytes per workgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.lds_size * lds_increment;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "Scratch size");
      desc_copy(s->description, "Private memory in bytes per subgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.scratch_bytes_per_wave;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "Subgroups per SIMD");
      desc_copy(s->description, "The maximum number of subgroups in flight on a SIMD unit");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = max_waves;
   }
   ++s;

   if (shader->statistics) {
      for (unsigned i = 0; i < aco_num_statistics; i++) {
         const struct aco_compiler_statistic_info *info = &aco_statistic_infos[i];
         if (s < end) {
            desc_copy(s->name, info->name);
            desc_copy(s->description, info->desc);
            s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
            s->value.u64 = shader->statistics[i];
         }
         ++s;
      }
   }

   if (!pStatistics)
      *pStatisticCount = s - pStatistics;
   else if (s > end) {
      *pStatisticCount = end - pStatistics;
      result = VK_INCOMPLETE;
   } else {
      *pStatisticCount = s - pStatistics;
   }

   return result;
}

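/* Copy a NUL-terminated representation string into the caller-provided
 * buffer. If no buffer is given, only the required size is returned;
 * otherwise the string is truncated and VK_INCOMPLETE reported when it does
 * not fit.
 */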
static VkResult
radv_copy_representation(void *data, size_t *data_size, const char *src)
{
   size_t total_size = strlen(src) + 1;

   if (!data) {
      *data_size = total_size;
      return VK_SUCCESS;
   }

   size_t size = MIN2(total_size, *data_size);

   memcpy(data, src, size);
   if (size)
      *((char *)data + size - 1) = 0;
   return size < total_size ? VK_INCOMPLETE : VK_SUCCESS;
}

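/* Expose the internal representations of an executable: the optimized NIR,
 * the backend IR (LLVM IR or ACO IR), and the final disassembly.
 */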
VkResult
radv_GetPipelineExecutableInternalRepresentationsKHR(
   VkDevice device, const VkPipelineExecutableInfoKHR *pExecutableInfo,
   uint32_t *pInternalRepresentationCount,
   VkPipelineExecutableInternalRepresentationKHR *pInternalRepresentations)
{
   RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
   gl_shader_stage stage;
   struct radv_shader_variant *shader =
      radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);

   VkPipelineExecutableInternalRepresentationKHR *p = pInternalRepresentations;
   VkPipelineExecutableInternalRepresentationKHR *end =
      p + (pInternalRepresentations ? *pInternalRepresentationCount : 0);
   VkResult result = VK_SUCCESS;
   /* optimized NIR */
   if (p < end) {
      p->isText = true;
      desc_copy(p->name, "NIR Shader(s)");
      desc_copy(p->description, "The optimized NIR shader(s)");
      if (radv_copy_representation(p->pData, &p->dataSize, shader->nir_string) != VK_SUCCESS)
         result = VK_INCOMPLETE;
   }
   ++p;

   /* backend IR */
   if (p < end) {
      p->isText = true;
      if (radv_use_llvm_for_stage(pipeline->device, stage)) {
         desc_copy(p->name, "LLVM IR");
         desc_copy(p->description, "The LLVM IR after some optimizations");
      } else {
         desc_copy(p->name, "ACO IR");
         desc_copy(p->description, "The ACO IR after some optimizations");
      }
      if (radv_copy_representation(p->pData, &p->dataSize, shader->ir_string) != VK_SUCCESS)
         result = VK_INCOMPLETE;
   }
   ++p;

   /* Disassembler */
   if (p < end) {
      p->isText = true;
      desc_copy(p->name, "Assembly");
      desc_copy(p->description, "Final Assembly");
      if (radv_copy_representation(p->pData, &p->dataSize, shader->disasm_string) != VK_SUCCESS)
         result = VK_INCOMPLETE;
   }
   ++p;

   if (!pInternalRepresentations)
      *pInternalRepresentationCount = p - pInternalRepresentations;
   else if (p > end) {
      result = VK_INCOMPLETE;
      *pInternalRepresentationCount = end - pInternalRepresentations;
   } else {
      *pInternalRepresentationCount = p - pInternalRepresentations;
   }

   return result;
}
