Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/vulkan/radv_cmd_buffer.c
7102 views
1
/*
2
* Copyright © 2016 Red Hat.
3
* Copyright © 2016 Bas Nieuwenhuizen
4
*
5
* based in part on anv driver which is:
6
* Copyright © 2015 Intel Corporation
7
*
8
* Permission is hereby granted, free of charge, to any person obtaining a
9
* copy of this software and associated documentation files (the "Software"),
10
* to deal in the Software without restriction, including without limitation
11
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
12
* and/or sell copies of the Software, and to permit persons to whom the
13
* Software is furnished to do so, subject to the following conditions:
14
*
15
* The above copyright notice and this permission notice (including the next
16
* paragraph) shall be included in all copies or substantial portions of the
17
* Software.
18
*
19
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25
* IN THE SOFTWARE.
26
*/
27
28
#include "radv_cs.h"
29
#include "radv_debug.h"
30
#include "radv_meta.h"
31
#include "radv_private.h"
32
#include "radv_radeon_winsys.h"
33
#include "radv_shader.h"
34
#include "sid.h"
35
#include "vk_format.h"
36
#include "vk_util.h"
37
38
#include "ac_debug.h"
39
40
enum {
41
RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
42
RADV_PREFETCH_VS = (1 << 1),
43
RADV_PREFETCH_TCS = (1 << 2),
44
RADV_PREFETCH_TES = (1 << 3),
45
RADV_PREFETCH_GS = (1 << 4),
46
RADV_PREFETCH_PS = (1 << 5),
47
RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS | RADV_PREFETCH_TCS | RADV_PREFETCH_TES |
48
RADV_PREFETCH_GS | RADV_PREFETCH_PS)
49
};
50
51
enum {
52
RADV_RT_STAGE_BITS = (VK_SHADER_STAGE_RAYGEN_BIT_KHR | VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
53
VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | VK_SHADER_STAGE_MISS_BIT_KHR |
54
VK_SHADER_STAGE_INTERSECTION_BIT_KHR | VK_SHADER_STAGE_CALLABLE_BIT_KHR)
55
};
56
57
static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
58
struct radv_image *image, VkImageLayout src_layout,
59
bool src_render_loop, VkImageLayout dst_layout,
60
bool dst_render_loop, uint32_t src_family,
61
uint32_t dst_family, const VkImageSubresourceRange *range,
62
struct radv_sample_locations_state *sample_locs);
63
64
const struct radv_dynamic_state default_dynamic_state = {
65
.viewport =
66
{
67
.count = 0,
68
},
69
.scissor =
70
{
71
.count = 0,
72
},
73
.line_width = 1.0f,
74
.depth_bias =
75
{
76
.bias = 0.0f,
77
.clamp = 0.0f,
78
.slope = 0.0f,
79
},
80
.blend_constants = {0.0f, 0.0f, 0.0f, 0.0f},
81
.depth_bounds =
82
{
83
.min = 0.0f,
84
.max = 1.0f,
85
},
86
.stencil_compare_mask =
87
{
88
.front = ~0u,
89
.back = ~0u,
90
},
91
.stencil_write_mask =
92
{
93
.front = ~0u,
94
.back = ~0u,
95
},
96
.stencil_reference =
97
{
98
.front = 0u,
99
.back = 0u,
100
},
101
.line_stipple =
102
{
103
.factor = 0u,
104
.pattern = 0u,
105
},
106
.cull_mode = 0u,
107
.front_face = 0u,
108
.primitive_topology = 0u,
109
.fragment_shading_rate =
110
{
111
.size = {1u, 1u},
112
.combiner_ops = {VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR,
113
VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR},
114
},
115
.depth_bias_enable = 0u,
116
.primitive_restart_enable = 0u,
117
.rasterizer_discard_enable = 0u,
118
.logic_op = 0u,
119
.color_write_enable = 0xffffffffu,
120
};
121
122
static void
123
radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_dynamic_state *src)
124
{
125
struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
126
uint64_t copy_mask = src->mask;
127
uint64_t dest_mask = 0;
128
129
dest->discard_rectangle.count = src->discard_rectangle.count;
130
dest->sample_location.count = src->sample_location.count;
131
132
if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
133
if (dest->viewport.count != src->viewport.count) {
134
dest->viewport.count = src->viewport.count;
135
dest_mask |= RADV_DYNAMIC_VIEWPORT;
136
}
137
138
if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
139
src->viewport.count * sizeof(VkViewport))) {
140
typed_memcpy(dest->viewport.viewports, src->viewport.viewports, src->viewport.count);
141
dest_mask |= RADV_DYNAMIC_VIEWPORT;
142
}
143
}
144
145
if (copy_mask & RADV_DYNAMIC_SCISSOR) {
146
if (dest->scissor.count != src->scissor.count) {
147
dest->scissor.count = src->scissor.count;
148
dest_mask |= RADV_DYNAMIC_SCISSOR;
149
}
150
151
if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
152
src->scissor.count * sizeof(VkRect2D))) {
153
typed_memcpy(dest->scissor.scissors, src->scissor.scissors, src->scissor.count);
154
dest_mask |= RADV_DYNAMIC_SCISSOR;
155
}
156
}
157
158
if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
159
if (dest->line_width != src->line_width) {
160
dest->line_width = src->line_width;
161
dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
162
}
163
}
164
165
if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS) {
166
if (memcmp(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias))) {
167
dest->depth_bias = src->depth_bias;
168
dest_mask |= RADV_DYNAMIC_DEPTH_BIAS;
169
}
170
}
171
172
if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
173
if (memcmp(&dest->blend_constants, &src->blend_constants, sizeof(src->blend_constants))) {
174
typed_memcpy(dest->blend_constants, src->blend_constants, 4);
175
dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
176
}
177
}
178
179
if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS) {
180
if (memcmp(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds))) {
181
dest->depth_bounds = src->depth_bounds;
182
dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS;
183
}
184
}
185
186
if (copy_mask & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
187
if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
188
sizeof(src->stencil_compare_mask))) {
189
dest->stencil_compare_mask = src->stencil_compare_mask;
190
dest_mask |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
191
}
192
}
193
194
if (copy_mask & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
195
if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
196
sizeof(src->stencil_write_mask))) {
197
dest->stencil_write_mask = src->stencil_write_mask;
198
dest_mask |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
199
}
200
}
201
202
if (copy_mask & RADV_DYNAMIC_STENCIL_REFERENCE) {
203
if (memcmp(&dest->stencil_reference, &src->stencil_reference,
204
sizeof(src->stencil_reference))) {
205
dest->stencil_reference = src->stencil_reference;
206
dest_mask |= RADV_DYNAMIC_STENCIL_REFERENCE;
207
}
208
}
209
210
if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
211
if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
212
src->discard_rectangle.count * sizeof(VkRect2D))) {
213
typed_memcpy(dest->discard_rectangle.rectangles, src->discard_rectangle.rectangles,
214
src->discard_rectangle.count);
215
dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
216
}
217
}
218
219
if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
220
if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
221
dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
222
dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
223
memcmp(&dest->sample_location.locations, &src->sample_location.locations,
224
src->sample_location.count * sizeof(VkSampleLocationEXT))) {
225
dest->sample_location.per_pixel = src->sample_location.per_pixel;
226
dest->sample_location.grid_size = src->sample_location.grid_size;
227
typed_memcpy(dest->sample_location.locations, src->sample_location.locations,
228
src->sample_location.count);
229
dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
230
}
231
}
232
233
if (copy_mask & RADV_DYNAMIC_LINE_STIPPLE) {
234
if (memcmp(&dest->line_stipple, &src->line_stipple, sizeof(src->line_stipple))) {
235
dest->line_stipple = src->line_stipple;
236
dest_mask |= RADV_DYNAMIC_LINE_STIPPLE;
237
}
238
}
239
240
if (copy_mask & RADV_DYNAMIC_CULL_MODE) {
241
if (dest->cull_mode != src->cull_mode) {
242
dest->cull_mode = src->cull_mode;
243
dest_mask |= RADV_DYNAMIC_CULL_MODE;
244
}
245
}
246
247
if (copy_mask & RADV_DYNAMIC_FRONT_FACE) {
248
if (dest->front_face != src->front_face) {
249
dest->front_face = src->front_face;
250
dest_mask |= RADV_DYNAMIC_FRONT_FACE;
251
}
252
}
253
254
if (copy_mask & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) {
255
if (dest->primitive_topology != src->primitive_topology) {
256
dest->primitive_topology = src->primitive_topology;
257
dest_mask |= RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
258
}
259
}
260
261
if (copy_mask & RADV_DYNAMIC_DEPTH_TEST_ENABLE) {
262
if (dest->depth_test_enable != src->depth_test_enable) {
263
dest->depth_test_enable = src->depth_test_enable;
264
dest_mask |= RADV_DYNAMIC_DEPTH_TEST_ENABLE;
265
}
266
}
267
268
if (copy_mask & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) {
269
if (dest->depth_write_enable != src->depth_write_enable) {
270
dest->depth_write_enable = src->depth_write_enable;
271
dest_mask |= RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
272
}
273
}
274
275
if (copy_mask & RADV_DYNAMIC_DEPTH_COMPARE_OP) {
276
if (dest->depth_compare_op != src->depth_compare_op) {
277
dest->depth_compare_op = src->depth_compare_op;
278
dest_mask |= RADV_DYNAMIC_DEPTH_COMPARE_OP;
279
}
280
}
281
282
if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
283
if (dest->depth_bounds_test_enable != src->depth_bounds_test_enable) {
284
dest->depth_bounds_test_enable = src->depth_bounds_test_enable;
285
dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
286
}
287
}
288
289
if (copy_mask & RADV_DYNAMIC_STENCIL_TEST_ENABLE) {
290
if (dest->stencil_test_enable != src->stencil_test_enable) {
291
dest->stencil_test_enable = src->stencil_test_enable;
292
dest_mask |= RADV_DYNAMIC_STENCIL_TEST_ENABLE;
293
}
294
}
295
296
if (copy_mask & RADV_DYNAMIC_STENCIL_OP) {
297
if (memcmp(&dest->stencil_op, &src->stencil_op, sizeof(src->stencil_op))) {
298
dest->stencil_op = src->stencil_op;
299
dest_mask |= RADV_DYNAMIC_STENCIL_OP;
300
}
301
}
302
303
if (copy_mask & RADV_DYNAMIC_FRAGMENT_SHADING_RATE) {
304
if (memcmp(&dest->fragment_shading_rate, &src->fragment_shading_rate,
305
sizeof(src->fragment_shading_rate))) {
306
dest->fragment_shading_rate = src->fragment_shading_rate;
307
dest_mask |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
308
}
309
}
310
311
if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS_ENABLE) {
312
if (dest->depth_bias_enable != src->depth_bias_enable) {
313
dest->depth_bias_enable = src->depth_bias_enable;
314
dest_mask |= RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
315
}
316
}
317
318
if (copy_mask & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE) {
319
if (dest->primitive_restart_enable != src->primitive_restart_enable) {
320
dest->primitive_restart_enable = src->primitive_restart_enable;
321
dest_mask |= RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
322
}
323
}
324
325
if (copy_mask & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
326
if (dest->rasterizer_discard_enable != src->rasterizer_discard_enable) {
327
dest->rasterizer_discard_enable = src->rasterizer_discard_enable;
328
dest_mask |= RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
329
}
330
}
331
332
if (copy_mask & RADV_DYNAMIC_LOGIC_OP) {
333
if (dest->logic_op != src->logic_op) {
334
dest->logic_op = src->logic_op;
335
dest_mask |= RADV_DYNAMIC_LOGIC_OP;
336
}
337
}
338
339
if (copy_mask & RADV_DYNAMIC_COLOR_WRITE_ENABLE) {
340
if (dest->color_write_enable != src->color_write_enable) {
341
dest->color_write_enable = src->color_write_enable;
342
dest_mask |= RADV_DYNAMIC_COLOR_WRITE_ENABLE;
343
}
344
}
345
346
cmd_buffer->state.dirty |= dest_mask;
347
}
348
349
static void
350
radv_bind_streamout_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
351
{
352
struct radv_streamout_state *so = &cmd_buffer->state.streamout;
353
struct radv_shader_info *info;
354
355
if (!pipeline->streamout_shader || cmd_buffer->device->physical_device->use_ngg_streamout)
356
return;
357
358
info = &pipeline->streamout_shader->info;
359
for (int i = 0; i < MAX_SO_BUFFERS; i++)
360
so->stride_in_dw[i] = info->so.strides[i];
361
362
so->enabled_stream_buffers_mask = info->so.enabled_stream_buffers_mask;
363
}
364
365
bool
366
radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
367
{
368
return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
369
cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
370
}
371
372
enum ring_type
373
radv_queue_family_to_ring(int f)
374
{
375
switch (f) {
376
case RADV_QUEUE_GENERAL:
377
return RING_GFX;
378
case RADV_QUEUE_COMPUTE:
379
return RING_COMPUTE;
380
case RADV_QUEUE_TRANSFER:
381
return RING_DMA;
382
default:
383
unreachable("Unknown queue family");
384
}
385
}
386
387
static void
388
radv_destroy_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
389
{
390
list_del(&cmd_buffer->pool_link);
391
392
list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
393
{
394
cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
395
list_del(&up->list);
396
free(up);
397
}
398
399
if (cmd_buffer->upload.upload_bo)
400
cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, cmd_buffer->upload.upload_bo);
401
402
if (cmd_buffer->cs)
403
cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
404
405
for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
406
free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
407
408
vk_object_base_finish(&cmd_buffer->base);
409
vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
410
}
411
412
static VkResult
413
radv_create_cmd_buffer(struct radv_device *device, struct radv_cmd_pool *pool,
414
VkCommandBufferLevel level, VkCommandBuffer *pCommandBuffer)
415
{
416
struct radv_cmd_buffer *cmd_buffer;
417
unsigned ring;
418
cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
419
if (cmd_buffer == NULL)
420
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
421
422
vk_object_base_init(&device->vk, &cmd_buffer->base, VK_OBJECT_TYPE_COMMAND_BUFFER);
423
424
cmd_buffer->device = device;
425
cmd_buffer->pool = pool;
426
cmd_buffer->level = level;
427
428
list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
429
cmd_buffer->queue_family_index = pool->queue_family_index;
430
431
ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);
432
433
cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
434
if (!cmd_buffer->cs) {
435
radv_destroy_cmd_buffer(cmd_buffer);
436
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
437
}
438
439
*pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);
440
441
list_inithead(&cmd_buffer->upload.list);
442
443
return VK_SUCCESS;
444
}
445
446
static VkResult
447
radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
448
{
449
cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);
450
451
list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
452
{
453
cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
454
list_del(&up->list);
455
free(up);
456
}
457
458
cmd_buffer->push_constant_stages = 0;
459
cmd_buffer->scratch_size_per_wave_needed = 0;
460
cmd_buffer->scratch_waves_wanted = 0;
461
cmd_buffer->compute_scratch_size_per_wave_needed = 0;
462
cmd_buffer->compute_scratch_waves_wanted = 0;
463
cmd_buffer->esgs_ring_size_needed = 0;
464
cmd_buffer->gsvs_ring_size_needed = 0;
465
cmd_buffer->tess_rings_needed = false;
466
cmd_buffer->gds_needed = false;
467
cmd_buffer->gds_oa_needed = false;
468
cmd_buffer->sample_positions_needed = false;
469
470
if (cmd_buffer->upload.upload_bo)
471
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
472
cmd_buffer->upload.offset = 0;
473
474
cmd_buffer->record_result = VK_SUCCESS;
475
476
memset(cmd_buffer->vertex_bindings, 0, sizeof(cmd_buffer->vertex_bindings));
477
478
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
479
cmd_buffer->descriptors[i].dirty = 0;
480
cmd_buffer->descriptors[i].valid = 0;
481
cmd_buffer->descriptors[i].push_dirty = false;
482
}
483
484
if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
485
cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
486
unsigned num_db = cmd_buffer->device->physical_device->rad_info.max_render_backends;
487
unsigned fence_offset, eop_bug_offset;
488
void *fence_ptr;
489
490
radv_cmd_buffer_upload_alloc(cmd_buffer, 8, &fence_offset, &fence_ptr);
491
memset(fence_ptr, 0, 8);
492
493
cmd_buffer->gfx9_fence_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
494
cmd_buffer->gfx9_fence_va += fence_offset;
495
496
if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
497
/* Allocate a buffer for the EOP bug on GFX9. */
498
radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, &eop_bug_offset, &fence_ptr);
499
memset(fence_ptr, 0, 16 * num_db);
500
cmd_buffer->gfx9_eop_bug_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
501
cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;
502
}
503
}
504
505
cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;
506
507
return cmd_buffer->record_result;
508
}
509
510
static bool
511
radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, uint64_t min_needed)
512
{
513
uint64_t new_size;
514
struct radeon_winsys_bo *bo = NULL;
515
struct radv_cmd_buffer_upload *upload;
516
struct radv_device *device = cmd_buffer->device;
517
518
new_size = MAX2(min_needed, 16 * 1024);
519
new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);
520
521
VkResult result =
522
device->ws->buffer_create(device->ws, new_size, 4096, device->ws->cs_domain(device->ws),
523
RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
524
RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC,
525
RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, &bo);
526
527
if (result != VK_SUCCESS) {
528
cmd_buffer->record_result = result;
529
return false;
530
}
531
532
radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
533
if (cmd_buffer->upload.upload_bo) {
534
upload = malloc(sizeof(*upload));
535
536
if (!upload) {
537
cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
538
device->ws->buffer_destroy(device->ws, bo);
539
return false;
540
}
541
542
memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
543
list_add(&upload->list, &cmd_buffer->upload.list);
544
}
545
546
cmd_buffer->upload.upload_bo = bo;
547
cmd_buffer->upload.size = new_size;
548
cmd_buffer->upload.offset = 0;
549
cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);
550
551
if (!cmd_buffer->upload.map) {
552
cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
553
return false;
554
}
555
556
return true;
557
}
558
559
bool
560
radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size,
561
unsigned *out_offset, void **ptr)
562
{
563
assert(size % 4 == 0);
564
565
struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
566
567
/* Align to the scalar cache line size if it results in this allocation
568
* being placed in less of them.
569
*/
570
unsigned offset = cmd_buffer->upload.offset;
571
unsigned line_size = rad_info->chip_class >= GFX10 ? 64 : 32;
572
unsigned gap = align(offset, line_size) - offset;
573
if ((size & (line_size - 1)) > gap)
574
offset = align(offset, line_size);
575
576
if (offset + size > cmd_buffer->upload.size) {
577
if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
578
return false;
579
offset = 0;
580
}
581
582
*out_offset = offset;
583
*ptr = cmd_buffer->upload.map + offset;
584
585
cmd_buffer->upload.offset = offset + size;
586
return true;
587
}
588
589
bool
590
radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size, const void *data,
591
unsigned *out_offset)
592
{
593
uint8_t *ptr;
594
595
if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, out_offset, (void **)&ptr))
596
return false;
597
598
if (ptr)
599
memcpy(ptr, data, size);
600
601
return true;
602
}
603
604
static void
605
radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t va, unsigned count,
606
const uint32_t *data)
607
{
608
struct radeon_cmdbuf *cs = cmd_buffer->cs;
609
610
radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);
611
612
radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
613
radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
614
radeon_emit(cs, va);
615
radeon_emit(cs, va >> 32);
616
radeon_emit_array(cs, data, count);
617
}
618
619
void
620
radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
621
{
622
struct radv_device *device = cmd_buffer->device;
623
struct radeon_cmdbuf *cs = cmd_buffer->cs;
624
uint64_t va;
625
626
va = radv_buffer_get_va(device->trace_bo);
627
if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
628
va += 4;
629
630
++cmd_buffer->state.trace_id;
631
radv_emit_write_data_packet(cmd_buffer, va, 1, &cmd_buffer->state.trace_id);
632
633
radeon_check_space(cmd_buffer->device->ws, cs, 2);
634
635
radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
636
radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
637
}
638
639
static void
640
radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flush_bits flags)
641
{
642
if (unlikely(cmd_buffer->device->thread_trace.bo)) {
643
radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
644
radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
645
}
646
647
if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
648
enum rgp_flush_bits sqtt_flush_bits = 0;
649
assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH));
650
651
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);
652
653
/* Force wait for graphics or compute engines to be idle. */
654
si_cs_emit_cache_flush(cmd_buffer->cs,
655
cmd_buffer->device->physical_device->rad_info.chip_class,
656
&cmd_buffer->gfx9_fence_idx, cmd_buffer->gfx9_fence_va,
657
radv_cmd_buffer_uses_mec(cmd_buffer), flags, &sqtt_flush_bits,
658
cmd_buffer->gfx9_eop_bug_va);
659
}
660
661
if (unlikely(cmd_buffer->device->trace_bo))
662
radv_cmd_buffer_trace_emit(cmd_buffer);
663
}
664
665
static void
666
radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
667
{
668
struct radv_device *device = cmd_buffer->device;
669
enum ring_type ring;
670
uint32_t data[2];
671
uint64_t va;
672
673
va = radv_buffer_get_va(device->trace_bo);
674
675
ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);
676
677
switch (ring) {
678
case RING_GFX:
679
va += 8;
680
break;
681
case RING_COMPUTE:
682
va += 16;
683
break;
684
default:
685
assert(!"invalid ring type");
686
}
687
688
uint64_t pipeline_address = (uintptr_t)pipeline;
689
data[0] = pipeline_address;
690
data[1] = pipeline_address >> 32;
691
692
radv_emit_write_data_packet(cmd_buffer, va, 2, data);
693
}
694
695
static void
696
radv_save_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, uint64_t vb_ptr)
697
{
698
struct radv_device *device = cmd_buffer->device;
699
uint32_t data[2];
700
uint64_t va;
701
702
va = radv_buffer_get_va(device->trace_bo);
703
va += 24;
704
705
data[0] = vb_ptr;
706
data[1] = vb_ptr >> 32;
707
708
radv_emit_write_data_packet(cmd_buffer, va, 2, data);
709
}
710
711
void
712
radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
713
struct radv_descriptor_set *set, unsigned idx)
714
{
715
struct radv_descriptor_state *descriptors_state =
716
radv_get_descriptors_state(cmd_buffer, bind_point);
717
718
descriptors_state->sets[idx] = set;
719
720
descriptors_state->valid |= (1u << idx); /* active descriptors */
721
descriptors_state->dirty |= (1u << idx);
722
}
723
724
static void
725
radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
726
{
727
struct radv_descriptor_state *descriptors_state =
728
radv_get_descriptors_state(cmd_buffer, bind_point);
729
struct radv_device *device = cmd_buffer->device;
730
uint32_t data[MAX_SETS * 2] = {0};
731
uint64_t va;
732
va = radv_buffer_get_va(device->trace_bo) + 32;
733
734
u_foreach_bit(i, descriptors_state->valid)
735
{
736
struct radv_descriptor_set *set = descriptors_state->sets[i];
737
data[i * 2] = (uint64_t)(uintptr_t)set;
738
data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
739
}
740
741
radv_emit_write_data_packet(cmd_buffer, va, MAX_SETS * 2, data);
742
}
743
744
struct radv_userdata_info *
745
radv_lookup_user_sgpr(struct radv_pipeline *pipeline, gl_shader_stage stage, int idx)
746
{
747
struct radv_shader_variant *shader = radv_get_shader(pipeline, stage);
748
return &shader->info.user_sgprs_locs.shader_data[idx];
749
}
750
751
static void
752
radv_emit_userdata_address(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
753
gl_shader_stage stage, int idx, uint64_t va)
754
{
755
struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
756
uint32_t base_reg = pipeline->user_data_0[stage];
757
if (loc->sgpr_idx == -1)
758
return;
759
760
assert(loc->num_sgprs == 1);
761
762
radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
763
false);
764
}
765
766
static void
767
radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
768
struct radv_descriptor_state *descriptors_state,
769
gl_shader_stage stage)
770
{
771
struct radv_device *device = cmd_buffer->device;
772
struct radeon_cmdbuf *cs = cmd_buffer->cs;
773
uint32_t sh_base = pipeline->user_data_0[stage];
774
struct radv_userdata_locations *locs = &pipeline->shaders[stage]->info.user_sgprs_locs;
775
unsigned mask = locs->descriptor_sets_enabled;
776
777
mask &= descriptors_state->dirty & descriptors_state->valid;
778
779
while (mask) {
780
int start, count;
781
782
u_bit_scan_consecutive_range(&mask, &start, &count);
783
784
struct radv_userdata_info *loc = &locs->descriptor_sets[start];
785
unsigned sh_offset = sh_base + loc->sgpr_idx * 4;
786
787
radv_emit_shader_pointer_head(cs, sh_offset, count, true);
788
for (int i = 0; i < count; i++) {
789
struct radv_descriptor_set *set = descriptors_state->sets[start + i];
790
791
radv_emit_shader_pointer_body(device, cs, set->header.va, true);
792
}
793
}
794
}
795
796
/**
797
* Convert the user sample locations to hardware sample locations (the values
798
* that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
799
*/
800
static void
801
radv_convert_user_sample_locs(struct radv_sample_locations_state *state, uint32_t x, uint32_t y,
802
VkOffset2D *sample_locs)
803
{
804
uint32_t x_offset = x % state->grid_size.width;
805
uint32_t y_offset = y % state->grid_size.height;
806
uint32_t num_samples = (uint32_t)state->per_pixel;
807
VkSampleLocationEXT *user_locs;
808
uint32_t pixel_offset;
809
810
pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;
811
812
assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
813
user_locs = &state->locations[pixel_offset];
814
815
for (uint32_t i = 0; i < num_samples; i++) {
816
float shifted_pos_x = user_locs[i].x - 0.5;
817
float shifted_pos_y = user_locs[i].y - 0.5;
818
819
int32_t scaled_pos_x = floorf(shifted_pos_x * 16);
820
int32_t scaled_pos_y = floorf(shifted_pos_y * 16);
821
822
sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
823
sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
824
}
825
}
826
827
/**
828
* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
829
* locations.
830
*/
831
static void
832
radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs,
833
uint32_t *sample_locs_pixel)
834
{
835
for (uint32_t i = 0; i < num_samples; i++) {
836
uint32_t sample_reg_idx = i / 4;
837
uint32_t sample_loc_idx = i % 4;
838
int32_t pos_x = sample_locs[i].x;
839
int32_t pos_y = sample_locs[i].y;
840
841
uint32_t shift_x = 8 * sample_loc_idx;
842
uint32_t shift_y = shift_x + 4;
843
844
sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
845
sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
846
}
847
}
848
849
/**
850
* Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
851
* sample locations.
852
*/
853
static uint64_t
854
radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer, VkOffset2D *sample_locs,
855
uint32_t num_samples)
856
{
857
uint32_t *centroid_priorities = alloca(num_samples * sizeof(*centroid_priorities));
858
uint32_t sample_mask = num_samples - 1;
859
uint32_t *distances = alloca(num_samples * sizeof(*distances));
860
uint64_t centroid_priority = 0;
861
862
/* Compute the distances from center for each sample. */
863
for (int i = 0; i < num_samples; i++) {
864
distances[i] = (sample_locs[i].x * sample_locs[i].x) + (sample_locs[i].y * sample_locs[i].y);
865
}
866
867
/* Compute the centroid priorities by looking at the distances array. */
868
for (int i = 0; i < num_samples; i++) {
869
uint32_t min_idx = 0;
870
871
for (int j = 1; j < num_samples; j++) {
872
if (distances[j] < distances[min_idx])
873
min_idx = j;
874
}
875
876
centroid_priorities[i] = min_idx;
877
distances[min_idx] = 0xffffffff;
878
}
879
880
/* Compute the final centroid priority. */
881
for (int i = 0; i < 8; i++) {
882
centroid_priority |= centroid_priorities[i & sample_mask] << (i * 4);
883
}
884
885
return centroid_priority << 32 | centroid_priority;
886
}
887
888
/**
889
* Emit the sample locations that are specified with VK_EXT_sample_locations.
890
*/
891
static void
892
radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
893
{
894
struct radv_sample_locations_state *sample_location = &cmd_buffer->state.dynamic.sample_location;
895
uint32_t num_samples = (uint32_t)sample_location->per_pixel;
896
struct radeon_cmdbuf *cs = cmd_buffer->cs;
897
uint32_t sample_locs_pixel[4][2] = {0};
898
VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
899
uint32_t max_sample_dist = 0;
900
uint64_t centroid_priority;
901
902
if (!cmd_buffer->state.dynamic.sample_location.count)
903
return;
904
905
/* Convert the user sample locations to hardware sample locations. */
906
radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]);
907
radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]);
908
radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]);
909
radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]);
910
911
/* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
912
for (uint32_t i = 0; i < 4; i++) {
913
radv_compute_sample_locs_pixel(num_samples, sample_locs[i], sample_locs_pixel[i]);
914
}
915
916
/* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
917
centroid_priority = radv_compute_centroid_priority(cmd_buffer, sample_locs[0], num_samples);
918
919
/* Compute the maximum sample distance from the specified locations. */
920
for (unsigned i = 0; i < 4; ++i) {
921
for (uint32_t j = 0; j < num_samples; j++) {
922
VkOffset2D offset = sample_locs[i][j];
923
max_sample_dist = MAX2(max_sample_dist, MAX2(abs(offset.x), abs(offset.y)));
924
}
925
}
926
927
/* Emit the specified user sample locations. */
928
switch (num_samples) {
929
case 2:
930
case 4:
931
radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
932
sample_locs_pixel[0][0]);
933
radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
934
sample_locs_pixel[1][0]);
935
radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
936
sample_locs_pixel[2][0]);
937
radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
938
sample_locs_pixel[3][0]);
939
break;
940
case 8:
941
radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
942
sample_locs_pixel[0][0]);
943
radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
944
sample_locs_pixel[1][0]);
945
radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
946
sample_locs_pixel[2][0]);
947
radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
948
sample_locs_pixel[3][0]);
949
radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1,
950
sample_locs_pixel[0][1]);
951
radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1,
952
sample_locs_pixel[1][1]);
953
radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1,
954
sample_locs_pixel[2][1]);
955
radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1,
956
sample_locs_pixel[3][1]);
957
break;
958
default:
959
unreachable("invalid number of samples");
960
}
961
962
/* Emit the maximum sample distance and the centroid priority. */
963
radeon_set_context_reg_rmw(cs, R_028BE0_PA_SC_AA_CONFIG,
964
S_028BE0_MAX_SAMPLE_DIST(max_sample_dist), ~C_028BE0_MAX_SAMPLE_DIST);
965
966
radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
967
radeon_emit(cs, centroid_priority);
968
radeon_emit(cs, centroid_priority >> 32);
969
970
cmd_buffer->state.context_roll_without_scissor_emitted = true;
971
}
972
973
static void
974
radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
975
gl_shader_stage stage, int idx, int count, uint32_t *values)
976
{
977
struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
978
uint32_t base_reg = pipeline->user_data_0[stage];
979
if (loc->sgpr_idx == -1)
980
return;
981
982
assert(loc->num_sgprs == count);
983
984
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 2 + count);
985
986
radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count);
987
radeon_emit_array(cmd_buffer->cs, values, count);
988
}
989
990
static void
991
radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
992
{
993
int num_samples = pipeline->graphics.ms.num_samples;
994
struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;
995
996
if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.needs_sample_positions)
997
cmd_buffer->sample_positions_needed = true;
998
999
if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples)
1000
return;
1001
1002
radv_emit_default_sample_locations(cmd_buffer->cs, num_samples);
1003
1004
cmd_buffer->state.context_roll_without_scissor_emitted = true;
1005
}
1006
1007
static void
1008
radv_update_binning_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
1009
{
1010
const struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;
1011
1012
if (pipeline->device->physical_device->rad_info.chip_class < GFX9)
1013
return;
1014
1015
if (old_pipeline &&
1016
old_pipeline->graphics.binning.pa_sc_binner_cntl_0 ==
1017
pipeline->graphics.binning.pa_sc_binner_cntl_0)
1018
return;
1019
1020
bool binning_flush = false;
1021
if (cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA12 ||
1022
cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA20 ||
1023
cmd_buffer->device->physical_device->rad_info.family == CHIP_RAVEN2 ||
1024
cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
1025
binning_flush = !old_pipeline ||
1026
G_028C44_BINNING_MODE(old_pipeline->graphics.binning.pa_sc_binner_cntl_0) !=
1027
G_028C44_BINNING_MODE(pipeline->graphics.binning.pa_sc_binner_cntl_0);
1028
}
1029
1030
radeon_set_context_reg(cmd_buffer->cs, R_028C44_PA_SC_BINNER_CNTL_0,
1031
pipeline->graphics.binning.pa_sc_binner_cntl_0 |
1032
S_028C44_FLUSH_ON_BINNING_TRANSITION(!!binning_flush));
1033
1034
cmd_buffer->state.context_roll_without_scissor_emitted = true;
1035
}
1036
1037
static void
1038
radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *shader)
1039
{
1040
uint64_t va;
1041
1042
if (!shader)
1043
return;
1044
1045
va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
1046
1047
si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
1048
}
1049
1050
static void
1051
radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
1052
bool vertex_stage_only)
1053
{
1054
struct radv_cmd_state *state = &cmd_buffer->state;
1055
uint32_t mask = state->prefetch_L2_mask;
1056
1057
if (vertex_stage_only) {
1058
/* Fast prefetch path for starting draws as soon as possible.
1059
*/
1060
mask = state->prefetch_L2_mask & (RADV_PREFETCH_VS | RADV_PREFETCH_VBO_DESCRIPTORS);
1061
}
1062
1063
if (mask & RADV_PREFETCH_VS)
1064
radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_VERTEX]);
1065
1066
if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
1067
si_cp_dma_prefetch(cmd_buffer, state->vb_va, pipeline->vb_desc_alloc_size);
1068
1069
if (mask & RADV_PREFETCH_TCS)
1070
radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_TESS_CTRL]);
1071
1072
if (mask & RADV_PREFETCH_TES)
1073
radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_TESS_EVAL]);
1074
1075
if (mask & RADV_PREFETCH_GS) {
1076
radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_GEOMETRY]);
1077
if (radv_pipeline_has_gs_copy_shader(pipeline))
1078
radv_emit_shader_prefetch(cmd_buffer, pipeline->gs_copy_shader);
1079
}
1080
1081
if (mask & RADV_PREFETCH_PS)
1082
radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_FRAGMENT]);
1083
1084
state->prefetch_L2_mask &= ~mask;
1085
}
1086
1087
static void
1088
radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
1089
{
1090
if (!cmd_buffer->device->physical_device->rad_info.rbplus_allowed)
1091
return;
1092
1093
struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
1094
const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1095
1096
unsigned sx_ps_downconvert = 0;
1097
unsigned sx_blend_opt_epsilon = 0;
1098
unsigned sx_blend_opt_control = 0;
1099
1100
if (!cmd_buffer->state.attachments || !subpass)
1101
return;
1102
1103
for (unsigned i = 0; i < subpass->color_count; ++i) {
1104
if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
1105
/* We don't set the DISABLE bits, because the HW can't have holes,
1106
* so the SPI color format is set to 32-bit 1-component. */
1107
sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1108
continue;
1109
}
1110
1111
int idx = subpass->color_attachments[i].attachment;
1112
struct radv_color_buffer_info *cb = &cmd_buffer->state.attachments[idx].cb;
1113
1114
unsigned format = G_028C70_FORMAT(cb->cb_color_info);
1115
unsigned swap = G_028C70_COMP_SWAP(cb->cb_color_info);
1116
uint32_t spi_format = (pipeline->graphics.col_format >> (i * 4)) & 0xf;
1117
uint32_t colormask = (pipeline->graphics.cb_target_mask >> (i * 4)) & 0xf;
1118
1119
bool has_alpha, has_rgb;
1120
1121
/* Set if RGB and A are present. */
1122
has_alpha = !G_028C74_FORCE_DST_ALPHA_1(cb->cb_color_attrib);
1123
1124
if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || format == V_028C70_COLOR_32)
1125
has_rgb = !has_alpha;
1126
else
1127
has_rgb = true;
1128
1129
/* Check the colormask and export format. */
1130
if (!(colormask & 0x7))
1131
has_rgb = false;
1132
if (!(colormask & 0x8))
1133
has_alpha = false;
1134
1135
if (spi_format == V_028714_SPI_SHADER_ZERO) {
1136
has_rgb = false;
1137
has_alpha = false;
1138
}
1139
1140
/* The HW doesn't quite blend correctly with rgb9e5 if we disable the alpha
1141
* optimization, even though it has no alpha. */
1142
if (has_rgb && format == V_028C70_COLOR_5_9_9_9)
1143
has_alpha = true;
1144
1145
/* Disable value checking for disabled channels. */
1146
if (!has_rgb)
1147
sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
1148
if (!has_alpha)
1149
sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
1150
1151
/* Enable down-conversion for 32bpp and smaller formats. */
1152
switch (format) {
1153
case V_028C70_COLOR_8:
1154
case V_028C70_COLOR_8_8:
1155
case V_028C70_COLOR_8_8_8_8:
1156
/* For 1 and 2-channel formats, use the superset thereof. */
1157
if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
1158
spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1159
spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1160
sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
1161
sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
1162
}
1163
break;
1164
1165
case V_028C70_COLOR_5_6_5:
1166
if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1167
sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
1168
sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
1169
}
1170
break;
1171
1172
case V_028C70_COLOR_1_5_5_5:
1173
if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1174
sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
1175
sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
1176
}
1177
break;
1178
1179
case V_028C70_COLOR_4_4_4_4:
1180
if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1181
sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
1182
sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
1183
}
1184
break;
1185
1186
case V_028C70_COLOR_32:
1187
if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
1188
sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1189
else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
1190
sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
1191
break;
1192
1193
case V_028C70_COLOR_16:
1194
case V_028C70_COLOR_16_16:
1195
/* For 1-channel formats, use the superset thereof. */
1196
if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
1197
spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
1198
spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1199
spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1200
if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
1201
sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
1202
else
1203
sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
1204
}
1205
break;
1206
1207
case V_028C70_COLOR_10_11_11:
1208
if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1209
sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
1210
break;
1211
1212
case V_028C70_COLOR_2_10_10_10:
1213
if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1214
sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
1215
sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
1216
}
1217
break;
1218
case V_028C70_COLOR_5_9_9_9:
1219
if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1220
sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
1221
break;
1222
}
1223
}
1224
1225
/* Do not set the DISABLE bits for the unused attachments, as that
1226
* breaks dual source blending in SkQP and does not seem to improve
1227
* performance. */
1228
1229
if (sx_ps_downconvert == cmd_buffer->state.last_sx_ps_downconvert &&
1230
sx_blend_opt_epsilon == cmd_buffer->state.last_sx_blend_opt_epsilon &&
1231
sx_blend_opt_control == cmd_buffer->state.last_sx_blend_opt_control)
1232
return;
1233
1234
radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
1235
radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
1236
radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
1237
radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
1238
1239
cmd_buffer->state.context_roll_without_scissor_emitted = true;
1240
1241
cmd_buffer->state.last_sx_ps_downconvert = sx_ps_downconvert;
1242
cmd_buffer->state.last_sx_blend_opt_epsilon = sx_blend_opt_epsilon;
1243
cmd_buffer->state.last_sx_blend_opt_control = sx_blend_opt_control;
1244
}
1245
1246
static void
1247
radv_emit_batch_break_on_new_ps(struct radv_cmd_buffer *cmd_buffer)
1248
{
1249
if (!cmd_buffer->device->pbb_allowed)
1250
return;
1251
1252
struct radv_binning_settings settings =
1253
radv_get_binning_settings(cmd_buffer->device->physical_device);
1254
bool break_for_new_ps =
1255
(!cmd_buffer->state.emitted_pipeline ||
1256
cmd_buffer->state.emitted_pipeline->shaders[MESA_SHADER_FRAGMENT] !=
1257
cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT]) &&
1258
(settings.context_states_per_bin > 1 || settings.persistent_states_per_bin > 1);
1259
bool break_for_new_cb_target_mask =
1260
(cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE) &&
1261
settings.context_states_per_bin > 1;
1262
1263
if (!break_for_new_ps && !break_for_new_cb_target_mask)
1264
return;
1265
1266
radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
1267
radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
1268
}
1269
1270
static void
1271
radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
1272
{
1273
struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
1274
1275
if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
1276
return;
1277
1278
radv_update_multisample_state(cmd_buffer, pipeline);
1279
radv_update_binning_state(cmd_buffer, pipeline);
1280
1281
cmd_buffer->scratch_size_per_wave_needed =
1282
MAX2(cmd_buffer->scratch_size_per_wave_needed, pipeline->scratch_bytes_per_wave);
1283
cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted, pipeline->max_waves);
1284
1285
if (!cmd_buffer->state.emitted_pipeline ||
1286
cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband !=
1287
pipeline->graphics.can_use_guardband)
1288
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
1289
1290
if (!cmd_buffer->state.emitted_pipeline ||
1291
cmd_buffer->state.emitted_pipeline->graphics.pa_su_sc_mode_cntl !=
1292
pipeline->graphics.pa_su_sc_mode_cntl)
1293
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE |
1294
RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
1295
RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
1296
1297
if (!cmd_buffer->state.emitted_pipeline ||
1298
cmd_buffer->state.emitted_pipeline->graphics.pa_cl_clip_cntl !=
1299
pipeline->graphics.pa_cl_clip_cntl)
1300
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
1301
1302
if (!cmd_buffer->state.emitted_pipeline ||
1303
cmd_buffer->state.emitted_pipeline->graphics.cb_color_control !=
1304
pipeline->graphics.cb_color_control)
1305
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
1306
1307
if (!cmd_buffer->state.emitted_pipeline)
1308
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY |
1309
RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS |
1310
RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS |
1311
RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
1312
1313
if (!cmd_buffer->state.emitted_pipeline ||
1314
cmd_buffer->state.emitted_pipeline->graphics.db_depth_control !=
1315
pipeline->graphics.db_depth_control)
1316
cmd_buffer->state.dirty |=
1317
RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
1318
RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
1319
RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
1320
1321
if (!cmd_buffer->state.emitted_pipeline)
1322
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
1323
1324
if (!cmd_buffer->state.emitted_pipeline ||
1325
cmd_buffer->state.emitted_pipeline->graphics.cb_target_mask !=
1326
pipeline->graphics.cb_target_mask) {
1327
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
1328
}
1329
1330
radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
1331
1332
if (pipeline->graphics.has_ngg_culling &&
1333
pipeline->graphics.last_vgt_api_stage != MESA_SHADER_GEOMETRY &&
1334
!cmd_buffer->state.last_nggc_settings) {
1335
/* The already emitted RSRC2 contains the LDS required for NGG culling.
1336
* Culling is currently disabled, so re-emit RSRC2 to reduce LDS usage.
1337
* API GS always needs LDS, so this isn't useful there.
1338
*/
1339
struct radv_shader_variant *v = pipeline->shaders[pipeline->graphics.last_vgt_api_stage];
1340
radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
1341
(v->config.rsrc2 & C_00B22C_LDS_SIZE) |
1342
S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling));
1343
}
1344
1345
if (!cmd_buffer->state.emitted_pipeline ||
1346
cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw ||
1347
cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash ||
1348
memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf, pipeline->ctx_cs.buf,
1349
pipeline->ctx_cs.cdw * 4)) {
1350
radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw);
1351
cmd_buffer->state.context_roll_without_scissor_emitted = true;
1352
}
1353
1354
radv_emit_batch_break_on_new_ps(cmd_buffer);
1355
1356
for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
1357
if (!pipeline->shaders[i])
1358
continue;
1359
1360
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->shaders[i]->bo);
1361
}
1362
1363
if (radv_pipeline_has_gs_copy_shader(pipeline))
1364
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->gs_copy_shader->bo);
1365
1366
if (unlikely(cmd_buffer->device->trace_bo))
1367
radv_save_pipeline(cmd_buffer, pipeline);
1368
1369
cmd_buffer->state.emitted_pipeline = pipeline;
1370
1371
cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
1372
}
1373
1374
static void
1375
radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
1376
{
1377
si_write_viewport(cmd_buffer->cs, 0, cmd_buffer->state.dynamic.viewport.count,
1378
cmd_buffer->state.dynamic.viewport.viewports);
1379
}
1380
1381
static void
1382
radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
1383
{
1384
uint32_t count = cmd_buffer->state.dynamic.scissor.count;
1385
1386
si_write_scissors(cmd_buffer->cs, 0, count, cmd_buffer->state.dynamic.scissor.scissors,
1387
cmd_buffer->state.dynamic.viewport.viewports,
1388
cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband);
1389
1390
cmd_buffer->state.context_roll_without_scissor_emitted = false;
1391
}
1392
1393
static void
1394
radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
1395
{
1396
if (!cmd_buffer->state.dynamic.discard_rectangle.count)
1397
return;
1398
1399
radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL,
1400
cmd_buffer->state.dynamic.discard_rectangle.count * 2);
1401
for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) {
1402
VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i];
1403
radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
1404
radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
1405
S_028214_BR_Y(rect.offset.y + rect.extent.height));
1406
}
1407
}
1408
1409
static void
1410
radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
1411
{
1412
unsigned width = cmd_buffer->state.dynamic.line_width * 8;
1413
1414
radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
1415
S_028A08_WIDTH(CLAMP(width, 0, 0xFFFF)));
1416
}
1417
1418
static void
1419
radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
1420
{
1421
struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1422
1423
radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
1424
radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4);
1425
}
1426
1427
static void
1428
radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
1429
{
1430
struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1431
1432
radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2);
1433
radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->stencil_reference.front) |
1434
S_028430_STENCILMASK(d->stencil_compare_mask.front) |
1435
S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) |
1436
S_028430_STENCILOPVAL(1));
1437
radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) |
1438
S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) |
1439
S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) |
1440
S_028434_STENCILOPVAL_BF(1));
1441
}
1442
1443
static void
1444
radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
1445
{
1446
struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1447
1448
radeon_set_context_reg(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, fui(d->depth_bounds.min));
1449
radeon_set_context_reg(cmd_buffer->cs, R_028024_DB_DEPTH_BOUNDS_MAX, fui(d->depth_bounds.max));
1450
}
1451
1452
static void
1453
radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
1454
{
1455
struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1456
unsigned slope = fui(d->depth_bias.slope * 16.0f);
1457
1458
radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
1459
radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */
1460
radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */
1461
radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias)); /* FRONT OFFSET */
1462
radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */
1463
radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias)); /* BACK OFFSET */
1464
}
1465
1466
static void
1467
radv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer)
1468
{
1469
struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1470
uint32_t auto_reset_cntl = 1;
1471
1472
if (d->primitive_topology == V_008958_DI_PT_LINESTRIP)
1473
auto_reset_cntl = 2;
1474
1475
radeon_set_context_reg(cmd_buffer->cs, R_028A0C_PA_SC_LINE_STIPPLE,
1476
S_028A0C_LINE_PATTERN(d->line_stipple.pattern) |
1477
S_028A0C_REPEAT_COUNT(d->line_stipple.factor - 1) |
1478
S_028A0C_AUTO_RESET_CNTL(auto_reset_cntl));
1479
}
1480
1481
static void
1482
radv_emit_culling(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
1483
{
1484
unsigned pa_su_sc_mode_cntl = cmd_buffer->state.pipeline->graphics.pa_su_sc_mode_cntl;
1485
struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1486
1487
pa_su_sc_mode_cntl &= C_028814_CULL_FRONT &
1488
C_028814_CULL_BACK &
1489
C_028814_FACE &
1490
C_028814_POLY_OFFSET_FRONT_ENABLE &
1491
C_028814_POLY_OFFSET_BACK_ENABLE &
1492
C_028814_POLY_OFFSET_PARA_ENABLE;
1493
1494
pa_su_sc_mode_cntl |= S_028814_CULL_FRONT(!!(d->cull_mode & VK_CULL_MODE_FRONT_BIT)) |
1495
S_028814_CULL_BACK(!!(d->cull_mode & VK_CULL_MODE_BACK_BIT)) |
1496
S_028814_FACE(d->front_face) |
1497
S_028814_POLY_OFFSET_FRONT_ENABLE(d->depth_bias_enable) |
1498
S_028814_POLY_OFFSET_BACK_ENABLE(d->depth_bias_enable) |
1499
S_028814_POLY_OFFSET_PARA_ENABLE(d->depth_bias_enable);
1500
1501
radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl);
1502
}
1503
1504
static void
1505
radv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer)
1506
{
1507
struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1508
1509
if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
1510
radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cmd_buffer->cs,
1511
R_030908_VGT_PRIMITIVE_TYPE, 1, d->primitive_topology);
1512
} else {
1513
radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, d->primitive_topology);
1514
}
1515
}
1516
1517
static void
1518
radv_emit_depth_control(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
1519
{
1520
unsigned db_depth_control = cmd_buffer->state.pipeline->graphics.db_depth_control;
1521
struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1522
1523
db_depth_control &= C_028800_Z_ENABLE &
1524
C_028800_Z_WRITE_ENABLE &
1525
C_028800_ZFUNC &
1526
C_028800_DEPTH_BOUNDS_ENABLE &
1527
C_028800_STENCIL_ENABLE &
1528
C_028800_BACKFACE_ENABLE &
1529
C_028800_STENCILFUNC &
1530
C_028800_STENCILFUNC_BF;
1531
1532
db_depth_control |= S_028800_Z_ENABLE(d->depth_test_enable ? 1 : 0) |
1533
S_028800_Z_WRITE_ENABLE(d->depth_write_enable ? 1 : 0) |
1534
S_028800_ZFUNC(d->depth_compare_op) |
1535
S_028800_DEPTH_BOUNDS_ENABLE(d->depth_bounds_test_enable ? 1 : 0) |
1536
S_028800_STENCIL_ENABLE(d->stencil_test_enable ? 1 : 0) |
1537
S_028800_BACKFACE_ENABLE(d->stencil_test_enable ? 1 : 0) |
1538
S_028800_STENCILFUNC(d->stencil_op.front.compare_op) |
1539
S_028800_STENCILFUNC_BF(d->stencil_op.back.compare_op);
1540
1541
radeon_set_context_reg(cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL, db_depth_control);
1542
}
1543
1544
static void
1545
radv_emit_stencil_control(struct radv_cmd_buffer *cmd_buffer)
1546
{
1547
struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1548
1549
radeon_set_context_reg(
1550
cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL,
1551
S_02842C_STENCILFAIL(si_translate_stencil_op(d->stencil_op.front.fail_op)) |
1552
S_02842C_STENCILZPASS(si_translate_stencil_op(d->stencil_op.front.pass_op)) |
1553
S_02842C_STENCILZFAIL(si_translate_stencil_op(d->stencil_op.front.depth_fail_op)) |
1554
S_02842C_STENCILFAIL_BF(si_translate_stencil_op(d->stencil_op.back.fail_op)) |
1555
S_02842C_STENCILZPASS_BF(si_translate_stencil_op(d->stencil_op.back.pass_op)) |
1556
S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(d->stencil_op.back.depth_fail_op)));
1557
}
1558
1559
static void
1560
radv_emit_fragment_shading_rate(struct radv_cmd_buffer *cmd_buffer)
1561
{
1562
struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
1563
const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1564
struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1565
uint32_t rate_x = MIN2(2, d->fragment_shading_rate.size.width) - 1;
1566
uint32_t rate_y = MIN2(2, d->fragment_shading_rate.size.height) - 1;
1567
uint32_t pa_cl_vrs_cntl = pipeline->graphics.vrs.pa_cl_vrs_cntl;
1568
uint32_t vertex_comb_mode = d->fragment_shading_rate.combiner_ops[0];
1569
uint32_t htile_comb_mode = d->fragment_shading_rate.combiner_ops[1];
1570
1571
if (subpass && !subpass->vrs_attachment) {
1572
/* When the current subpass has no VRS attachment, the VRS rates are expected to be 1x1, so we
1573
* can cheat by tweaking the different combiner modes.
1574
*/
1575
switch (htile_comb_mode) {
1576
case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR:
1577
/* The result of min(A, 1x1) is always 1x1. */
1578
FALLTHROUGH;
1579
case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR:
1580
/* Force the per-draw VRS rate to 1x1. */
1581
rate_x = rate_y = 0;
1582
1583
/* As the result of min(A, 1x1) or replace(A, 1x1) is always 1x1, set the vertex rate
1584
* combiner mode to passthrough.
1585
*/
1586
vertex_comb_mode = V_028848_VRS_COMB_MODE_PASSTHRU;
1587
break;
1588
case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR:
1589
/* The result of max(A, 1x1) is always A. */
1590
FALLTHROUGH;
1591
case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR:
1592
/* Nothing to do here because the SAMPLE_ITER combiner mode should already be passthrough. */
1593
break;
1594
default:
1595
break;
1596
}
1597
}
1598
1599
/* Emit per-draw VRS rate which is the first combiner. */
1600
radeon_set_uconfig_reg(cmd_buffer->cs, R_03098C_GE_VRS_RATE,
1601
S_03098C_RATE_X(rate_x) | S_03098C_RATE_Y(rate_y));
1602
1603
/* VERTEX_RATE_COMBINER_MODE controls the combiner mode between the
1604
* draw rate and the vertex rate.
1605
*/
1606
pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(vertex_comb_mode);
1607
1608
/* HTILE_RATE_COMBINER_MODE controls the combiner mode between the primitive rate and the HTILE
1609
* rate.
1610
*/
1611
pa_cl_vrs_cntl |= S_028848_HTILE_RATE_COMBINER_MODE(htile_comb_mode);
1612
1613
radeon_set_context_reg(cmd_buffer->cs, R_028848_PA_CL_VRS_CNTL, pa_cl_vrs_cntl);
1614
}
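
/* Illustrative sketch (not part of the driver): how the per-draw rate_x/rate_y values
 * programmed above are derived from the dynamic fragment size, mirroring the
 * MIN2(2, size) - 1 expression. The helper name is an assumption made for this
 * illustration; assumes <stdint.h>.
 */
static inline uint32_t
example_vrs_rate_field(uint32_t fragment_size)
{
   /* 1 -> 0, 2 (or larger) -> 1; e.g. a 2x1 fragment size gives rate_x = 1, rate_y = 0. */
   uint32_t clamped = fragment_size < 2 ? fragment_size : 2;
   return clamped - 1;
}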
1615
1616
static void
1617
radv_emit_primitive_restart_enable(struct radv_cmd_buffer *cmd_buffer)
1618
{
1619
struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1620
1621
if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
1622
radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
1623
d->primitive_restart_enable);
1624
} else {
1625
radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
1626
d->primitive_restart_enable);
1627
}
1628
}
1629
1630
static void
1631
radv_emit_rasterizer_discard_enable(struct radv_cmd_buffer *cmd_buffer)
1632
{
1633
unsigned pa_cl_clip_cntl = cmd_buffer->state.pipeline->graphics.pa_cl_clip_cntl;
1634
struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1635
1636
pa_cl_clip_cntl &= C_028810_DX_RASTERIZATION_KILL;
1637
pa_cl_clip_cntl |= S_028810_DX_RASTERIZATION_KILL(d->rasterizer_discard_enable);
1638
1639
radeon_set_context_reg(cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL, pa_cl_clip_cntl);
1640
}
1641
1642
static void
1643
radv_emit_logic_op(struct radv_cmd_buffer *cmd_buffer)
1644
{
1645
unsigned cb_color_control = cmd_buffer->state.pipeline->graphics.cb_color_control;
1646
struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1647
1648
cb_color_control &= C_028808_ROP3;
1649
cb_color_control |= S_028808_ROP3(d->logic_op);
1650
1651
radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, cb_color_control);
1652
}
1653
1654
static void
1655
radv_emit_color_write_enable(struct radv_cmd_buffer *cmd_buffer)
1656
{
1657
struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
1658
struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1659
1660
radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK,
1661
pipeline->graphics.cb_target_mask & d->color_write_enable);
1662
}
1663
1664
static void
1665
radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, int index,
1666
struct radv_color_buffer_info *cb, struct radv_image_view *iview,
1667
VkImageLayout layout, bool in_render_loop, bool disable_dcc)
1668
{
1669
bool is_vi = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8;
1670
uint32_t cb_color_info = cb->cb_color_info;
1671
struct radv_image *image = iview->image;
1672
1673
if (!radv_layout_dcc_compressed(
1674
cmd_buffer->device, image, iview->base_mip, layout, in_render_loop,
1675
radv_image_queue_family_mask(image, cmd_buffer->queue_family_index,
1676
cmd_buffer->queue_family_index)) ||
1677
disable_dcc) {
1678
cb_color_info &= C_028C70_DCC_ENABLE;
1679
}
1680
1681
if (!radv_layout_fmask_compressed(
1682
cmd_buffer->device, image, layout,
1683
radv_image_queue_family_mask(image, cmd_buffer->queue_family_index,
1684
cmd_buffer->queue_family_index))) {
1685
cb_color_info &= C_028C70_COMPRESSION;
1686
}
1687
1688
if (radv_image_is_tc_compat_cmask(image) && (radv_is_fmask_decompress_pipeline(cmd_buffer) ||
1689
radv_is_dcc_decompress_pipeline(cmd_buffer))) {
1690
/* If this bit is set, the FMASK decompression operation
1691
* doesn't occur (DCC_COMPRESS also implies FMASK_DECOMPRESS).
1692
*/
1693
cb_color_info &= C_028C70_FMASK_COMPRESS_1FRAG_ONLY;
1694
}
1695
1696
if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
1697
radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1698
radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1699
radeon_emit(cmd_buffer->cs, 0);
1700
radeon_emit(cmd_buffer->cs, 0);
1701
radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1702
radeon_emit(cmd_buffer->cs, cb_color_info);
1703
radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1704
radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1705
radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1706
radeon_emit(cmd_buffer->cs, 0);
1707
radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1708
radeon_emit(cmd_buffer->cs, 0);
1709
1710
radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 1);
1711
radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
1712
1713
radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4,
1714
cb->cb_color_base >> 32);
1715
radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4,
1716
cb->cb_color_cmask >> 32);
1717
radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4,
1718
cb->cb_color_fmask >> 32);
1719
radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4,
1720
cb->cb_dcc_base >> 32);
1721
radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4,
1722
cb->cb_color_attrib2);
1723
radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4,
1724
cb->cb_color_attrib3);
1725
} else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1726
radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1727
radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1728
radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32));
1729
radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
1730
radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1731
radeon_emit(cmd_buffer->cs, cb_color_info);
1732
radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1733
radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1734
radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1735
radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32));
1736
radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1737
radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32));
1738
1739
radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
1740
radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
1741
radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32));
1742
1743
radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4,
1744
cb->cb_mrt_epitch);
1745
} else {
1746
radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
1747
radeon_emit(cmd_buffer->cs, cb->cb_color_base);
1748
radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
1749
radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
1750
radeon_emit(cmd_buffer->cs, cb->cb_color_view);
1751
radeon_emit(cmd_buffer->cs, cb_color_info);
1752
radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
1753
radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
1754
radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
1755
radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
1756
radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
1757
radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);
1758
1759
if (is_vi) { /* DCC BASE */
1760
radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c,
1761
cb->cb_dcc_base);
1762
}
1763
}
1764
1765
if (G_028C70_DCC_ENABLE(cb_color_info)) {
1766
/* Drawing with DCC enabled also compresses colorbuffers. */
1767
VkImageSubresourceRange range = {
1768
.aspectMask = iview->aspect_mask,
1769
.baseMipLevel = iview->base_mip,
1770
.levelCount = iview->level_count,
1771
.baseArrayLayer = iview->base_layer,
1772
.layerCount = iview->layer_count,
1773
};
1774
1775
radv_update_dcc_metadata(cmd_buffer, image, &range, true);
1776
}
1777
}
1778
1779
static void
1780
radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
1781
const struct radv_image_view *iview, VkImageLayout layout,
1782
bool in_render_loop, bool requires_cond_exec)
1783
{
1784
const struct radv_image *image = iview->image;
1785
uint32_t db_z_info = ds->db_z_info;
1786
uint32_t db_z_info_reg;
1787
1788
if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug ||
1789
!radv_image_is_tc_compat_htile(image))
1790
return;
1791
1792
if (!radv_layout_is_htile_compressed(
1793
cmd_buffer->device, image, layout, in_render_loop,
1794
radv_image_queue_family_mask(image, cmd_buffer->queue_family_index,
1795
cmd_buffer->queue_family_index))) {
1796
db_z_info &= C_028040_TILE_SURFACE_ENABLE;
1797
}
1798
1799
db_z_info &= C_028040_ZRANGE_PRECISION;
1800
1801
if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1802
db_z_info_reg = R_028038_DB_Z_INFO;
1803
} else {
1804
db_z_info_reg = R_028040_DB_Z_INFO;
1805
}
1806
1807
/* When we don't know the last fast clear value, we need to emit a
1808
* conditional packet that will eventually skip the following
1809
* SET_CONTEXT_REG packet.
1810
*/
1811
if (requires_cond_exec) {
1812
uint64_t va = radv_get_tc_compat_zrange_va(image, iview->base_mip);
1813
1814
radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0));
1815
radeon_emit(cmd_buffer->cs, va);
1816
radeon_emit(cmd_buffer->cs, va >> 32);
1817
radeon_emit(cmd_buffer->cs, 0);
1818
radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */
1819
}
1820
1821
radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
1822
}
1823
1824
static void
1825
radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
1826
struct radv_image_view *iview, VkImageLayout layout, bool in_render_loop)
1827
{
1828
const struct radv_image *image = iview->image;
1829
uint32_t db_z_info = ds->db_z_info;
1830
uint32_t db_stencil_info = ds->db_stencil_info;
1831
1832
if (!radv_layout_is_htile_compressed(
1833
cmd_buffer->device, image, layout, in_render_loop,
1834
radv_image_queue_family_mask(image, cmd_buffer->queue_family_index,
1835
cmd_buffer->queue_family_index))) {
1836
db_z_info &= C_028040_TILE_SURFACE_ENABLE;
1837
db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
1838
}
1839
1840
radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
1841
radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface);
1842
1843
if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
1844
radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
1845
radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->db_depth_size);
1846
1847
radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7);
1848
radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1));
1849
radeon_emit(cmd_buffer->cs, db_z_info);
1850
radeon_emit(cmd_buffer->cs, db_stencil_info);
1851
radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
1852
radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
1853
radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
1854
radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
1855
1856
radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5);
1857
radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
1858
radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
1859
radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
1860
radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
1861
radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32);
1862
} else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
1863
radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
1864
radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
1865
radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(ds->db_htile_data_base >> 32));
1866
radeon_emit(cmd_buffer->cs, ds->db_depth_size);
1867
1868
radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
1869
radeon_emit(cmd_buffer->cs, db_z_info); /* DB_Z_INFO */
1870
radeon_emit(cmd_buffer->cs, db_stencil_info); /* DB_STENCIL_INFO */
1871
radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* DB_Z_READ_BASE */
1872
radeon_emit(cmd_buffer->cs,
1873
S_028044_BASE_HI(ds->db_z_read_base >> 32)); /* DB_Z_READ_BASE_HI */
1874
radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* DB_STENCIL_READ_BASE */
1875
radeon_emit(cmd_buffer->cs,
1876
S_02804C_BASE_HI(ds->db_stencil_read_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
1877
radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* DB_Z_WRITE_BASE */
1878
radeon_emit(cmd_buffer->cs,
1879
S_028054_BASE_HI(ds->db_z_write_base >> 32)); /* DB_Z_WRITE_BASE_HI */
1880
radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* DB_STENCIL_WRITE_BASE */
1881
radeon_emit(cmd_buffer->cs,
1882
S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
1883
1884
radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
1885
radeon_emit(cmd_buffer->cs, ds->db_z_info2);
1886
radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
1887
} else {
1888
radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
1889
1890
radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
1891
radeon_emit(cmd_buffer->cs, ds->db_depth_info); /* R_02803C_DB_DEPTH_INFO */
1892
radeon_emit(cmd_buffer->cs, db_z_info); /* R_028040_DB_Z_INFO */
1893
radeon_emit(cmd_buffer->cs, db_stencil_info); /* R_028044_DB_STENCIL_INFO */
1894
radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* R_028048_DB_Z_READ_BASE */
1895
radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base); /* R_02804C_DB_STENCIL_READ_BASE */
1896
radeon_emit(cmd_buffer->cs, ds->db_z_write_base); /* R_028050_DB_Z_WRITE_BASE */
1897
radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */
1898
radeon_emit(cmd_buffer->cs, ds->db_depth_size); /* R_028058_DB_DEPTH_SIZE */
1899
radeon_emit(cmd_buffer->cs, ds->db_depth_slice); /* R_02805C_DB_DEPTH_SLICE */
1900
}
1901
1902
/* Update the ZRANGE_PRECISION value for the TC-compat bug. */
1903
radv_update_zrange_precision(cmd_buffer, ds, iview, layout, in_render_loop, true);
1904
1905
radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
1906
ds->pa_su_poly_offset_db_fmt_cntl);
1907
}
1908
1909
/**
1910
* Update the fast clear depth/stencil values if the image is bound as a
1911
* depth/stencil buffer.
1912
*/
1913
static void
1914
radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer,
1915
const struct radv_image_view *iview,
1916
VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
1917
{
1918
const struct radv_subpass *subpass = cmd_buffer->state.subpass;
1919
const struct radv_image *image = iview->image;
1920
struct radeon_cmdbuf *cs = cmd_buffer->cs;
1921
uint32_t att_idx;
1922
1923
if (!cmd_buffer->state.attachments || !subpass)
1924
return;
1925
1926
if (!subpass->depth_stencil_attachment)
1927
return;
1928
1929
att_idx = subpass->depth_stencil_attachment->attachment;
1930
if (cmd_buffer->state.attachments[att_idx].iview->image != image)
1931
return;
1932
1933
if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
1934
radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
1935
radeon_emit(cs, ds_clear_value.stencil);
1936
radeon_emit(cs, fui(ds_clear_value.depth));
1937
} else if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
1938
radeon_set_context_reg_seq(cs, R_02802C_DB_DEPTH_CLEAR, 1);
1939
radeon_emit(cs, fui(ds_clear_value.depth));
1940
} else {
1941
assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
1942
radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 1);
1943
radeon_emit(cs, ds_clear_value.stencil);
1944
}
1945
1946
/* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
1947
* only needed when clearing Z to 0.0.
1948
*/
1949
if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && ds_clear_value.depth == 0.0) {
1950
VkImageLayout layout = subpass->depth_stencil_attachment->layout;
1951
bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
1952
1953
radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.attachments[att_idx].ds, iview,
1954
layout, in_render_loop, false);
1955
}
1956
1957
cmd_buffer->state.context_roll_without_scissor_emitted = true;
1958
}
1959
1960
/**
1961
* Set the clear depth/stencil values to the image's metadata.
1962
*/
1963
static void
1964
radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
1965
const VkImageSubresourceRange *range,
1966
VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
1967
{
1968
struct radeon_cmdbuf *cs = cmd_buffer->cs;
1969
uint32_t level_count = radv_get_levelCount(image, range);
1970
1971
if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
1972
uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel);
1973
1974
/* Use the fastest way when both aspects are used. */
1975
radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + 2 * level_count, cmd_buffer->state.predicating));
1976
radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
1977
radeon_emit(cs, va);
1978
radeon_emit(cs, va >> 32);
1979
1980
for (uint32_t l = 0; l < level_count; l++) {
1981
radeon_emit(cs, ds_clear_value.stencil);
1982
radeon_emit(cs, fui(ds_clear_value.depth));
1983
}
1984
} else {
1985
/* Otherwise we need one WRITE_DATA packet per level. */
1986
for (uint32_t l = 0; l < level_count; l++) {
1987
uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l);
1988
unsigned value;
1989
1990
if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
1991
value = fui(ds_clear_value.depth);
1992
va += 4;
1993
} else {
1994
assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
1995
value = ds_clear_value.stencil;
1996
}
1997
1998
radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating));
1999
radeon_emit(cs,
2000
S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2001
radeon_emit(cs, va);
2002
radeon_emit(cs, va >> 32);
2003
radeon_emit(cs, value);
2004
}
2005
}
2006
}
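
/* Illustrative sketch (not part of the driver): the per-level depth/stencil clear
 * metadata written above is laid out as a 32-bit stencil word followed by the float
 * depth value reinterpreted as 32 bits, which is why the depth-only path adds 4 to
 * the VA. The struct and helper below are illustrative only and assume <stdint.h>
 * and <string.h>; fui() in the driver performs the same float-to-bits conversion.
 */
struct example_ds_clear_words {
   uint32_t stencil;    /* offset 0 within the level's metadata slot */
   uint32_t depth_bits; /* offset 4: float depth, bit-cast to uint32_t */
};

static inline uint32_t
example_float_to_bits(float f)
{
   uint32_t bits;
   memcpy(&bits, &f, sizeof(bits)); /* reinterpret the bits, don't convert the value */
   return bits;
}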
2007
2008
/**
2009
* Update the TC-compat metadata value for this image.
2010
*/
2011
static void
2012
radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2013
const VkImageSubresourceRange *range, uint32_t value)
2014
{
2015
struct radeon_cmdbuf *cs = cmd_buffer->cs;
2016
2017
if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug)
2018
return;
2019
2020
uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel);
2021
uint32_t level_count = radv_get_levelCount(image, range);
2022
2023
radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + level_count, cmd_buffer->state.predicating));
2024
radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2025
radeon_emit(cs, va);
2026
radeon_emit(cs, va >> 32);
2027
2028
for (uint32_t l = 0; l < level_count; l++)
2029
radeon_emit(cs, value);
2030
}
2031
2032
static void
2033
radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
2034
const struct radv_image_view *iview,
2035
VkClearDepthStencilValue ds_clear_value)
2036
{
2037
VkImageSubresourceRange range = {
2038
.aspectMask = iview->aspect_mask,
2039
.baseMipLevel = iview->base_mip,
2040
.levelCount = iview->level_count,
2041
.baseArrayLayer = iview->base_layer,
2042
.layerCount = iview->layer_count,
2043
};
2044
uint32_t cond_val;
2045
2046
/* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
2047
* depth clear value is 0.0f.
2048
*/
2049
cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;
2050
2051
radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range, cond_val);
2052
}
2053
2054
/**
2055
* Update the clear depth/stencil values for this image.
2056
*/
2057
void
2058
radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
2059
const struct radv_image_view *iview,
2060
VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
2061
{
2062
VkImageSubresourceRange range = {
2063
.aspectMask = iview->aspect_mask,
2064
.baseMipLevel = iview->base_mip,
2065
.levelCount = iview->level_count,
2066
.baseArrayLayer = iview->base_layer,
2067
.layerCount = iview->layer_count,
2068
};
2069
struct radv_image *image = iview->image;
2070
2071
assert(radv_htile_enabled(image, range.baseMipLevel));
2072
2073
radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range, ds_clear_value, aspects);
2074
2075
if (radv_image_is_tc_compat_htile(image) && (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
2076
radv_update_tc_compat_zrange_metadata(cmd_buffer, iview, ds_clear_value);
2077
}
2078
2079
radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value, aspects);
2080
}
2081
2082
/**
2083
* Load the clear depth/stencil values from the image's metadata.
2084
*/
2085
static void
2086
radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview)
2087
{
2088
struct radeon_cmdbuf *cs = cmd_buffer->cs;
2089
const struct radv_image *image = iview->image;
2090
VkImageAspectFlags aspects = vk_format_aspects(image->vk_format);
2091
uint64_t va = radv_get_ds_clear_value_va(image, iview->base_mip);
2092
unsigned reg_offset = 0, reg_count = 0;
2093
2094
assert(radv_image_has_htile(image));
2095
2096
if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
2097
++reg_count;
2098
} else {
2099
++reg_offset;
2100
va += 4;
2101
}
2102
if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
2103
++reg_count;
2104
2105
uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;
2106
2107
if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
2108
radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
2109
radeon_emit(cs, va);
2110
radeon_emit(cs, va >> 32);
2111
radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
2112
radeon_emit(cs, reg_count);
2113
} else {
2114
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2115
radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
2116
(reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
2117
radeon_emit(cs, va);
2118
radeon_emit(cs, va >> 32);
2119
radeon_emit(cs, reg >> 2);
2120
radeon_emit(cs, 0);
2121
2122
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
2123
radeon_emit(cs, 0);
2124
}
2125
}
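
/* Illustrative sketch (not part of the driver): how the register offset/count and the
 * source VA adjustment above follow from the image aspects. A depth-only image skips
 * the leading stencil word (VA + 4); when both aspects exist, two consecutive
 * registers are loaded. The helper and its out-parameters are assumptions for
 * illustration; it assumes <stdbool.h> and <stdint.h>.
 */
static inline void
example_ds_clear_load_params(bool has_depth, bool has_stencil, unsigned *reg_offset,
                             unsigned *reg_count, uint64_t *va_adjust)
{
   *reg_offset = 0;
   *reg_count = 0;
   *va_adjust = 0;

   if (has_stencil) {
      (*reg_count)++;  /* DB_STENCIL_CLEAR */
   } else {
      (*reg_offset)++; /* start at DB_DEPTH_CLEAR instead */
      *va_adjust = 4;  /* skip the stencil word in the metadata */
   }
   if (has_depth)
      (*reg_count)++;  /* DB_DEPTH_CLEAR */
}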
2126
2127
/*
2128
* With DCC, some clear colors don't require CMASK elimination before the
2129
* image is used as a texture. This sets a predicate value that determines
2130
* whether the CMASK eliminate is required.
2131
*/
2132
void
2133
radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2134
const VkImageSubresourceRange *range, bool value)
2135
{
2136
if (!image->fce_pred_offset)
2137
return;
2138
2139
uint64_t pred_val = value;
2140
uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel);
2141
uint32_t level_count = radv_get_levelCount(image, range);
2142
uint32_t count = 2 * level_count;
2143
2144
radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
2145
radeon_emit(cmd_buffer->cs,
2146
S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2147
radeon_emit(cmd_buffer->cs, va);
2148
radeon_emit(cmd_buffer->cs, va >> 32);
2149
2150
for (uint32_t l = 0; l < level_count; l++) {
2151
radeon_emit(cmd_buffer->cs, pred_val);
2152
radeon_emit(cmd_buffer->cs, pred_val >> 32);
2153
}
2154
}
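
/* Illustrative sketch (not part of the driver): the FCE/DCC predicates written above
 * are 64-bit values stored per level as a low dword followed by a high dword, which
 * is why two radeon_emit() calls are issued per level. Illustrative helper; assumes
 * <stdint.h>.
 */
static inline void
example_split_predicate(uint64_t pred_val, uint32_t *lo, uint32_t *hi)
{
   *lo = (uint32_t)pred_val;         /* emitted first */
   *hi = (uint32_t)(pred_val >> 32); /* emitted second */
}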
2155
2156
/**
2157
* Update the DCC predicate to reflect the compression state.
2158
*/
2159
void
2160
radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2161
const VkImageSubresourceRange *range, bool value)
2162
{
2163
if (image->dcc_pred_offset == 0)
2164
return;
2165
2166
uint64_t pred_val = value;
2167
uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel);
2168
uint32_t level_count = radv_get_levelCount(image, range);
2169
uint32_t count = 2 * level_count;
2170
2171
assert(radv_dcc_enabled(image, range->baseMipLevel));
2172
2173
radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
2174
radeon_emit(cmd_buffer->cs,
2175
S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2176
radeon_emit(cmd_buffer->cs, va);
2177
radeon_emit(cmd_buffer->cs, va >> 32);
2178
2179
for (uint32_t l = 0; l < level_count; l++) {
2180
radeon_emit(cmd_buffer->cs, pred_val);
2181
radeon_emit(cmd_buffer->cs, pred_val >> 32);
2182
}
2183
}
2184
2185
/**
2186
* Update the fast clear color values if the image is bound as a color buffer.
2187
*/
2188
static void
2189
radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2190
int cb_idx, uint32_t color_values[2])
2191
{
2192
const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2193
struct radeon_cmdbuf *cs = cmd_buffer->cs;
2194
uint32_t att_idx;
2195
2196
if (!cmd_buffer->state.attachments || !subpass)
2197
return;
2198
2199
att_idx = subpass->color_attachments[cb_idx].attachment;
2200
if (att_idx == VK_ATTACHMENT_UNUSED)
2201
return;
2202
2203
if (cmd_buffer->state.attachments[att_idx].iview->image != image)
2204
return;
2205
2206
radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
2207
radeon_emit(cs, color_values[0]);
2208
radeon_emit(cs, color_values[1]);
2209
2210
cmd_buffer->state.context_roll_without_scissor_emitted = true;
2211
}
2212
2213
/**
2214
* Set the clear color values to the image's metadata.
2215
*/
2216
static void
2217
radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
2218
const VkImageSubresourceRange *range, uint32_t color_values[2])
2219
{
2220
struct radeon_cmdbuf *cs = cmd_buffer->cs;
2221
uint32_t level_count = radv_get_levelCount(image, range);
2222
uint32_t count = 2 * level_count;
2223
2224
assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel));
2225
2226
if (radv_image_has_clear_value(image)) {
2227
uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel);
2228
2229
radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, cmd_buffer->state.predicating));
2230
radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
2231
radeon_emit(cs, va);
2232
radeon_emit(cs, va >> 32);
2233
2234
for (uint32_t l = 0; l < level_count; l++) {
2235
radeon_emit(cs, color_values[0]);
2236
radeon_emit(cs, color_values[1]);
2237
}
2238
} else {
2239
/* No clear-value metadata; only the default (zero) clear values can be set. */
2240
assert(color_values[0] == 0 && color_values[1] == 0);
2241
}
2242
}
2243
2244
/**
2245
* Update the clear color values for this image.
2246
*/
2247
void
2248
radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
2249
const struct radv_image_view *iview, int cb_idx,
2250
uint32_t color_values[2])
2251
{
2252
struct radv_image *image = iview->image;
2253
VkImageSubresourceRange range = {
2254
.aspectMask = iview->aspect_mask,
2255
.baseMipLevel = iview->base_mip,
2256
.levelCount = iview->level_count,
2257
.baseArrayLayer = iview->base_layer,
2258
.layerCount = iview->layer_count,
2259
};
2260
2261
assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, iview->base_mip));
2262
2263
radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values);
2264
2265
radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
2266
}
2267
2268
/**
2269
* Load the clear color values from the image's metadata.
2270
*/
2271
static void
2272
radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *iview,
2273
int cb_idx)
2274
{
2275
struct radeon_cmdbuf *cs = cmd_buffer->cs;
2276
struct radv_image *image = iview->image;
2277
2278
if (!radv_image_has_cmask(image) && !radv_dcc_enabled(image, iview->base_mip))
2279
return;
2280
2281
if (!radv_image_has_clear_value(image)) {
2282
uint32_t color_values[2] = {0, 0};
2283
radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
2284
return;
2285
}
2286
2287
uint64_t va = radv_image_get_fast_clear_va(image, iview->base_mip);
2288
uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;
2289
2290
if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
2291
radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, cmd_buffer->state.predicating));
2292
radeon_emit(cs, va);
2293
radeon_emit(cs, va >> 32);
2294
radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
2295
radeon_emit(cs, 2);
2296
} else {
2297
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
2298
radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
2299
COPY_DATA_COUNT_SEL);
2300
radeon_emit(cs, va);
2301
radeon_emit(cs, va >> 32);
2302
radeon_emit(cs, reg >> 2);
2303
radeon_emit(cs, 0);
2304
2305
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
2306
radeon_emit(cs, 0);
2307
}
2308
}
2309
2310
/* GFX9+ metadata cache flushing workaround. Metadata cache coherency is
2311
* broken if the CB caches data of multiple mips of the same image at the
2312
* same time.
2313
*
2314
* Insert some flushes to avoid this.
2315
*/
2316
static void
2317
radv_emit_fb_mip_change_flush(struct radv_cmd_buffer *cmd_buffer)
2318
{
2319
struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
2320
const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2321
bool color_mip_changed = false;
2322
2323
/* Entire workaround is not applicable before GFX9 */
2324
if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9)
2325
return;
2326
2327
if (!framebuffer)
2328
return;
2329
2330
for (int i = 0; i < subpass->color_count; ++i) {
2331
int idx = subpass->color_attachments[i].attachment;
2332
if (idx == VK_ATTACHMENT_UNUSED)
2333
continue;
2334
2335
struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2336
2337
if ((radv_image_has_CB_metadata(iview->image) ||
2338
radv_dcc_enabled(iview->image, iview->base_mip) ||
2339
radv_dcc_enabled(iview->image, cmd_buffer->state.cb_mip[i])) &&
2340
cmd_buffer->state.cb_mip[i] != iview->base_mip)
2341
color_mip_changed = true;
2342
2343
cmd_buffer->state.cb_mip[i] = iview->base_mip;
2344
}
2345
2346
if (color_mip_changed) {
2347
cmd_buffer->state.flush_bits |=
2348
RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2349
}
2350
}
2351
2352
/* This function does the flushes for mip changes if any render target is bound at a
2353
* non-zero mip level. This way we can assume at the start of the next cmd_buffer
2354
* that rendering to mip 0 doesn't need any flushes. Since that is the most common
2355
* case, this saves some flushes. */
2356
static void
2357
radv_emit_mip_change_flush_default(struct radv_cmd_buffer *cmd_buffer)
2358
{
2359
/* Entire workaround is not applicable before GFX9 */
2360
if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9)
2361
return;
2362
2363
bool need_color_mip_flush = false;
2364
for (unsigned i = 0; i < 8; ++i) {
2365
if (cmd_buffer->state.cb_mip[i]) {
2366
need_color_mip_flush = true;
2367
break;
2368
}
2369
}
2370
2371
if (need_color_mip_flush) {
2372
cmd_buffer->state.flush_bits |=
2373
RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
2374
}
2375
2376
memset(cmd_buffer->state.cb_mip, 0, sizeof(cmd_buffer->state.cb_mip));
2377
}
2378
2379
static struct radv_image *
2380
radv_cmd_buffer_get_vrs_image(struct radv_cmd_buffer *cmd_buffer)
2381
{
2382
struct radv_device *device = cmd_buffer->device;
2383
2384
if (!device->vrs.image) {
2385
VkResult result;
2386
2387
/* The global VRS image is created on-demand to avoid wasting space */
2388
result = radv_device_init_vrs_image(device);
2389
if (result != VK_SUCCESS) {
2390
cmd_buffer->record_result = result;
2391
return NULL;
2392
}
2393
}
2394
2395
return device->vrs.image;
2396
}
2397
2398
static void
2399
radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
2400
{
2401
int i;
2402
struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
2403
const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2404
2405
/* This may happen for inherited secondary recording. */
2406
if (!framebuffer)
2407
return;
2408
2409
for (i = 0; i < 8; ++i) {
2410
if (i >= subpass->color_count ||
2411
subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
2412
radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
2413
S_028C70_FORMAT(V_028C70_COLOR_INVALID));
2414
continue;
2415
}
2416
2417
int idx = subpass->color_attachments[i].attachment;
2418
struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2419
VkImageLayout layout = subpass->color_attachments[i].layout;
2420
bool in_render_loop = subpass->color_attachments[i].in_render_loop;
2421
2422
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->image->bo);
2423
2424
assert(iview->aspect_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
2425
VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));
2426
radv_emit_fb_color_state(cmd_buffer, i, &cmd_buffer->state.attachments[idx].cb, iview, layout,
2427
in_render_loop, cmd_buffer->state.attachments[idx].disable_dcc);
2428
2429
radv_load_color_clear_metadata(cmd_buffer, iview, i);
2430
}
2431
2432
if (subpass->depth_stencil_attachment) {
2433
int idx = subpass->depth_stencil_attachment->attachment;
2434
VkImageLayout layout = subpass->depth_stencil_attachment->layout;
2435
bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
2436
struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
2437
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
2438
cmd_buffer->state.attachments[idx].iview->image->bo);
2439
2440
radv_emit_fb_ds_state(cmd_buffer, &cmd_buffer->state.attachments[idx].ds, iview, layout,
2441
in_render_loop);
2442
2443
if (radv_layout_is_htile_compressed(
2444
cmd_buffer->device, iview->image, layout, in_render_loop,
2445
radv_image_queue_family_mask(iview->image, cmd_buffer->queue_family_index,
2446
cmd_buffer->queue_family_index))) {
2447
/* Only load the depth/stencil fast clear values when
2448
* compressed rendering is enabled.
2449
*/
2450
radv_load_ds_clear_metadata(cmd_buffer, iview);
2451
}
2452
} else if (subpass->vrs_attachment && cmd_buffer->device->vrs.image) {
2453
/* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have to
2454
* bind our internal depth buffer that contains the VRS data as part of HTILE.
2455
*/
2456
VkImageLayout layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
2457
struct radv_image *image = cmd_buffer->device->vrs.image;
2458
struct radv_ds_buffer_info ds;
2459
struct radv_image_view iview;
2460
2461
radv_image_view_init(&iview, cmd_buffer->device,
2462
&(VkImageViewCreateInfo){
2463
.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
2464
.image = radv_image_to_handle(image),
2465
.viewType = radv_meta_get_view_type(image),
2466
.format = image->vk_format,
2467
.subresourceRange =
2468
{
2469
.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT,
2470
.baseMipLevel = 0,
2471
.levelCount = 1,
2472
.baseArrayLayer = 0,
2473
.layerCount = 1,
2474
},
2475
},
2476
NULL);
2477
2478
radv_initialise_ds_surface(cmd_buffer->device, &ds, &iview);
2479
2480
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, image->bo);
2481
2482
radv_emit_fb_ds_state(cmd_buffer, &ds, &iview, layout, false);
2483
} else {
2484
if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9)
2485
radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
2486
else
2487
radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);
2488
2489
radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */
2490
radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
2491
}
2492
radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
2493
S_028208_BR_X(framebuffer->width) | S_028208_BR_Y(framebuffer->height));
2494
2495
if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8) {
2496
bool disable_constant_encode =
2497
cmd_buffer->device->physical_device->rad_info.has_dcc_constant_encode;
2498
enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
2499
uint8_t watermark = chip_class >= GFX10 ? 6 : 4;
2500
2501
radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
2502
S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(chip_class <= GFX9) |
2503
S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
2504
S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
2505
}
2506
2507
cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
2508
}
2509
2510
static void
2511
radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer, bool indirect)
2512
{
2513
struct radeon_cmdbuf *cs = cmd_buffer->cs;
2514
struct radv_cmd_state *state = &cmd_buffer->state;
2515
2516
if (state->index_type != state->last_index_type) {
2517
if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
2518
radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
2519
R_03090C_VGT_INDEX_TYPE, 2, state->index_type);
2520
} else {
2521
radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
2522
radeon_emit(cs, state->index_type);
2523
}
2524
2525
state->last_index_type = state->index_type;
2526
}
2527
2528
/* For direct indexed draws we use DRAW_INDEX_2, which includes
2529
* the index_va and max_index_count already. */
2530
if (!indirect)
2531
return;
2532
2533
radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
2534
radeon_emit(cs, state->index_va);
2535
radeon_emit(cs, state->index_va >> 32);
2536
2537
radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
2538
radeon_emit(cs, state->max_index_count);
2539
2540
cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
2541
}
2542
2543
void
2544
radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
2545
{
2546
bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
2547
struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2548
uint32_t pa_sc_mode_cntl_1 = pipeline ? pipeline->graphics.ms.pa_sc_mode_cntl_1 : 0;
2549
uint32_t db_count_control;
2550
2551
if (!cmd_buffer->state.active_occlusion_queries) {
2552
if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
2553
if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
2554
pipeline->graphics.disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
2555
/* Re-enable out-of-order rasterization if the
2556
* bound pipeline supports it and if it has
2557
* been disabled before starting any perfect
2558
* occlusion queries.
2559
*/
2560
radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
2561
}
2562
}
2563
db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
2564
} else {
2565
const struct radv_subpass *subpass = cmd_buffer->state.subpass;
2566
uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0;
2567
bool gfx10_perfect =
2568
cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10 && has_perfect_queries;
2569
2570
if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
2571
/* Always enable PERFECT_ZPASS_COUNTS due to issues with partially
2572
* covered tiles, discards, and early depth testing. For more details,
2573
* see https://gitlab.freedesktop.org/mesa/mesa/-/issues/3218 */
2574
db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
2575
S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
2576
S_028004_SAMPLE_RATE(sample_rate) | S_028004_ZPASS_ENABLE(1) |
2577
S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);
2578
2579
if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
2580
pipeline->graphics.disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
2581
/* If the bound pipeline has enabled
2582
* out-of-order rasterization, we should
2583
* disable it before starting any perfect
2584
* occlusion queries.
2585
*/
2586
pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE;
2587
2588
radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
2589
}
2590
} else {
2591
db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) | S_028004_SAMPLE_RATE(sample_rate);
2592
}
2593
}
2594
2595
radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control);
2596
2597
cmd_buffer->state.context_roll_without_scissor_emitted = true;
2598
}
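
/* Illustrative sketch (not part of the driver): the SAMPLE_RATE field programmed above
 * is the log2 of the subpass sample count (util_logbase2 in the driver). A minimal
 * equivalent for the power-of-two sample counts Vulkan allows; assumes <stdint.h>.
 */
static inline uint32_t
example_sample_rate(uint32_t sample_count)
{
   uint32_t rate = 0;
   while (sample_count > 1) { /* e.g. 8 samples -> rate 3 */
      sample_count >>= 1;
      rate++;
   }
   return rate;
}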
2599
2600
static void
2601
radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
2602
{
2603
uint64_t states =
2604
cmd_buffer->state.dirty & cmd_buffer->state.emitted_pipeline->graphics.needed_dynamic_state;
2605
2606
if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
2607
radv_emit_viewport(cmd_buffer);
2608
2609
if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
2610
!cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
2611
radv_emit_scissor(cmd_buffer);
2612
2613
if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
2614
radv_emit_line_width(cmd_buffer);
2615
2616
if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
2617
radv_emit_blend_constants(cmd_buffer);
2618
2619
if (states &
2620
(RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
2621
RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK))
2622
radv_emit_stencil(cmd_buffer);
2623
2624
if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)
2625
radv_emit_depth_bounds(cmd_buffer);
2626
2627
if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)
2628
radv_emit_depth_bias(cmd_buffer);
2629
2630
if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE)
2631
radv_emit_discard_rectangle(cmd_buffer);
2632
2633
if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)
2634
radv_emit_sample_locations(cmd_buffer);
2635
2636
if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE)
2637
radv_emit_line_stipple(cmd_buffer);
2638
2639
if (states & (RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
2640
RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE))
2641
radv_emit_culling(cmd_buffer, states);
2642
2643
if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY)
2644
radv_emit_primitive_topology(cmd_buffer);
2645
2646
if (states &
2647
(RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
2648
RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
2649
RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP))
2650
radv_emit_depth_control(cmd_buffer, states);
2651
2652
if (states & RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP)
2653
radv_emit_stencil_control(cmd_buffer);
2654
2655
if (states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE)
2656
radv_emit_fragment_shading_rate(cmd_buffer);
2657
2658
if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE)
2659
radv_emit_primitive_restart_enable(cmd_buffer);
2660
2661
if (states & RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE)
2662
radv_emit_rasterizer_discard_enable(cmd_buffer);
2663
2664
if (states & RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP)
2665
radv_emit_logic_op(cmd_buffer);
2666
2667
if (states & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE)
2668
radv_emit_color_write_enable(cmd_buffer);
2669
2670
cmd_buffer->state.dirty &= ~states;
2671
}
2672
2673
static void
2674
radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
2675
{
2676
struct radv_descriptor_state *descriptors_state =
2677
radv_get_descriptors_state(cmd_buffer, bind_point);
2678
struct radv_descriptor_set *set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
2679
unsigned bo_offset;
2680
2681
if (!radv_cmd_buffer_upload_data(cmd_buffer, set->header.size, set->header.mapped_ptr,
2682
&bo_offset))
2683
return;
2684
2685
set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2686
set->header.va += bo_offset;
2687
}
2688
2689
static void
2690
radv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
2691
struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
2692
{
2693
struct radv_descriptor_state *descriptors_state =
2694
radv_get_descriptors_state(cmd_buffer, bind_point);
2695
uint32_t size = MAX_SETS * 4;
2696
uint32_t offset;
2697
void *ptr;
2698
2699
if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &offset, &ptr))
2700
return;
2701
2702
for (unsigned i = 0; i < MAX_SETS; i++) {
2703
uint32_t *uptr = ((uint32_t *)ptr) + i;
2704
uint64_t set_va = 0;
2705
struct radv_descriptor_set *set = descriptors_state->sets[i];
2706
if (descriptors_state->valid & (1u << i))
2707
set_va = set->header.va;
2708
uptr[0] = set_va & 0xffffffff;
2709
}
2710
2711
uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2712
va += offset;
2713
2714
if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
2715
if (pipeline->shaders[MESA_SHADER_VERTEX])
2716
radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX,
2717
AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2718
2719
if (pipeline->shaders[MESA_SHADER_FRAGMENT])
2720
radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_FRAGMENT,
2721
AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2722
2723
if (radv_pipeline_has_gs(pipeline))
2724
radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_GEOMETRY,
2725
AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2726
2727
if (radv_pipeline_has_tess(pipeline))
2728
radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_TESS_CTRL,
2729
AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2730
2731
if (radv_pipeline_has_tess(pipeline))
2732
radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_TESS_EVAL,
2733
AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2734
} else {
2735
radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_COMPUTE,
2736
AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
2737
}
2738
}
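
/* Illustrative sketch (not part of the driver): the indirect descriptor table built
 * above is one 32-bit entry per set slot, holding the low 32 bits of the set's VA for
 * valid sets and 0 otherwise. The helper below mirrors that loop with plain arrays;
 * the names and the max_sets parameter are illustrative. Assumes <stdint.h>.
 */
static inline void
example_fill_indirect_table(uint32_t *table, const uint64_t *set_vas, uint32_t valid_mask,
                            unsigned max_sets)
{
   for (unsigned i = 0; i < max_sets; i++) {
      uint64_t set_va = (valid_mask & (1u << i)) ? set_vas[i] : 0;
      table[i] = set_va & 0xffffffff; /* only the low dword is stored, as above */
   }
}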
2739
2740
static void
2741
radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
2742
struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
2743
{
2744
struct radv_descriptor_state *descriptors_state =
2745
radv_get_descriptors_state(cmd_buffer, bind_point);
2746
bool flush_indirect_descriptors;
2747
2748
if (!descriptors_state->dirty)
2749
return;
2750
2751
if (descriptors_state->push_dirty)
2752
radv_flush_push_descriptors(cmd_buffer, bind_point);
2753
2754
flush_indirect_descriptors = pipeline && pipeline->need_indirect_descriptor_sets;
2755
2756
if (flush_indirect_descriptors)
2757
radv_flush_indirect_descriptor_sets(cmd_buffer, pipeline, bind_point);
2758
2759
ASSERTED unsigned cdw_max =
2760
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MAX_SETS * MESA_SHADER_STAGES * 4);
2761
2762
if (pipeline) {
2763
if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
2764
radv_emit_descriptor_pointers(cmd_buffer, pipeline, descriptors_state,
2765
MESA_SHADER_COMPUTE);
2766
} else {
2767
radv_foreach_stage(stage, stages)
2768
{
2769
if (!cmd_buffer->state.pipeline->shaders[stage])
2770
continue;
2771
2772
radv_emit_descriptor_pointers(cmd_buffer, pipeline, descriptors_state, stage);
2773
}
2774
}
2775
}
2776
2777
descriptors_state->dirty = 0;
2778
descriptors_state->push_dirty = false;
2779
2780
assert(cmd_buffer->cs->cdw <= cdw_max);
2781
2782
if (unlikely(cmd_buffer->device->trace_bo))
2783
radv_save_descriptors(cmd_buffer, bind_point);
2784
}
2785
2786
static void
2787
radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
2788
struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
2789
{
2790
struct radv_descriptor_state *descriptors_state =
2791
radv_get_descriptors_state(cmd_buffer, bind_point);
2792
struct radv_pipeline_layout *layout = pipeline->layout;
2793
struct radv_shader_variant *shader, *prev_shader;
2794
bool need_push_constants = false;
2795
unsigned offset;
2796
void *ptr;
2797
uint64_t va;
2798
uint32_t internal_stages;
2799
uint32_t dirty_stages = 0;
2800
2801
stages &= cmd_buffer->push_constant_stages;
2802
if (!stages || (!layout->push_constant_size && !layout->dynamic_offset_count))
2803
return;
2804
2805
internal_stages = stages;
2806
switch (bind_point) {
2807
case VK_PIPELINE_BIND_POINT_GRAPHICS:
2808
break;
2809
case VK_PIPELINE_BIND_POINT_COMPUTE:
2810
dirty_stages = RADV_RT_STAGE_BITS;
2811
break;
2812
case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
2813
internal_stages = VK_SHADER_STAGE_COMPUTE_BIT;
2814
dirty_stages = VK_SHADER_STAGE_COMPUTE_BIT;
2815
break;
2816
default:
2817
unreachable("Unhandled bind point");
2818
}
2819
2820
radv_foreach_stage(stage, internal_stages)
2821
{
2822
shader = radv_get_shader(pipeline, stage);
2823
if (!shader)
2824
continue;
2825
2826
need_push_constants |= shader->info.loads_push_constants;
2827
need_push_constants |= shader->info.loads_dynamic_offsets;
2828
2829
uint8_t base = shader->info.base_inline_push_consts;
2830
uint8_t count = shader->info.num_inline_push_consts;
2831
2832
radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS, count,
2833
(uint32_t *)&cmd_buffer->push_constants[base * 4]);
2834
}
2835
2836
if (need_push_constants) {
2837
if (!radv_cmd_buffer_upload_alloc(
2838
cmd_buffer, layout->push_constant_size + 16 * layout->dynamic_offset_count, &offset,
2839
&ptr))
2840
return;
2841
2842
memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
2843
memcpy((char *)ptr + layout->push_constant_size, descriptors_state->dynamic_buffers,
2844
16 * layout->dynamic_offset_count);
2845
2846
va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2847
va += offset;
2848
2849
ASSERTED unsigned cdw_max =
2850
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MESA_SHADER_STAGES * 4);
2851
2852
prev_shader = NULL;
2853
radv_foreach_stage(stage, internal_stages)
2854
{
2855
shader = radv_get_shader(pipeline, stage);
2856
2857
/* Avoid redundantly emitting the address for merged stages. */
2858
if (shader && shader != prev_shader) {
2859
radv_emit_userdata_address(cmd_buffer, pipeline, stage, AC_UD_PUSH_CONSTANTS, va);
2860
2861
prev_shader = shader;
2862
}
2863
}
2864
assert(cmd_buffer->cs->cdw <= cdw_max);
2865
}
2866
2867
cmd_buffer->push_constant_stages &= ~stages;
2868
cmd_buffer->push_constant_stages |= dirty_stages;
2869
}
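
/* Illustrative sketch (not part of the driver): the upload above packs the push
 * constants first, then 16 bytes of dynamic-buffer data per dynamic offset, so the
 * allocation size is push_constant_size + 16 * dynamic_offset_count. The helper is an
 * illustration only; assumes <stdint.h>.
 */
static inline uint32_t
example_push_constant_upload_size(uint32_t push_constant_size, uint32_t dynamic_offset_count)
{
   /* e.g. 128 bytes of push constants and 4 dynamic offsets -> 128 + 64 = 192 bytes */
   return push_constant_size + 16 * dynamic_offset_count;
}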
2870
2871
static void
2872
radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
2873
{
2874
if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
2875
cmd_buffer->state.pipeline->vb_desc_usage_mask) {
2876
struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2877
unsigned vb_offset;
2878
void *vb_ptr;
2879
unsigned desc_index = 0;
2880
uint32_t mask = pipeline->vb_desc_usage_mask;
2881
uint64_t va;
2882
2883
/* allocate some descriptor state for vertex buffers */
2884
if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset, &vb_ptr))
2885
return;
2886
2887
while (mask) {
2888
unsigned i = u_bit_scan(&mask);
2889
uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
2890
uint32_t offset;
2891
unsigned binding = pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i;
2892
struct radv_buffer *buffer = cmd_buffer->vertex_bindings[binding].buffer;
2893
unsigned num_records;
2894
unsigned stride;
2895
2896
if (!buffer) {
2897
memset(desc, 0, 4 * 4);
2898
continue;
2899
}
2900
2901
va = radv_buffer_get_va(buffer->bo);
2902
2903
offset = cmd_buffer->vertex_bindings[binding].offset;
2904
va += offset + buffer->offset;
2905
2906
if (cmd_buffer->vertex_bindings[binding].size) {
2907
num_records = cmd_buffer->vertex_bindings[binding].size;
2908
} else {
2909
num_records = buffer->size - offset;
2910
}
2911
2912
if (pipeline->graphics.uses_dynamic_stride) {
2913
stride = cmd_buffer->vertex_bindings[binding].stride;
2914
} else {
2915
stride = pipeline->binding_stride[binding];
2916
}
2917
2918
enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
2919
if (pipeline->use_per_attribute_vb_descs) {
2920
uint32_t attrib_end = pipeline->attrib_ends[i];
2921
2922
if (num_records < attrib_end) {
2923
num_records = 0; /* not enough space for one vertex */
2924
} else if (stride == 0) {
2925
num_records = 1; /* only one vertex */
2926
} else {
2927
num_records = (num_records - attrib_end) / stride + 1;
2928
/* If attrib_offset>stride, then the compiler will increase the vertex index by
2929
* attrib_offset/stride and decrease the offset by attrib_offset%stride. This is
2930
* only allowed with static strides.
2931
*/
2932
num_records += pipeline->attrib_index_offset[i];
2933
}
2934
2935
/* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements
2936
* into bytes in that case. GFX8 always uses bytes.
2937
*/
2938
if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
2939
num_records = (num_records - 1) * stride + attrib_end;
2940
} else if (!num_records) {
2941
/* On GFX9, it seems bounds checking is disabled if both
2942
* num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
2943
* GFX10.3 but it doesn't hurt.
2944
*/
2945
memset(desc, 0, 16);
2946
continue;
2947
}
2948
} else {
2949
if (chip != GFX8 && stride)
2950
num_records = DIV_ROUND_UP(num_records, stride);
2951
}
2952
2953
uint32_t rsrc_word3 =
2954
S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
2955
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
2956
2957
if (chip >= GFX10) {
2958
/* OOB_SELECT chooses the out-of-bounds check:
2959
* - 1: index >= NUM_RECORDS (Structured)
2960
* - 3: offset >= NUM_RECORDS (Raw)
2961
*/
2962
int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
2963
2964
rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT) |
2965
S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(1);
2966
} else {
2967
rsrc_word3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
2968
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
2969
}
2970
2971
desc[0] = va;
2972
desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
2973
desc[2] = num_records;
2974
desc[3] = rsrc_word3;
2975
}
2976
2977
va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
2978
va += vb_offset;
2979
2980
radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX, AC_UD_VS_VERTEX_BUFFERS,
2981
va);
2982
2983
cmd_buffer->state.vb_va = va;
2984
cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
2985
2986
if (unlikely(cmd_buffer->device->trace_bo))
2987
radv_save_vertex_descriptors(cmd_buffer, (uintptr_t)vb_ptr);
2988
}
2989
cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
2990
}
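
/* Illustrative sketch (not part of the driver): the per-attribute num_records logic
 * above, extracted as a pure function. Given the bytes available in the binding, the
 * end offset of the attribute and the stride, it returns how many vertices can be
 * fetched (before the optional conversion back to bytes for GFX8/raw GFX10). Names
 * are illustrative; assumes <stdint.h>.
 */
static inline uint32_t
example_vertex_num_records(uint32_t avail_bytes, uint32_t attrib_end, uint32_t stride,
                           uint32_t attrib_index_offset)
{
   if (avail_bytes < attrib_end)
      return 0; /* not enough space for even one vertex */
   if (stride == 0)
      return 1; /* all vertices read the same element */

   /* One vertex fits at offset 0, plus one more for every additional stride,
    * plus the compiler's index bias for attrib_offset > stride (see the comment above).
    */
   return (avail_bytes - attrib_end) / stride + 1 + attrib_index_offset;
}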
2991
2992
static void
2993
radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
2994
{
2995
struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
2996
struct radv_userdata_info *loc;
2997
uint32_t base_reg;
2998
2999
for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
3000
if (!radv_get_shader(pipeline, stage))
3001
continue;
3002
3003
loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_STREAMOUT_BUFFERS);
3004
if (loc->sgpr_idx == -1)
3005
continue;
3006
3007
base_reg = pipeline->user_data_0[stage];
3008
3009
radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
3010
false);
3011
}
3012
3013
if (radv_pipeline_has_gs_copy_shader(pipeline)) {
3014
loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS];
3015
if (loc->sgpr_idx != -1) {
3016
base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
3017
3018
radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
3019
va, false);
3020
}
3021
}
3022
}
3023
3024
static void
3025
radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
3026
{
3027
if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
3028
struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
3029
struct radv_streamout_state *so = &cmd_buffer->state.streamout;
3030
unsigned so_offset;
3031
void *so_ptr;
3032
uint64_t va;
3033
3034
/* Allocate some descriptor state for streamout buffers. */
3035
if (!radv_cmd_buffer_upload_alloc(cmd_buffer, MAX_SO_BUFFERS * 16, &so_offset, &so_ptr))
3036
return;
3037
3038
for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
3039
struct radv_buffer *buffer = sb[i].buffer;
3040
uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];
3041
3042
if (!(so->enabled_mask & (1 << i)))
3043
continue;
3044
3045
va = radv_buffer_get_va(buffer->bo) + buffer->offset;
3046
3047
va += sb[i].offset;
3048
3049
/* Set the descriptor.
*
* On GFX8, the format must be non-INVALID, otherwise
* the buffer will be considered not bound and store
* instructions will be no-ops.
*/
3055
uint32_t size = 0xffffffff;
3056
3057
/* Compute the correct buffer size for NGG streamout
* because it's used to determine the max emit per
* buffer.
*/
3061
if (cmd_buffer->device->physical_device->use_ngg_streamout)
3062
size = buffer->size - sb[i].offset;
3063
3064
uint32_t rsrc_word3 =
3065
S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3066
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3067
3068
if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
3069
rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
3070
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
3071
} else {
3072
rsrc_word3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3073
}
3074
3075
desc[0] = va;
3076
desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
3077
desc[2] = size;
3078
desc[3] = rsrc_word3;
3079
}
3080
3081
va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
3082
va += so_offset;
3083
3084
radv_emit_streamout_buffers(cmd_buffer, va);
3085
}
3086
3087
cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
3088
}
3089
3090
static void
3091
radv_flush_ngg_gs_state(struct radv_cmd_buffer *cmd_buffer)
3092
{
3093
struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
3094
struct radv_userdata_info *loc;
3095
uint32_t ngg_gs_state = 0;
3096
uint32_t base_reg;
3097
3098
if (!radv_pipeline_has_gs(pipeline) || !pipeline->graphics.is_ngg)
3099
return;
3100
3101
/* By default NGG GS queries are disabled but they are enabled if the
* command buffer has active GDS queries or if it's a secondary command
* buffer that inherits the number of generated primitives.
*/
3105
if (cmd_buffer->state.active_pipeline_gds_queries ||
3106
(cmd_buffer->state.inherited_pipeline_statistics &
3107
VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))
3108
ngg_gs_state = 1;
3109
3110
loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_GEOMETRY, AC_UD_NGG_GS_STATE);
3111
base_reg = pipeline->user_data_0[MESA_SHADER_GEOMETRY];
3112
assert(loc->sgpr_idx != -1);
3113
3114
radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, ngg_gs_state);
3115
}
3116
3117
static void
3118
radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
3119
{
3120
radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty);
3121
radv_flush_streamout_descriptors(cmd_buffer);
3122
radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS, cmd_buffer->state.pipeline,
3123
VK_PIPELINE_BIND_POINT_GRAPHICS);
3124
radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS, cmd_buffer->state.pipeline,
3125
VK_PIPELINE_BIND_POINT_GRAPHICS);
3126
radv_flush_ngg_gs_state(cmd_buffer);
3127
}
3128
3129
struct radv_draw_info {
/**
* Number of vertices.
*/
uint32_t count;

/**
* First instance id.
*/
uint32_t first_instance;

/**
* Number of instances.
*/
uint32_t instance_count;

/**
* Whether it's an indexed draw.
*/
bool indexed;

/**
* Indirect draw parameters resource.
*/
struct radv_buffer *indirect;
uint64_t indirect_offset;
uint32_t stride;

/**
* Draw count parameters resource.
*/
struct radv_buffer *count_buffer;
uint64_t count_buffer_offset;

/**
* Stream output parameters resource.
*/
struct radv_buffer *strmout_buffer;
uint64_t strmout_buffer_offset;
};
3169
3170
static uint32_t
3171
radv_get_primitive_reset_index(struct radv_cmd_buffer *cmd_buffer)
3172
{
3173
switch (cmd_buffer->state.index_type) {
3174
case V_028A7C_VGT_INDEX_8:
3175
return 0xffu;
3176
case V_028A7C_VGT_INDEX_16:
3177
return 0xffffu;
3178
case V_028A7C_VGT_INDEX_32:
3179
return 0xffffffffu;
3180
default:
3181
unreachable("invalid index type");
3182
}
3183
}
3184
3185
static void
3186
si_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw,
3187
bool indirect_draw, bool count_from_stream_output,
3188
uint32_t draw_vertex_count)
3189
{
3190
struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
3191
struct radv_cmd_state *state = &cmd_buffer->state;
3192
unsigned topology = state->dynamic.primitive_topology;
3193
bool prim_restart_enable = state->dynamic.primitive_restart_enable;
3194
struct radeon_cmdbuf *cs = cmd_buffer->cs;
3195
unsigned ia_multi_vgt_param;
3196
3197
ia_multi_vgt_param =
3198
si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, count_from_stream_output,
3199
draw_vertex_count, topology, prim_restart_enable);
3200
3201
if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
3202
if (info->chip_class == GFX9) {
3203
radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
3204
R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param);
3205
} else if (info->chip_class >= GFX7) {
3206
radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
3207
} else {
3208
radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
3209
}
3210
state->last_ia_multi_vgt_param = ia_multi_vgt_param;
3211
}
3212
}
3213
3214
static void
3215
radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
3216
{
3217
struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
3218
struct radv_cmd_state *state = &cmd_buffer->state;
3219
struct radeon_cmdbuf *cs = cmd_buffer->cs;
3220
3221
/* Draw state. */
3222
if (info->chip_class < GFX10) {
3223
si_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, draw_info->indirect,
3224
!!draw_info->strmout_buffer,
3225
draw_info->indirect ? 0 : draw_info->count);
3226
}
3227
3228
if (state->dynamic.primitive_restart_enable) {
3229
uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
3230
3231
if (primitive_reset_index != state->last_primitive_reset_index) {
3232
radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, primitive_reset_index);
3233
state->last_primitive_reset_index = primitive_reset_index;
3234
}
3235
}
3236
3237
if (draw_info->strmout_buffer) {
3238
uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
3239
3240
va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset;
3241
3242
radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride);
3243
3244
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
3245
radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
3246
COPY_DATA_WR_CONFIRM);
3247
radeon_emit(cs, va);
3248
radeon_emit(cs, va >> 32);
3249
radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
3250
radeon_emit(cs, 0); /* unused */
3251
3252
radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
3253
}
3254
}
3255
3256
static void
3257
radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags src_stage_mask)
3258
{
3259
if (src_stage_mask &
3260
(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT |
3261
VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
3262
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
3263
}
3264
3265
if (src_stage_mask &
3266
(VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
3267
VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT |
3268
VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
3269
VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
3270
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
3271
} else if (src_stage_mask &
3272
(VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
3273
VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
3274
VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
3275
VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
3276
VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
3277
VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT)) {
3278
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
3279
}
3280
}
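/* Illustrative mapping for radv_stage_flush() above: a srcStageMask of
* VK_PIPELINE_STAGE_VERTEX_SHADER_BIT only sets RADV_CMD_FLAG_VS_PARTIAL_FLUSH,
* VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT sets RADV_CMD_FLAG_PS_PARTIAL_FLUSH, and
* VK_PIPELINE_STAGE_ALL_COMMANDS_BIT sets both RADV_CMD_FLAG_CS_PARTIAL_FLUSH
* and RADV_CMD_FLAG_PS_PARTIAL_FLUSH.
*/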
3281
3282
enum radv_cmd_flush_bits
3283
radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags src_flags,
3284
const struct radv_image *image)
3285
{
3286
bool has_CB_meta = true, has_DB_meta = true;
3287
bool image_is_coherent = image ? image->l2_coherent : false;
3288
enum radv_cmd_flush_bits flush_bits = 0;
3289
3290
if (image) {
3291
if (!radv_image_has_CB_metadata(image))
3292
has_CB_meta = false;
3293
if (!radv_image_has_htile(image))
3294
has_DB_meta = false;
3295
}
3296
3297
u_foreach_bit(b, src_flags)
3298
{
3299
switch ((VkAccessFlagBits)(1 << b)) {
3300
case VK_ACCESS_SHADER_WRITE_BIT:
3301
/* Since the STORAGE bit isn't set, we know that this is a meta operation.
* On the dst flush side we skip CB/DB flushes without the STORAGE bit, so
* set it here. */
3304
if (image && !(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
3305
if (vk_format_is_depth_or_stencil(image->vk_format)) {
3306
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3307
} else {
3308
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
3309
}
3310
}
3311
if (!image_is_coherent)
3312
flush_bits |= RADV_CMD_FLAG_WB_L2;
3313
break;
3314
case VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
3315
case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
3316
if (!image_is_coherent)
3317
flush_bits |= RADV_CMD_FLAG_WB_L2;
3318
break;
3319
case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
3320
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
3321
if (has_CB_meta)
3322
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3323
break;
3324
case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
3325
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3326
if (has_DB_meta)
3327
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3328
break;
3329
case VK_ACCESS_TRANSFER_WRITE_BIT:
3330
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3331
3332
if (!image_is_coherent)
3333
flush_bits |= RADV_CMD_FLAG_INV_L2;
3334
if (has_CB_meta)
3335
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3336
if (has_DB_meta)
3337
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3338
break;
3339
case VK_ACCESS_MEMORY_WRITE_BIT:
3340
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3341
3342
if (!image_is_coherent)
3343
flush_bits |= RADV_CMD_FLAG_INV_L2;
3344
if (has_CB_meta)
3345
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3346
if (has_DB_meta)
3347
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3348
break;
3349
default:
3350
break;
3351
}
3352
}
3353
return flush_bits;
3354
}
3355
3356
enum radv_cmd_flush_bits
3357
radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags dst_flags,
3358
const struct radv_image *image)
3359
{
3360
bool has_CB_meta = true, has_DB_meta = true;
3361
enum radv_cmd_flush_bits flush_bits = 0;
3362
bool flush_CB = true, flush_DB = true;
3363
bool image_is_coherent = image ? image->l2_coherent : false;
3364
3365
if (image) {
3366
if (!(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
3367
flush_CB = false;
3368
flush_DB = false;
3369
}
3370
3371
if (!radv_image_has_CB_metadata(image))
3372
has_CB_meta = false;
3373
if (!radv_image_has_htile(image))
3374
has_DB_meta = false;
3375
}
3376
3377
u_foreach_bit(b, dst_flags)
3378
{
3379
switch ((VkAccessFlagBits)(1 << b)) {
3380
case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
3381
case VK_ACCESS_INDEX_READ_BIT:
3382
case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
3383
break;
3384
case VK_ACCESS_UNIFORM_READ_BIT:
3385
flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
3386
break;
3387
case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
3388
case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
3389
case VK_ACCESS_TRANSFER_READ_BIT:
3390
case VK_ACCESS_TRANSFER_WRITE_BIT:
3391
flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
3392
3393
if (has_CB_meta || has_DB_meta)
3394
flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
3395
if (!image_is_coherent)
3396
flush_bits |= RADV_CMD_FLAG_INV_L2;
3397
break;
3398
case VK_ACCESS_SHADER_READ_BIT:
3399
flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
3400
/* Unlike LLVM, ACO uses SMEM for SSBOs and we have to
* invalidate the scalar cache. */
3402
if (!cmd_buffer->device->physical_device->use_llvm && !image)
3403
flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
3404
3405
if (has_CB_meta || has_DB_meta)
3406
flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
3407
if (!image_is_coherent)
3408
flush_bits |= RADV_CMD_FLAG_INV_L2;
3409
break;
3410
case VK_ACCESS_SHADER_WRITE_BIT:
3411
break;
3412
case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
3413
case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
3414
if (flush_CB)
3415
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
3416
if (has_CB_meta)
3417
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3418
break;
3419
case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
3420
case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
3421
if (flush_DB)
3422
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3423
if (has_DB_meta)
3424
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3425
break;
3426
case VK_ACCESS_MEMORY_READ_BIT:
3427
case VK_ACCESS_MEMORY_WRITE_BIT:
3428
flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
3429
if (!image_is_coherent)
3430
flush_bits |= RADV_CMD_FLAG_INV_L2;
3431
if (flush_CB)
3432
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
3433
if (has_CB_meta)
3434
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
3435
if (flush_DB)
3436
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
3437
if (has_DB_meta)
3438
flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
3439
break;
3440
default:
3441
break;
3442
}
3443
}
3444
return flush_bits;
3445
}
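/* Putting radv_src_access_flush() and radv_dst_access_flush() together, an
* illustrative barrier from VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT to
* VK_ACCESS_SHADER_READ_BIT on a non-L2-coherent color image with CB metadata
* resolves to FLUSH_AND_INV_CB | FLUSH_AND_INV_CB_META on the source side and
* INV_VCACHE | INV_L2_METADATA | INV_L2 on the destination side (the scalar
* cache is only invalidated for the no-image case when ACO is used).
*/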
3446
3447
void
3448
radv_subpass_barrier(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass_barrier *barrier)
3449
{
3450
struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
3451
if (fb && !fb->imageless) {
3452
for (int i = 0; i < fb->attachment_count; ++i) {
3453
cmd_buffer->state.flush_bits |=
3454
radv_src_access_flush(cmd_buffer, barrier->src_access_mask, fb->attachments[i]->image);
3455
}
3456
} else {
3457
cmd_buffer->state.flush_bits |=
3458
radv_src_access_flush(cmd_buffer, barrier->src_access_mask, NULL);
3459
}
3460
3461
radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
3462
3463
if (fb && !fb->imageless) {
3464
for (int i = 0; i < fb->attachment_count; ++i) {
3465
cmd_buffer->state.flush_bits |=
3466
radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, fb->attachments[i]->image);
3467
}
3468
} else {
3469
cmd_buffer->state.flush_bits |=
3470
radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, NULL);
3471
}
3472
}
3473
3474
uint32_t
3475
radv_get_subpass_id(struct radv_cmd_buffer *cmd_buffer)
3476
{
3477
struct radv_cmd_state *state = &cmd_buffer->state;
3478
uint32_t subpass_id = state->subpass - state->pass->subpasses;
3479
3480
/* The id of this subpass shouldn't exceed the number of subpasses in
3481
* this render pass minus 1.
3482
*/
3483
assert(subpass_id < state->pass->subpass_count);
3484
return subpass_id;
3485
}
3486
3487
static struct radv_sample_locations_state *
3488
radv_get_attachment_sample_locations(struct radv_cmd_buffer *cmd_buffer, uint32_t att_idx,
3489
bool begin_subpass)
3490
{
3491
struct radv_cmd_state *state = &cmd_buffer->state;
3492
uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
3493
struct radv_image_view *view = state->attachments[att_idx].iview;
3494
3495
if (view->image->info.samples == 1)
3496
return NULL;
3497
3498
if (state->pass->attachments[att_idx].first_subpass_idx == subpass_id) {
3499
/* Return the initial sample locations if this is the initial
* layout transition of the given subpass attachment.
*/
3502
if (state->attachments[att_idx].sample_location.count > 0)
3503
return &state->attachments[att_idx].sample_location;
3504
} else {
3505
/* Otherwise return the subpass sample locations if defined. */
3506
if (state->subpass_sample_locs) {
3507
/* Because the driver sets the current subpass before
* initial layout transitions, we should use the sample
* locations from the previous subpass to avoid an
* off-by-one problem. Otherwise, use the sample
* locations for the current subpass for final layout
* transitions.
*/
3514
if (begin_subpass)
3515
subpass_id--;
3516
3517
for (uint32_t i = 0; i < state->num_subpass_sample_locs; i++) {
3518
if (state->subpass_sample_locs[i].subpass_idx == subpass_id)
3519
return &state->subpass_sample_locs[i].sample_location;
3520
}
3521
}
3522
}
3523
3524
return NULL;
3525
}
3526
3527
static void
3528
radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer,
3529
struct radv_subpass_attachment att, bool begin_subpass)
3530
{
3531
unsigned idx = att.attachment;
3532
struct radv_image_view *view = cmd_buffer->state.attachments[idx].iview;
3533
struct radv_sample_locations_state *sample_locs;
3534
VkImageSubresourceRange range;
3535
range.aspectMask = view->aspect_mask;
3536
range.baseMipLevel = view->base_mip;
3537
range.levelCount = 1;
3538
range.baseArrayLayer = view->base_layer;
3539
range.layerCount = cmd_buffer->state.framebuffer->layers;
3540
3541
if (cmd_buffer->state.subpass->view_mask) {
3542
/* If the current subpass uses multiview, the driver might have
* performed a fast color/depth clear to the whole image
* (including all layers). To make sure the driver will
* decompress the image correctly (if needed), we have to
* account for the "real" number of layers. If the view mask is
* sparse, this will decompress more layers than needed.
*/
3549
range.layerCount = util_last_bit(cmd_buffer->state.subpass->view_mask);
3550
}
3551
3552
/* Get the subpass sample locations for the given attachment, if NULL
3553
* is returned the driver will use the default HW locations.
3554
*/
3555
sample_locs = radv_get_attachment_sample_locations(cmd_buffer, idx, begin_subpass);
3556
3557
/* Determine if the subpass uses separate depth/stencil layouts. */
3558
bool uses_separate_depth_stencil_layouts = false;
3559
if ((cmd_buffer->state.attachments[idx].current_layout !=
3560
cmd_buffer->state.attachments[idx].current_stencil_layout) ||
3561
(att.layout != att.stencil_layout)) {
3562
uses_separate_depth_stencil_layouts = true;
3563
}
3564
3565
/* For separate layouts, perform depth and stencil transitions
3566
* separately.
3567
*/
3568
if (uses_separate_depth_stencil_layouts &&
3569
(range.aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
3570
/* Depth-only transitions. */
3571
range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
3572
radv_handle_image_transition(cmd_buffer, view->image,
3573
cmd_buffer->state.attachments[idx].current_layout,
3574
cmd_buffer->state.attachments[idx].current_in_render_loop,
3575
att.layout, att.in_render_loop, 0, 0, &range, sample_locs);
3576
3577
/* Stencil-only transitions. */
3578
range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
3579
radv_handle_image_transition(
3580
cmd_buffer, view->image, cmd_buffer->state.attachments[idx].current_stencil_layout,
3581
cmd_buffer->state.attachments[idx].current_in_render_loop, att.stencil_layout,
3582
att.in_render_loop, 0, 0, &range, sample_locs);
3583
} else {
3584
radv_handle_image_transition(cmd_buffer, view->image,
3585
cmd_buffer->state.attachments[idx].current_layout,
3586
cmd_buffer->state.attachments[idx].current_in_render_loop,
3587
att.layout, att.in_render_loop, 0, 0, &range, sample_locs);
3588
}
3589
3590
cmd_buffer->state.attachments[idx].current_layout = att.layout;
3591
cmd_buffer->state.attachments[idx].current_stencil_layout = att.stencil_layout;
3592
cmd_buffer->state.attachments[idx].current_in_render_loop = att.in_render_loop;
3593
}
3594
3595
void
3596
radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass *subpass)
3597
{
3598
cmd_buffer->state.subpass = subpass;
3599
3600
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
3601
}
3602
3603
static VkResult
3604
radv_cmd_state_setup_sample_locations(struct radv_cmd_buffer *cmd_buffer,
3605
struct radv_render_pass *pass,
3606
const VkRenderPassBeginInfo *info)
3607
{
3608
const struct VkRenderPassSampleLocationsBeginInfoEXT *sample_locs =
3609
vk_find_struct_const(info->pNext, RENDER_PASS_SAMPLE_LOCATIONS_BEGIN_INFO_EXT);
3610
struct radv_cmd_state *state = &cmd_buffer->state;
3611
3612
if (!sample_locs) {
3613
state->subpass_sample_locs = NULL;
3614
return VK_SUCCESS;
3615
}
3616
3617
for (uint32_t i = 0; i < sample_locs->attachmentInitialSampleLocationsCount; i++) {
3618
const VkAttachmentSampleLocationsEXT *att_sample_locs =
3619
&sample_locs->pAttachmentInitialSampleLocations[i];
3620
uint32_t att_idx = att_sample_locs->attachmentIndex;
3621
struct radv_image *image = cmd_buffer->state.attachments[att_idx].iview->image;
3622
3623
assert(vk_format_is_depth_or_stencil(image->vk_format));
3624
3625
/* From the Vulkan spec 1.1.108:
3626
*
3627
* "If the image referenced by the framebuffer attachment at
3628
* index attachmentIndex was not created with
3629
* VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT
3630
* then the values specified in sampleLocationsInfo are
3631
* ignored."
3632
*/
3633
if (!(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT))
3634
continue;
3635
3636
const VkSampleLocationsInfoEXT *sample_locs_info = &att_sample_locs->sampleLocationsInfo;
3637
3638
state->attachments[att_idx].sample_location.per_pixel =
3639
sample_locs_info->sampleLocationsPerPixel;
3640
state->attachments[att_idx].sample_location.grid_size =
3641
sample_locs_info->sampleLocationGridSize;
3642
state->attachments[att_idx].sample_location.count = sample_locs_info->sampleLocationsCount;
3643
typed_memcpy(&state->attachments[att_idx].sample_location.locations[0],
3644
sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount);
3645
}
3646
3647
state->subpass_sample_locs =
3648
vk_alloc(&cmd_buffer->pool->alloc,
3649
sample_locs->postSubpassSampleLocationsCount * sizeof(state->subpass_sample_locs[0]),
3650
8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
3651
if (state->subpass_sample_locs == NULL) {
3652
cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
3653
return cmd_buffer->record_result;
3654
}
3655
3656
state->num_subpass_sample_locs = sample_locs->postSubpassSampleLocationsCount;
3657
3658
for (uint32_t i = 0; i < sample_locs->postSubpassSampleLocationsCount; i++) {
3659
const VkSubpassSampleLocationsEXT *subpass_sample_locs_info =
3660
&sample_locs->pPostSubpassSampleLocations[i];
3661
const VkSampleLocationsInfoEXT *sample_locs_info =
3662
&subpass_sample_locs_info->sampleLocationsInfo;
3663
3664
state->subpass_sample_locs[i].subpass_idx = subpass_sample_locs_info->subpassIndex;
3665
state->subpass_sample_locs[i].sample_location.per_pixel =
3666
sample_locs_info->sampleLocationsPerPixel;
3667
state->subpass_sample_locs[i].sample_location.grid_size =
3668
sample_locs_info->sampleLocationGridSize;
3669
state->subpass_sample_locs[i].sample_location.count = sample_locs_info->sampleLocationsCount;
3670
typed_memcpy(&state->subpass_sample_locs[i].sample_location.locations[0],
3671
sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount);
3672
}
3673
3674
return VK_SUCCESS;
3675
}
3676
3677
static VkResult
3678
radv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, struct radv_render_pass *pass,
3679
const VkRenderPassBeginInfo *info,
3680
const struct radv_extra_render_pass_begin_info *extra)
3681
{
3682
struct radv_cmd_state *state = &cmd_buffer->state;
3683
const struct VkRenderPassAttachmentBeginInfo *attachment_info = NULL;
3684
3685
if (info) {
3686
attachment_info = vk_find_struct_const(info->pNext, RENDER_PASS_ATTACHMENT_BEGIN_INFO);
3687
}
3688
3689
if (pass->attachment_count == 0) {
3690
state->attachments = NULL;
3691
return VK_SUCCESS;
3692
}
3693
3694
state->attachments =
3695
vk_alloc(&cmd_buffer->pool->alloc, pass->attachment_count * sizeof(state->attachments[0]), 8,
3696
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
3697
if (state->attachments == NULL) {
3698
cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
3699
return cmd_buffer->record_result;
3700
}
3701
3702
for (uint32_t i = 0; i < pass->attachment_count; ++i) {
3703
struct radv_render_pass_attachment *att = &pass->attachments[i];
3704
VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
3705
VkImageAspectFlags clear_aspects = 0;
3706
3707
if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
3708
/* color attachment */
3709
if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
3710
clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
3711
}
3712
} else {
3713
/* depthstencil attachment */
3714
if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
3715
att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
3716
clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
3717
if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
3718
att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
3719
clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
3720
}
3721
if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
3722
att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
3723
clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
3724
}
3725
}
3726
3727
state->attachments[i].pending_clear_aspects = clear_aspects;
3728
state->attachments[i].cleared_views = 0;
3729
if (clear_aspects && info) {
3730
assert(info->clearValueCount > i);
3731
state->attachments[i].clear_value = info->pClearValues[i];
3732
}
3733
3734
state->attachments[i].current_layout = att->initial_layout;
3735
state->attachments[i].current_in_render_loop = false;
3736
state->attachments[i].current_stencil_layout = att->stencil_initial_layout;
3737
state->attachments[i].disable_dcc = extra && extra->disable_dcc;
3738
state->attachments[i].sample_location.count = 0;
3739
3740
struct radv_image_view *iview;
3741
if (attachment_info && attachment_info->attachmentCount > i) {
3742
iview = radv_image_view_from_handle(attachment_info->pAttachments[i]);
3743
} else {
3744
iview = state->framebuffer->attachments[i];
3745
}
3746
3747
state->attachments[i].iview = iview;
3748
if (iview->aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
3749
radv_initialise_ds_surface(cmd_buffer->device, &state->attachments[i].ds, iview);
3750
} else {
3751
radv_initialise_color_surface(cmd_buffer->device, &state->attachments[i].cb, iview);
3752
}
3753
}
3754
3755
return VK_SUCCESS;
3756
}
3757
3758
VkResult
3759
radv_AllocateCommandBuffers(VkDevice _device, const VkCommandBufferAllocateInfo *pAllocateInfo,
3760
VkCommandBuffer *pCommandBuffers)
3761
{
3762
RADV_FROM_HANDLE(radv_device, device, _device);
3763
RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool);
3764
3765
VkResult result = VK_SUCCESS;
3766
uint32_t i;
3767
3768
for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
3769
3770
if (!list_is_empty(&pool->free_cmd_buffers)) {
3771
struct radv_cmd_buffer *cmd_buffer =
3772
list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link);
3773
3774
list_del(&cmd_buffer->pool_link);
3775
list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
3776
3777
result = radv_reset_cmd_buffer(cmd_buffer);
3778
cmd_buffer->level = pAllocateInfo->level;
3779
vk_object_base_reset(&cmd_buffer->base);
3780
3781
pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer);
3782
} else {
3783
result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level, &pCommandBuffers[i]);
3784
}
3785
if (result != VK_SUCCESS)
3786
break;
3787
}
3788
3789
if (result != VK_SUCCESS) {
3790
radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i, pCommandBuffers);
3791
3792
/* From the Vulkan 1.0.66 spec:
3793
*
3794
* "vkAllocateCommandBuffers can be used to create multiple
3795
* command buffers. If the creation of any of those command
3796
* buffers fails, the implementation must destroy all
3797
* successfully created command buffer objects from this
3798
* command, set all entries of the pCommandBuffers array to
3799
* NULL and return the error."
3800
*/
3801
memset(pCommandBuffers, 0, sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
3802
}
3803
3804
return result;
3805
}
3806
3807
void
3808
radv_FreeCommandBuffers(VkDevice device, VkCommandPool commandPool, uint32_t commandBufferCount,
3809
const VkCommandBuffer *pCommandBuffers)
3810
{
3811
for (uint32_t i = 0; i < commandBufferCount; i++) {
3812
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
3813
3814
if (cmd_buffer) {
3815
if (cmd_buffer->pool) {
3816
list_del(&cmd_buffer->pool_link);
3817
list_addtail(&cmd_buffer->pool_link, &cmd_buffer->pool->free_cmd_buffers);
3818
} else
3819
radv_destroy_cmd_buffer(cmd_buffer);
3820
}
3821
}
3822
}
3823
3824
VkResult
3825
radv_ResetCommandBuffer(VkCommandBuffer commandBuffer, VkCommandBufferResetFlags flags)
3826
{
3827
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3828
return radv_reset_cmd_buffer(cmd_buffer);
3829
}
3830
3831
VkResult
3832
radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo)
3833
{
3834
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3835
VkResult result = VK_SUCCESS;
3836
3837
if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) {
3838
/* If the command buffer has already been reset with
* vkResetCommandBuffer, no need to do it again.
*/
3841
result = radv_reset_cmd_buffer(cmd_buffer);
3842
if (result != VK_SUCCESS)
3843
return result;
3844
}
3845
3846
memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
3847
cmd_buffer->state.last_primitive_reset_en = -1;
3848
cmd_buffer->state.last_index_type = -1;
3849
cmd_buffer->state.last_num_instances = -1;
3850
cmd_buffer->state.last_vertex_offset = -1;
3851
cmd_buffer->state.last_first_instance = -1;
3852
cmd_buffer->state.last_drawid = -1;
3853
cmd_buffer->state.predication_type = -1;
3854
cmd_buffer->state.last_sx_ps_downconvert = -1;
3855
cmd_buffer->state.last_sx_blend_opt_epsilon = -1;
3856
cmd_buffer->state.last_sx_blend_opt_control = -1;
3857
cmd_buffer->state.last_nggc_settings = -1;
3858
cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
3859
cmd_buffer->usage_flags = pBeginInfo->flags;
3860
3861
if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
3862
(pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
3863
assert(pBeginInfo->pInheritanceInfo);
3864
cmd_buffer->state.framebuffer =
3865
radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
3866
cmd_buffer->state.pass =
3867
radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
3868
3869
struct radv_subpass *subpass =
3870
&cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
3871
3872
if (cmd_buffer->state.framebuffer) {
3873
result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL, NULL);
3874
if (result != VK_SUCCESS)
3875
return result;
3876
}
3877
3878
cmd_buffer->state.inherited_pipeline_statistics =
3879
pBeginInfo->pInheritanceInfo->pipelineStatistics;
3880
3881
radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
3882
}
3883
3884
if (unlikely(cmd_buffer->device->trace_bo))
3885
radv_cmd_buffer_trace_emit(cmd_buffer);
3886
3887
radv_describe_begin_cmd_buffer(cmd_buffer);
3888
3889
cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING;
3890
3891
return result;
3892
}
3893
3894
void
3895
radv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer, uint32_t firstBinding,
3896
uint32_t bindingCount, const VkBuffer *pBuffers,
3897
const VkDeviceSize *pOffsets)
3898
{
3899
radv_CmdBindVertexBuffers2EXT(commandBuffer, firstBinding, bindingCount, pBuffers, pOffsets,
3900
NULL, NULL);
3901
}
3902
3903
void
3904
radv_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer, uint32_t firstBinding,
3905
uint32_t bindingCount, const VkBuffer *pBuffers,
3906
const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes,
3907
const VkDeviceSize *pStrides)
3908
{
3909
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3910
struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
3911
bool changed = false;
3912
3913
/* We have to defer setting up the vertex buffers since we need the buffer
* strides from the pipeline. */
3915
3916
assert(firstBinding + bindingCount <= MAX_VBS);
3917
for (uint32_t i = 0; i < bindingCount; i++) {
3918
RADV_FROM_HANDLE(radv_buffer, buffer, pBuffers[i]);
3919
uint32_t idx = firstBinding + i;
3920
VkDeviceSize size = pSizes ? pSizes[i] : 0;
3921
VkDeviceSize stride = pStrides ? pStrides[i] : 0;
3922
3923
/* pSizes and pStrides are optional. */
3924
if (!changed && (vb[idx].buffer != buffer || vb[idx].offset != pOffsets[i] ||
3925
vb[idx].size != size || vb[idx].stride != stride)) {
3926
changed = true;
3927
}
3928
3929
vb[idx].buffer = buffer;
3930
vb[idx].offset = pOffsets[i];
3931
vb[idx].size = size;
3932
vb[idx].stride = stride;
3933
3934
if (buffer) {
3935
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, vb[idx].buffer->bo);
3936
}
3937
}
3938
3939
if (!changed) {
3940
/* No state changes. */
3941
return;
3942
}
3943
3944
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
3945
}
3946
3947
static uint32_t
3948
vk_to_index_type(VkIndexType type)
3949
{
3950
switch (type) {
3951
case VK_INDEX_TYPE_UINT8_EXT:
3952
return V_028A7C_VGT_INDEX_8;
3953
case VK_INDEX_TYPE_UINT16:
3954
return V_028A7C_VGT_INDEX_16;
3955
case VK_INDEX_TYPE_UINT32:
3956
return V_028A7C_VGT_INDEX_32;
3957
default:
3958
unreachable("invalid index type");
3959
}
3960
}
3961
3962
static uint32_t
3963
radv_get_vgt_index_size(uint32_t type)
3964
{
3965
switch (type) {
3966
case V_028A7C_VGT_INDEX_8:
3967
return 1;
3968
case V_028A7C_VGT_INDEX_16:
3969
return 2;
3970
case V_028A7C_VGT_INDEX_32:
3971
return 4;
3972
default:
3973
unreachable("invalid index type");
3974
}
3975
}
3976
3977
void
3978
radv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset,
3979
VkIndexType indexType)
3980
{
3981
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
3982
RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);
3983
3984
if (cmd_buffer->state.index_buffer == index_buffer && cmd_buffer->state.index_offset == offset &&
3985
cmd_buffer->state.index_type == indexType) {
3986
/* No state changes. */
3987
return;
3988
}
3989
3990
cmd_buffer->state.index_buffer = index_buffer;
3991
cmd_buffer->state.index_offset = offset;
3992
cmd_buffer->state.index_type = vk_to_index_type(indexType);
3993
cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
3994
cmd_buffer->state.index_va += index_buffer->offset + offset;
3995
3996
int index_size = radv_get_vgt_index_size(vk_to_index_type(indexType));
3997
cmd_buffer->state.max_index_count = (index_buffer->size - offset) / index_size;
3998
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
3999
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo);
4000
}
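/* Example for radv_CmdBindIndexBuffer() above (illustrative numbers): binding
* a 65536-byte index buffer at offset 256 with VK_INDEX_TYPE_UINT16 gives an
* index size of 2 bytes and max_index_count = (65536 - 256) / 2 = 32640.
*/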
4001
4002
static void
4003
radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
4004
struct radv_descriptor_set *set, unsigned idx)
4005
{
4006
struct radeon_winsys *ws = cmd_buffer->device->ws;
4007
4008
radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);
4009
4010
assert(set);
4011
assert(!(set->header.layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
4012
4013
if (!cmd_buffer->device->use_global_bo_list) {
4014
for (unsigned j = 0; j < set->header.buffer_count; ++j)
4015
if (set->descriptors[j])
4016
radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
4017
}
4018
4019
if (set->header.bo)
4020
radv_cs_add_buffer(ws, cmd_buffer->cs, set->header.bo);
4021
}
4022
4023
void
4024
radv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
4025
VkPipelineLayout _layout, uint32_t firstSet, uint32_t descriptorSetCount,
4026
const VkDescriptorSet *pDescriptorSets, uint32_t dynamicOffsetCount,
4027
const uint32_t *pDynamicOffsets)
4028
{
4029
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4030
RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
4031
unsigned dyn_idx = 0;
4032
4033
const bool no_dynamic_bounds =
4034
cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
4035
struct radv_descriptor_state *descriptors_state =
4036
radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
4037
4038
for (unsigned i = 0; i < descriptorSetCount; ++i) {
4039
unsigned set_idx = i + firstSet;
4040
RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]);
4041
4042
/* If the set is already bound we only need to update the
4043
* (potentially changed) dynamic offsets. */
4044
if (descriptors_state->sets[set_idx] != set ||
4045
!(descriptors_state->valid & (1u << set_idx))) {
4046
radv_bind_descriptor_set(cmd_buffer, pipelineBindPoint, set, set_idx);
4047
}
4048
4049
for (unsigned j = 0; j < set->header.layout->dynamic_offset_count; ++j, ++dyn_idx) {
4050
unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
4051
uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
4052
assert(dyn_idx < dynamicOffsetCount);
4053
4054
struct radv_descriptor_range *range = set->header.dynamic_descriptors + j;
4055
4056
if (!range->va) {
4057
memset(dst, 0, 4 * 4);
4058
} else {
4059
uint64_t va = range->va + pDynamicOffsets[dyn_idx];
4060
dst[0] = va;
4061
dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
4062
dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size;
4063
dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
4064
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
4065
4066
if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
4067
dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
4068
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
4069
} else {
4070
dst[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4071
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4072
}
4073
}
4074
4075
cmd_buffer->push_constant_stages |= set->header.layout->dynamic_shader_stages;
4076
}
4077
}
4078
}
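/* Each dynamic descriptor written by radv_CmdBindDescriptorSets() above takes
* four dwords in descriptors_state->dynamic_buffers. As an illustrative
* example, with range->va = 0x100000000 and a dynamic offset of 0x100 the
* descriptor base becomes 0x100000100: dst[0] holds the low 32 bits (0x100),
* dst[1] the high bits via S_008F04_BASE_ADDRESS_HI, dst[2] the range size (or
* 0xffffffff with RADV_DEBUG_NO_DYNAMIC_BOUNDS) and dst[3] the buffer format
* bits.
*/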
4079
4080
static bool
4081
radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_set *set,
4082
struct radv_descriptor_set_layout *layout,
4083
VkPipelineBindPoint bind_point)
4084
{
4085
struct radv_descriptor_state *descriptors_state =
4086
radv_get_descriptors_state(cmd_buffer, bind_point);
4087
set->header.size = layout->size;
4088
set->header.layout = layout;
4089
4090
if (descriptors_state->push_set.capacity < set->header.size) {
4091
size_t new_size = MAX2(set->header.size, 1024);
4092
new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
4093
new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);
4094
4095
free(set->header.mapped_ptr);
4096
set->header.mapped_ptr = malloc(new_size);
4097
4098
if (!set->header.mapped_ptr) {
4099
descriptors_state->push_set.capacity = 0;
4100
cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
4101
return false;
4102
}
4103
4104
descriptors_state->push_set.capacity = new_size;
4105
}
4106
4107
return true;
4108
}
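/* The push-descriptor backing storage above grows geometrically: the new
* capacity is at least 1024 bytes and at least twice the previous capacity,
* clamped to 96 * MAX_PUSH_DESCRIPTORS. As an illustrative example, a 1500-byte
* set arriving after a 1024-byte allocation grows the storage to
* MIN2(MAX2(1500, 2048), 96 * MAX_PUSH_DESCRIPTORS) = 2048 bytes, assuming the
* clamp is not hit.
*/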
4109
4110
void
4111
radv_meta_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
4112
VkPipelineBindPoint pipelineBindPoint, VkPipelineLayout _layout,
4113
uint32_t set, uint32_t descriptorWriteCount,
4114
const VkWriteDescriptorSet *pDescriptorWrites)
4115
{
4116
RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
4117
struct radv_descriptor_set *push_set =
4118
(struct radv_descriptor_set *)&cmd_buffer->meta_push_descriptors;
4119
unsigned bo_offset;
4120
4121
assert(set == 0);
4122
assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
4123
4124
push_set->header.size = layout->set[set].layout->size;
4125
push_set->header.layout = layout->set[set].layout;
4126
4127
if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->header.size, &bo_offset,
4128
(void **)&push_set->header.mapped_ptr))
4129
return;
4130
4131
push_set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
4132
push_set->header.va += bo_offset;
4133
4134
radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
4135
radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
4136
pDescriptorWrites, 0, NULL);
4137
4138
radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
4139
}
4140
4141
void
4142
radv_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
4143
VkPipelineLayout _layout, uint32_t set, uint32_t descriptorWriteCount,
4144
const VkWriteDescriptorSet *pDescriptorWrites)
4145
{
4146
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4147
RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
4148
struct radv_descriptor_state *descriptors_state =
4149
radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
4150
struct radv_descriptor_set *push_set =
4151
(struct radv_descriptor_set *)&descriptors_state->push_set.set;
4152
4153
assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
4154
4155
if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
4156
pipelineBindPoint))
4157
return;
4158
4159
/* Check that there are no inline uniform block updates when calling vkCmdPushDescriptorSetKHR()
4160
* because it is invalid, according to Vulkan spec.
4161
*/
4162
for (int i = 0; i < descriptorWriteCount; i++) {
4163
ASSERTED const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i];
4164
assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT);
4165
}
4166
4167
radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
4168
radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
4169
pDescriptorWrites, 0, NULL);
4170
4171
radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
4172
descriptors_state->push_dirty = true;
4173
}
4174
4175
void
4176
radv_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer,
4177
VkDescriptorUpdateTemplate descriptorUpdateTemplate,
4178
VkPipelineLayout _layout, uint32_t set, const void *pData)
4179
{
4180
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4181
RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
4182
RADV_FROM_HANDLE(radv_descriptor_update_template, templ, descriptorUpdateTemplate);
4183
struct radv_descriptor_state *descriptors_state =
4184
radv_get_descriptors_state(cmd_buffer, templ->bind_point);
4185
struct radv_descriptor_set *push_set =
4186
(struct radv_descriptor_set *)&descriptors_state->push_set.set;
4187
4188
assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
4189
4190
if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
4191
templ->bind_point))
4192
return;
4193
4194
radv_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
4195
descriptorUpdateTemplate, pData);
4196
4197
radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, set);
4198
descriptors_state->push_dirty = true;
4199
}
4200
4201
void
4202
radv_CmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout,
4203
VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size,
4204
const void *pValues)
4205
{
4206
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4207
memcpy(cmd_buffer->push_constants + offset, pValues, size);
4208
cmd_buffer->push_constant_stages |= stageFlags;
4209
}
4210
4211
VkResult
4212
radv_EndCommandBuffer(VkCommandBuffer commandBuffer)
4213
{
4214
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4215
4216
radv_emit_mip_change_flush_default(cmd_buffer);
4217
4218
if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) {
4219
if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX6)
4220
cmd_buffer->state.flush_bits |=
4221
RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2;
4222
4223
/* Make sure to sync all pending active queries at the end of
4224
* command buffer.
4225
*/
4226
cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
4227
4228
/* Since NGG streamout uses GDS, we need to make GDS idle when
4229
* we leave the IB, otherwise another process might overwrite
4230
* it while our shaders are busy.
4231
*/
4232
if (cmd_buffer->gds_needed)
4233
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
4234
4235
si_emit_cache_flush(cmd_buffer);
4236
}
4237
4238
/* Make sure CP DMA is idle at the end of IBs because the kernel
4239
* doesn't wait for it.
4240
*/
4241
si_cp_dma_wait_for_idle(cmd_buffer);
4242
4243
radv_describe_end_cmd_buffer(cmd_buffer);
4244
4245
vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
4246
vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);
4247
4248
VkResult result = cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs);
4249
if (result != VK_SUCCESS)
4250
return vk_error(cmd_buffer->device->instance, result);
4251
4252
cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;
4253
4254
return cmd_buffer->record_result;
4255
}
4256
4257
static void
4258
radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
4259
{
4260
if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
4261
return;
4262
4263
assert(!pipeline->ctx_cs.cdw);
4264
4265
cmd_buffer->state.emitted_compute_pipeline = pipeline;
4266
4267
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw);
4268
radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
4269
4270
cmd_buffer->compute_scratch_size_per_wave_needed =
4271
MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, pipeline->scratch_bytes_per_wave);
4272
cmd_buffer->compute_scratch_waves_wanted =
4273
MAX2(cmd_buffer->compute_scratch_waves_wanted, pipeline->max_waves);
4274
4275
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
4276
pipeline->shaders[MESA_SHADER_COMPUTE]->bo);
4277
4278
if (unlikely(cmd_buffer->device->trace_bo))
4279
radv_save_pipeline(cmd_buffer, pipeline);
4280
}
4281
4282
static void
4283
radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
4284
{
4285
struct radv_descriptor_state *descriptors_state =
4286
radv_get_descriptors_state(cmd_buffer, bind_point);
4287
4288
descriptors_state->dirty |= descriptors_state->valid;
4289
}
4290
4291
void
4292
radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
4293
VkPipeline _pipeline)
4294
{
4295
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4296
RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
4297
4298
switch (pipelineBindPoint) {
4299
case VK_PIPELINE_BIND_POINT_COMPUTE:
4300
if (cmd_buffer->state.compute_pipeline == pipeline)
4301
return;
4302
radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
4303
4304
cmd_buffer->state.compute_pipeline = pipeline;
4305
cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
4306
break;
4307
case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
4308
if (cmd_buffer->state.rt_pipeline == pipeline)
4309
return;
4310
radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
4311
4312
cmd_buffer->state.rt_pipeline = pipeline;
4313
cmd_buffer->push_constant_stages |=
4314
(VK_SHADER_STAGE_RAYGEN_BIT_KHR | VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
4315
VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | VK_SHADER_STAGE_MISS_BIT_KHR |
4316
VK_SHADER_STAGE_INTERSECTION_BIT_KHR | VK_SHADER_STAGE_CALLABLE_BIT_KHR);
4317
break;
4318
case VK_PIPELINE_BIND_POINT_GRAPHICS:
4319
if (cmd_buffer->state.pipeline == pipeline)
4320
return;
4321
radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
4322
4323
bool vtx_emit_count_changed =
4324
!pipeline || !cmd_buffer->state.pipeline ||
4325
cmd_buffer->state.pipeline->graphics.vtx_emit_num != pipeline->graphics.vtx_emit_num ||
4326
cmd_buffer->state.pipeline->graphics.vtx_base_sgpr != pipeline->graphics.vtx_base_sgpr;
4327
cmd_buffer->state.pipeline = pipeline;
4328
if (!pipeline)
4329
break;
4330
4331
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
4332
cmd_buffer->push_constant_stages |= pipeline->active_stages;
4333
4334
/* the new vertex shader might not have the same user regs */
4335
if (vtx_emit_count_changed) {
4336
cmd_buffer->state.last_first_instance = -1;
4337
cmd_buffer->state.last_vertex_offset = -1;
4338
cmd_buffer->state.last_drawid = -1;
4339
}
4340
4341
/* Prefetch all pipeline shaders at first draw time. */
4342
cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;
4343
4344
if (cmd_buffer->device->physical_device->rad_info.has_vgt_flush_ngg_legacy_bug &&
4345
cmd_buffer->state.emitted_pipeline &&
4346
cmd_buffer->state.emitted_pipeline->graphics.is_ngg &&
4347
!cmd_buffer->state.pipeline->graphics.is_ngg) {
4348
/* Transitioning from NGG to legacy GS requires
4349
* VGT_FLUSH on GFX10 and Sienna Cichlid. VGT_FLUSH
4350
* is also emitted at the beginning of IBs when legacy
4351
* GS ring pointers are set.
4352
*/
4353
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
4354
}
4355
4356
radv_bind_dynamic_state(cmd_buffer, &pipeline->dynamic_state);
4357
radv_bind_streamout_state(cmd_buffer, pipeline);
4358
4359
if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
4360
cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size;
4361
if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
4362
cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size;
4363
4364
if (radv_pipeline_has_tess(pipeline))
4365
cmd_buffer->tess_rings_needed = true;
4366
break;
4367
default:
4368
assert(!"invalid bind point");
4369
break;
4370
}
4371
}
4372
4373
void
4374
radv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount,
4375
const VkViewport *pViewports)
4376
{
4377
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4378
struct radv_cmd_state *state = &cmd_buffer->state;
4379
ASSERTED const uint32_t total_count = firstViewport + viewportCount;
4380
4381
assert(firstViewport < MAX_VIEWPORTS);
4382
assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
4383
4384
if (total_count <= state->dynamic.viewport.count &&
4385
!memcmp(state->dynamic.viewport.viewports + firstViewport, pViewports,
4386
viewportCount * sizeof(*pViewports))) {
4387
return;
4388
}
4389
4390
if (state->dynamic.viewport.count < total_count)
4391
state->dynamic.viewport.count = total_count;
4392
4393
memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
4394
viewportCount * sizeof(*pViewports));
4395
4396
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
4397
}
4398
4399
void
4400
radv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount,
4401
const VkRect2D *pScissors)
4402
{
4403
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4404
struct radv_cmd_state *state = &cmd_buffer->state;
4405
ASSERTED const uint32_t total_count = firstScissor + scissorCount;
4406
4407
assert(firstScissor < MAX_SCISSORS);
4408
assert(total_count >= 1 && total_count <= MAX_SCISSORS);
4409
4410
if (total_count <= state->dynamic.scissor.count &&
4411
!memcmp(state->dynamic.scissor.scissors + firstScissor, pScissors,
4412
scissorCount * sizeof(*pScissors))) {
4413
return;
4414
}
4415
4416
if (state->dynamic.scissor.count < total_count)
4417
state->dynamic.scissor.count = total_count;
4418
4419
memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
4420
scissorCount * sizeof(*pScissors));
4421
4422
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
4423
}
4424
4425
void
4426
radv_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
4427
{
4428
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4429
4430
if (cmd_buffer->state.dynamic.line_width == lineWidth)
4431
return;
4432
4433
cmd_buffer->state.dynamic.line_width = lineWidth;
4434
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
4435
}
4436
4437
void
4438
radv_CmdSetDepthBias(VkCommandBuffer commandBuffer, float depthBiasConstantFactor,
4439
float depthBiasClamp, float depthBiasSlopeFactor)
4440
{
4441
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4442
struct radv_cmd_state *state = &cmd_buffer->state;
4443
4444
if (state->dynamic.depth_bias.bias == depthBiasConstantFactor &&
4445
state->dynamic.depth_bias.clamp == depthBiasClamp &&
4446
state->dynamic.depth_bias.slope == depthBiasSlopeFactor) {
4447
return;
4448
}
4449
4450
state->dynamic.depth_bias.bias = depthBiasConstantFactor;
4451
state->dynamic.depth_bias.clamp = depthBiasClamp;
4452
state->dynamic.depth_bias.slope = depthBiasSlopeFactor;
4453
4454
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
4455
}
4456
4457
void
4458
radv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, const float blendConstants[4])
4459
{
4460
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4461
struct radv_cmd_state *state = &cmd_buffer->state;
4462
4463
if (!memcmp(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4))
4464
return;
4465
4466
memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4);
4467
4468
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
4469
}
4470
4471
void
4472
radv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds)
4473
{
4474
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4475
struct radv_cmd_state *state = &cmd_buffer->state;
4476
4477
if (state->dynamic.depth_bounds.min == minDepthBounds &&
4478
state->dynamic.depth_bounds.max == maxDepthBounds) {
4479
return;
4480
}
4481
4482
state->dynamic.depth_bounds.min = minDepthBounds;
4483
state->dynamic.depth_bounds.max = maxDepthBounds;
4484
4485
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
4486
}
4487
4488
void
4489
radv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
4490
uint32_t compareMask)
4491
{
4492
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4493
struct radv_cmd_state *state = &cmd_buffer->state;
4494
bool front_same = state->dynamic.stencil_compare_mask.front == compareMask;
4495
bool back_same = state->dynamic.stencil_compare_mask.back == compareMask;
4496
4497
if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
4498
(!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
4499
return;
4500
}
4501
4502
if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
4503
state->dynamic.stencil_compare_mask.front = compareMask;
4504
if (faceMask & VK_STENCIL_FACE_BACK_BIT)
4505
state->dynamic.stencil_compare_mask.back = compareMask;
4506
4507
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
4508
}
4509
4510
void
4511
radv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
4512
uint32_t writeMask)
4513
{
4514
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4515
struct radv_cmd_state *state = &cmd_buffer->state;
4516
bool front_same = state->dynamic.stencil_write_mask.front == writeMask;
4517
bool back_same = state->dynamic.stencil_write_mask.back == writeMask;
4518
4519
if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
4520
(!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
4521
return;
4522
}
4523
4524
if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
4525
state->dynamic.stencil_write_mask.front = writeMask;
4526
if (faceMask & VK_STENCIL_FACE_BACK_BIT)
4527
state->dynamic.stencil_write_mask.back = writeMask;
4528
4529
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
4530
}
4531
4532
void
4533
radv_CmdSetStencilReference(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
4534
uint32_t reference)
4535
{
4536
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4537
struct radv_cmd_state *state = &cmd_buffer->state;
4538
bool front_same = state->dynamic.stencil_reference.front == reference;
4539
bool back_same = state->dynamic.stencil_reference.back == reference;
4540
4541
if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
4542
(!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
4543
return;
4544
}
4545
4546
if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
4547
cmd_buffer->state.dynamic.stencil_reference.front = reference;
4548
if (faceMask & VK_STENCIL_FACE_BACK_BIT)
4549
cmd_buffer->state.dynamic.stencil_reference.back = reference;
4550
4551
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
4552
}
4553
4554
void
4555
radv_CmdSetDiscardRectangleEXT(VkCommandBuffer commandBuffer, uint32_t firstDiscardRectangle,
4556
uint32_t discardRectangleCount, const VkRect2D *pDiscardRectangles)
4557
{
4558
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4559
struct radv_cmd_state *state = &cmd_buffer->state;
4560
ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;
4561
4562
assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
4563
assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);
4564
4565
if (!memcmp(state->dynamic.discard_rectangle.rectangles + firstDiscardRectangle,
4566
pDiscardRectangles, discardRectangleCount * sizeof(*pDiscardRectangles))) {
4567
return;
4568
}
4569
4570
typed_memcpy(&state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle],
4571
pDiscardRectangles, discardRectangleCount);
4572
4573
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
4574
}
4575
4576
void
4577
radv_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,
4578
const VkSampleLocationsInfoEXT *pSampleLocationsInfo)
4579
{
4580
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4581
struct radv_cmd_state *state = &cmd_buffer->state;
4582
4583
assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
4584
4585
state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
4586
state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
4587
state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
4588
typed_memcpy(&state->dynamic.sample_location.locations[0],
4589
pSampleLocationsInfo->pSampleLocations, pSampleLocationsInfo->sampleLocationsCount);
4590
4591
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
4592
}
4593
4594
void
4595
radv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer, uint32_t lineStippleFactor,
4596
uint16_t lineStipplePattern)
4597
{
4598
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4599
struct radv_cmd_state *state = &cmd_buffer->state;
4600
4601
if (state->dynamic.line_stipple.factor == lineStippleFactor &&
4602
state->dynamic.line_stipple.pattern == lineStipplePattern)
4603
return;
4604
4605
state->dynamic.line_stipple.factor = lineStippleFactor;
4606
state->dynamic.line_stipple.pattern = lineStipplePattern;
4607
4608
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
4609
}
4610
4611
void
4612
radv_CmdSetCullModeEXT(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)
4613
{
4614
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4615
struct radv_cmd_state *state = &cmd_buffer->state;
4616
4617
if (state->dynamic.cull_mode == cullMode)
4618
return;
4619
4620
state->dynamic.cull_mode = cullMode;
4621
4622
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE;
4623
}
4624
4625
void
4626
radv_CmdSetFrontFaceEXT(VkCommandBuffer commandBuffer, VkFrontFace frontFace)
4627
{
4628
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4629
struct radv_cmd_state *state = &cmd_buffer->state;
4630
4631
if (state->dynamic.front_face == frontFace)
4632
return;
4633
4634
state->dynamic.front_face = frontFace;
4635
4636
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
4637
}
4638
4639
void
4640
radv_CmdSetPrimitiveTopologyEXT(VkCommandBuffer commandBuffer,
4641
VkPrimitiveTopology primitiveTopology)
4642
{
4643
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4644
struct radv_cmd_state *state = &cmd_buffer->state;
4645
unsigned primitive_topology = si_translate_prim(primitiveTopology);
4646
4647
if (state->dynamic.primitive_topology == primitive_topology)
4648
return;
4649
4650
state->dynamic.primitive_topology = primitive_topology;
4651
4652
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
4653
}
4654
4655
void
4656
radv_CmdSetViewportWithCountEXT(VkCommandBuffer commandBuffer, uint32_t viewportCount,
4657
const VkViewport *pViewports)
4658
{
4659
radv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
4660
}
4661
4662
void
4663
radv_CmdSetScissorWithCountEXT(VkCommandBuffer commandBuffer, uint32_t scissorCount,
4664
const VkRect2D *pScissors)
4665
{
4666
radv_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);
4667
}
4668
4669
void
4670
radv_CmdSetDepthTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthTestEnable)
4671
4672
{
4673
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4674
struct radv_cmd_state *state = &cmd_buffer->state;
4675
4676
if (state->dynamic.depth_test_enable == depthTestEnable)
4677
return;
4678
4679
state->dynamic.depth_test_enable = depthTestEnable;
4680
4681
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE;
4682
}
4683
4684
void
4685
radv_CmdSetDepthWriteEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthWriteEnable)
4686
{
4687
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4688
struct radv_cmd_state *state = &cmd_buffer->state;
4689
4690
if (state->dynamic.depth_write_enable == depthWriteEnable)
4691
return;
4692
4693
state->dynamic.depth_write_enable = depthWriteEnable;
4694
4695
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE;
4696
}
4697
4698
void
4699
radv_CmdSetDepthCompareOpEXT(VkCommandBuffer commandBuffer, VkCompareOp depthCompareOp)
4700
{
4701
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4702
struct radv_cmd_state *state = &cmd_buffer->state;
4703
4704
if (state->dynamic.depth_compare_op == depthCompareOp)
4705
return;
4706
4707
state->dynamic.depth_compare_op = depthCompareOp;
4708
4709
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP;
4710
}
4711
4712
void
4713
radv_CmdSetDepthBoundsTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthBoundsTestEnable)
4714
{
4715
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4716
struct radv_cmd_state *state = &cmd_buffer->state;
4717
4718
if (state->dynamic.depth_bounds_test_enable == depthBoundsTestEnable)
4719
return;
4720
4721
state->dynamic.depth_bounds_test_enable = depthBoundsTestEnable;
4722
4723
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
4724
}
4725
4726
void
4727
radv_CmdSetStencilTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 stencilTestEnable)
4728
{
4729
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4730
struct radv_cmd_state *state = &cmd_buffer->state;
4731
4732
if (state->dynamic.stencil_test_enable == stencilTestEnable)
4733
return;
4734
4735
state->dynamic.stencil_test_enable = stencilTestEnable;
4736
4737
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
4738
}
4739
4740
void
4741
radv_CmdSetStencilOpEXT(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
4742
VkStencilOp failOp, VkStencilOp passOp, VkStencilOp depthFailOp,
4743
VkCompareOp compareOp)
4744
{
4745
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4746
struct radv_cmd_state *state = &cmd_buffer->state;
4747
bool front_same = state->dynamic.stencil_op.front.fail_op == failOp &&
4748
state->dynamic.stencil_op.front.pass_op == passOp &&
4749
state->dynamic.stencil_op.front.depth_fail_op == depthFailOp &&
4750
state->dynamic.stencil_op.front.compare_op == compareOp;
4751
bool back_same = state->dynamic.stencil_op.back.fail_op == failOp &&
4752
state->dynamic.stencil_op.back.pass_op == passOp &&
4753
state->dynamic.stencil_op.back.depth_fail_op == depthFailOp &&
4754
state->dynamic.stencil_op.back.compare_op == compareOp;
4755
4756
if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
4757
(!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same))
4758
return;
4759
4760
if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
4761
state->dynamic.stencil_op.front.fail_op = failOp;
4762
state->dynamic.stencil_op.front.pass_op = passOp;
4763
state->dynamic.stencil_op.front.depth_fail_op = depthFailOp;
4764
state->dynamic.stencil_op.front.compare_op = compareOp;
4765
}
4766
4767
if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
4768
state->dynamic.stencil_op.back.fail_op = failOp;
4769
state->dynamic.stencil_op.back.pass_op = passOp;
4770
state->dynamic.stencil_op.back.depth_fail_op = depthFailOp;
4771
state->dynamic.stencil_op.back.compare_op = compareOp;
4772
}
4773
4774
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
4775
}
4776
4777
void
4778
radv_CmdSetFragmentShadingRateKHR(VkCommandBuffer commandBuffer, const VkExtent2D *pFragmentSize,
4779
const VkFragmentShadingRateCombinerOpKHR combinerOps[2])
4780
{
4781
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4782
struct radv_cmd_state *state = &cmd_buffer->state;
4783
4784
if (state->dynamic.fragment_shading_rate.size.width == pFragmentSize->width &&
4785
state->dynamic.fragment_shading_rate.size.height == pFragmentSize->height &&
4786
state->dynamic.fragment_shading_rate.combiner_ops[0] == combinerOps[0] &&
4787
state->dynamic.fragment_shading_rate.combiner_ops[1] == combinerOps[1])
4788
return;
4789
4790
state->dynamic.fragment_shading_rate.size = *pFragmentSize;
4791
for (unsigned i = 0; i < 2; i++)
4792
state->dynamic.fragment_shading_rate.combiner_ops[i] = combinerOps[i];
4793
4794
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
4795
}
4796
4797
void
4798
radv_CmdSetDepthBiasEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthBiasEnable)
4799
{
4800
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4801
struct radv_cmd_state *state = &cmd_buffer->state;
4802
4803
if (state->dynamic.depth_bias_enable == depthBiasEnable)
4804
return;
4805
4806
state->dynamic.depth_bias_enable = depthBiasEnable;
4807
4808
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE;
4809
}
4810
4811
void
4812
radv_CmdSetPrimitiveRestartEnableEXT(VkCommandBuffer commandBuffer, VkBool32 primitiveRestartEnable)
4813
{
4814
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4815
struct radv_cmd_state *state = &cmd_buffer->state;
4816
4817
if (state->dynamic.primitive_restart_enable == primitiveRestartEnable)
4818
return;
4819
4820
state->dynamic.primitive_restart_enable = primitiveRestartEnable;
4821
4822
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
4823
}
4824
4825
void
4826
radv_CmdSetRasterizerDiscardEnableEXT(VkCommandBuffer commandBuffer,
4827
VkBool32 rasterizerDiscardEnable)
4828
{
4829
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4830
struct radv_cmd_state *state = &cmd_buffer->state;
4831
4832
if (state->dynamic.rasterizer_discard_enable == rasterizerDiscardEnable)
4833
return;
4834
4835
state->dynamic.rasterizer_discard_enable = rasterizerDiscardEnable;
4836
4837
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
4838
}
4839
4840
void
4841
radv_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer, uint32_t patchControlPoints)
4842
{
4843
/* not implemented */
4844
}
4845
4846
void
4847
radv_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer, VkLogicOp logicOp)
4848
{
4849
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4850
struct radv_cmd_state *state = &cmd_buffer->state;
4851
unsigned logic_op = si_translate_blend_logic_op(logicOp);
4852
4853
if (state->dynamic.logic_op == logic_op)
4854
return;
4855
4856
state->dynamic.logic_op = logic_op;
4857
4858
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
4859
}
4860
4861
void
4862
radv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmentCount,
4863
const VkBool32 *pColorWriteEnables)
4864
{
4865
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
4866
struct radv_cmd_state *state = &cmd_buffer->state;
4867
uint32_t color_write_enable = 0;
4868
4869
assert(attachmentCount <= MAX_RTS);
4870
4871
for (uint32_t i = 0; i < attachmentCount; i++) {
4872
color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
4873
}
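/* Illustrative example of the packing above (added note, not from the
* original source): each attachment contributes a 4-bit channel mask, so
* with attachments 0 and 2 enabled and attachment 1 disabled,
* color_write_enable == 0xf0f (bits 0-3 and 8-11 set).
*/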
4874
4875
if (state->dynamic.color_write_enable == color_write_enable)
4876
return;
4877
4878
state->dynamic.color_write_enable = color_write_enable;
4879
4880
state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
4881
}
4882
4883
void
4884
radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount,
4885
const VkCommandBuffer *pCmdBuffers)
4886
{
4887
RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);
4888
4889
assert(commandBufferCount > 0);
4890
4891
radv_emit_mip_change_flush_default(primary);
4892
4893
/* Emit pending flushes on primary prior to executing secondary */
4894
si_emit_cache_flush(primary);
4895
4896
/* Make sure CP DMA is idle on primary prior to executing secondary. */
4897
si_cp_dma_wait_for_idle(primary);
4898
4899
for (uint32_t i = 0; i < commandBufferCount; i++) {
4900
RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
4901
bool allow_ib2 = true;
4902
4903
if (secondary->device->physical_device->rad_info.chip_class == GFX7 &&
4904
secondary->state.uses_draw_indirect_multi) {
4905
/* Do not launch an IB2 for secondary command buffers that contain
4906
* DRAW_{INDEX}_INDIRECT_MULTI on GFX7 because it's illegal and hangs the GPU.
4907
*/
4908
allow_ib2 = false;
4909
}
4910
4911
primary->scratch_size_per_wave_needed =
4912
MAX2(primary->scratch_size_per_wave_needed, secondary->scratch_size_per_wave_needed);
4913
primary->scratch_waves_wanted =
4914
MAX2(primary->scratch_waves_wanted, secondary->scratch_waves_wanted);
4915
primary->compute_scratch_size_per_wave_needed =
4916
MAX2(primary->compute_scratch_size_per_wave_needed,
4917
secondary->compute_scratch_size_per_wave_needed);
4918
primary->compute_scratch_waves_wanted =
4919
MAX2(primary->compute_scratch_waves_wanted, secondary->compute_scratch_waves_wanted);
4920
4921
if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
4922
primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
4923
if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
4924
primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
4925
if (secondary->tess_rings_needed)
4926
primary->tess_rings_needed = true;
4927
if (secondary->sample_positions_needed)
4928
primary->sample_positions_needed = true;
4929
if (secondary->gds_needed)
4930
primary->gds_needed = true;
4931
4932
if (!secondary->state.framebuffer && (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
4933
/* Emit the framebuffer state from primary if secondary
* has been recorded without a framebuffer, otherwise
* fast color/depth clears can't work.
*/
4937
radv_emit_fb_mip_change_flush(primary);
4938
radv_emit_framebuffer_state(primary);
4939
}
4940
4941
primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs, allow_ib2);
4942
4943
/* When the secondary command buffer is compute only we don't
4944
* need to re-emit the current graphics pipeline.
4945
*/
4946
if (secondary->state.emitted_pipeline) {
4947
primary->state.emitted_pipeline = secondary->state.emitted_pipeline;
4948
}
4949
4950
/* When the secondary command buffer is graphics only we don't
4951
* need to re-emit the current compute pipeline.
4952
*/
4953
if (secondary->state.emitted_compute_pipeline) {
4954
primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
4955
}
4956
4957
/* Only re-emit the draw packets when needed. */
4958
if (secondary->state.last_primitive_reset_en != -1) {
4959
primary->state.last_primitive_reset_en = secondary->state.last_primitive_reset_en;
4960
}
4961
4962
if (secondary->state.last_primitive_reset_index) {
4963
primary->state.last_primitive_reset_index = secondary->state.last_primitive_reset_index;
4964
}
4965
4966
if (secondary->state.last_ia_multi_vgt_param) {
4967
primary->state.last_ia_multi_vgt_param = secondary->state.last_ia_multi_vgt_param;
4968
}
4969
4970
primary->state.last_first_instance = secondary->state.last_first_instance;
4971
primary->state.last_num_instances = secondary->state.last_num_instances;
4972
primary->state.last_drawid = secondary->state.last_drawid;
4973
primary->state.last_vertex_offset = secondary->state.last_vertex_offset;
4974
primary->state.last_sx_ps_downconvert = secondary->state.last_sx_ps_downconvert;
4975
primary->state.last_sx_blend_opt_epsilon = secondary->state.last_sx_blend_opt_epsilon;
4976
primary->state.last_sx_blend_opt_control = secondary->state.last_sx_blend_opt_control;
4977
4978
if (secondary->state.last_index_type != -1) {
4979
primary->state.last_index_type = secondary->state.last_index_type;
4980
}
4981
4982
primary->state.last_nggc_settings = secondary->state.last_nggc_settings;
4983
primary->state.last_nggc_settings_sgpr_idx = secondary->state.last_nggc_settings_sgpr_idx;
4984
primary->state.last_nggc_skip = secondary->state.last_nggc_skip;
4985
}
4986
4987
/* After executing commands from secondary buffers we have to dirty
4988
* some states.
4989
*/
4990
primary->state.dirty |=
4991
RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_ALL;
4992
radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
4993
radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
4994
}
4995
4996
VkResult
4997
radv_CreateCommandPool(VkDevice _device, const VkCommandPoolCreateInfo *pCreateInfo,
4998
const VkAllocationCallbacks *pAllocator, VkCommandPool *pCmdPool)
4999
{
5000
RADV_FROM_HANDLE(radv_device, device, _device);
5001
struct radv_cmd_pool *pool;
5002
5003
pool =
5004
vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
5005
if (pool == NULL)
5006
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
5007
5008
vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_COMMAND_POOL);
5009
5010
if (pAllocator)
5011
pool->alloc = *pAllocator;
5012
else
5013
pool->alloc = device->vk.alloc;
5014
5015
list_inithead(&pool->cmd_buffers);
5016
list_inithead(&pool->free_cmd_buffers);
5017
5018
pool->queue_family_index = pCreateInfo->queueFamilyIndex;
5019
5020
*pCmdPool = radv_cmd_pool_to_handle(pool);
5021
5022
return VK_SUCCESS;
5023
}
5024
5025
void
5026
radv_DestroyCommandPool(VkDevice _device, VkCommandPool commandPool,
5027
const VkAllocationCallbacks *pAllocator)
5028
{
5029
RADV_FROM_HANDLE(radv_device, device, _device);
5030
RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
5031
5032
if (!pool)
5033
return;
5034
5035
list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link)
5036
{
5037
radv_destroy_cmd_buffer(cmd_buffer);
5038
}
5039
5040
list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link)
5041
{
5042
radv_destroy_cmd_buffer(cmd_buffer);
5043
}
5044
5045
vk_object_base_finish(&pool->base);
5046
vk_free2(&device->vk.alloc, pAllocator, pool);
5047
}
5048
5049
VkResult
5050
radv_ResetCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolResetFlags flags)
5051
{
5052
RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
5053
VkResult result;
5054
5055
list_for_each_entry(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link)
5056
{
5057
result = radv_reset_cmd_buffer(cmd_buffer);
5058
if (result != VK_SUCCESS)
5059
return result;
5060
}
5061
5062
return VK_SUCCESS;
5063
}
5064
5065
void
5066
radv_TrimCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolTrimFlags flags)
5067
{
5068
RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
5069
5070
if (!pool)
5071
return;
5072
5073
list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link)
5074
{
5075
radv_destroy_cmd_buffer(cmd_buffer);
5076
}
5077
}
5078
5079
static void
5080
radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpass_id)
5081
{
5082
struct radv_cmd_state *state = &cmd_buffer->state;
5083
struct radv_subpass *subpass = &state->pass->subpasses[subpass_id];
5084
5085
ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096);
5086
5087
radv_subpass_barrier(cmd_buffer, &subpass->start_barrier);
5088
5089
radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
5090
5091
radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);
5092
5093
for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
5094
const uint32_t a = subpass->attachments[i].attachment;
5095
if (a == VK_ATTACHMENT_UNUSED)
5096
continue;
5097
5098
radv_handle_subpass_image_transition(cmd_buffer, subpass->attachments[i], true);
5099
}
5100
5101
if (subpass->vrs_attachment) {
5102
int idx = subpass->vrs_attachment->attachment;
5103
struct radv_image_view *vrs_iview = cmd_buffer->state.attachments[idx].iview;
5104
5105
if (subpass->depth_stencil_attachment) {
5106
/* When a subpass uses a VRS attachment and a depth/stencil attachment, we just need to
5107
* copy the VRS rates to the HTILE buffer of the attachment.
5108
*/
5109
int ds_idx = subpass->depth_stencil_attachment->attachment;
5110
struct radv_image_view *ds_iview = cmd_buffer->state.attachments[ds_idx].iview;
5111
5112
VkExtent2D extent = {
5113
.width = ds_iview->image->info.width,
5114
.height = ds_iview->image->info.height,
5115
};
5116
5117
/* Copy the VRS rates to the HTILE buffer. */
5118
radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_iview->image);
5119
} else {
5120
/* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have
5121
* to copy the VRS rates to our internal HTILE buffer.
5122
*/
5123
struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
5124
struct radv_image *ds_image = radv_cmd_buffer_get_vrs_image(cmd_buffer);
5125
uint32_t htile_value;
5126
5127
if (ds_image) {
5128
htile_value = radv_get_htile_initial_value(cmd_buffer->device, ds_image);
5129
5130
VkExtent2D extent = {
5131
.width = MIN2(fb->width, ds_image->info.width),
5132
.height = MIN2(fb->height, ds_image->info.height),
5133
};
5134
5135
/* Clear the HTILE buffer before copying VRS rates because it's a read-modify-write
5136
* operation.
5137
*/
5138
VkImageSubresourceRange range = {
5139
.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT,
5140
.baseMipLevel = 0,
5141
.levelCount = 1,
5142
.baseArrayLayer = 0,
5143
.layerCount = 1,
5144
};
5145
5146
cmd_buffer->state.flush_bits |= radv_clear_htile(cmd_buffer, ds_image, &range, htile_value);
5147
5148
/* Copy the VRS rates to the HTILE buffer. */
5149
radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image);
5150
}
5151
}
5152
}
5153
5154
radv_describe_barrier_end(cmd_buffer);
5155
5156
radv_cmd_buffer_clear_subpass(cmd_buffer);
5157
5158
assert(cmd_buffer->cs->cdw <= cdw_max);
5159
}
5160
5161
static void
5162
radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer)
5163
{
5164
struct radv_cmd_state *state = &cmd_buffer->state;
5165
const struct radv_subpass *subpass = state->subpass;
5166
uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
5167
5168
radv_cmd_buffer_resolve_subpass(cmd_buffer);
5169
5170
radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);
5171
5172
for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
5173
const uint32_t a = subpass->attachments[i].attachment;
5174
if (a == VK_ATTACHMENT_UNUSED)
5175
continue;
5176
5177
if (state->pass->attachments[a].last_subpass_idx != subpass_id)
5178
continue;
5179
5180
VkImageLayout layout = state->pass->attachments[a].final_layout;
5181
VkImageLayout stencil_layout = state->pass->attachments[a].stencil_final_layout;
5182
struct radv_subpass_attachment att = {a, layout, stencil_layout};
5183
radv_handle_subpass_image_transition(cmd_buffer, att, false);
5184
}
5185
5186
radv_describe_barrier_end(cmd_buffer);
5187
}
5188
5189
void
5190
radv_cmd_buffer_begin_render_pass(struct radv_cmd_buffer *cmd_buffer,
5191
const VkRenderPassBeginInfo *pRenderPassBegin,
5192
const struct radv_extra_render_pass_begin_info *extra_info)
5193
{
5194
RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBegin->renderPass);
5195
RADV_FROM_HANDLE(radv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
5196
VkResult result;
5197
5198
cmd_buffer->state.framebuffer = framebuffer;
5199
cmd_buffer->state.pass = pass;
5200
cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
5201
5202
result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin, extra_info);
5203
if (result != VK_SUCCESS)
5204
return;
5205
5206
result = radv_cmd_state_setup_sample_locations(cmd_buffer, pass, pRenderPassBegin);
5207
if (result != VK_SUCCESS)
5208
return;
5209
}
5210
5211
void
5212
radv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
5213
const VkRenderPassBeginInfo *pRenderPassBeginInfo,
5214
const VkSubpassBeginInfo *pSubpassBeginInfo)
5215
{
5216
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5217
5218
radv_cmd_buffer_begin_render_pass(cmd_buffer, pRenderPassBeginInfo, NULL);
5219
5220
radv_cmd_buffer_begin_subpass(cmd_buffer, 0);
5221
}
5222
5223
void
5224
radv_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pSubpassBeginInfo,
5225
const VkSubpassEndInfo *pSubpassEndInfo)
5226
{
5227
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
5228
5229
uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
5230
radv_cmd_buffer_end_subpass(cmd_buffer);
5231
radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
5232
}
5233
5234
static void
5235
radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
5236
{
5237
struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
5238
for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
5239
if (!radv_get_shader(pipeline, stage))
5240
continue;
5241
5242
struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_VIEW_INDEX);
5243
if (loc->sgpr_idx == -1)
5244
continue;
5245
uint32_t base_reg = pipeline->user_data_0[stage];
5246
radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
5247
}
5248
if (radv_pipeline_has_gs_copy_shader(pipeline)) {
5249
struct radv_userdata_info *loc =
5250
&pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX];
5251
if (loc->sgpr_idx != -1) {
5252
uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
5253
radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
5254
}
5255
}
5256
}
5257
5258
static void
5259
radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_count,
5260
uint32_t use_opaque)
5261
{
5262
radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
5263
radeon_emit(cmd_buffer->cs, vertex_count);
5264
radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
5265
}
5266
5267
/**
* Emit a PKT3_DRAW_INDEX_2 packet to render "index_count" vertices.
*
* The starting address "index_va" may point anywhere within the index buffer. The number of
* indices allocated in the index buffer *past that point* is specified by "max_index_count".
* Hardware uses this information to return 0 for out-of-bounds reads.
*/
5274
static void
5275
radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t index_va,
5276
uint32_t max_index_count, uint32_t index_count, bool not_eop)
5277
{
5278
radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
5279
radeon_emit(cmd_buffer->cs, max_index_count);
5280
radeon_emit(cmd_buffer->cs, index_va);
5281
radeon_emit(cmd_buffer->cs, index_va >> 32);
5282
radeon_emit(cmd_buffer->cs, index_count);
5283
/* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
* can be changed between draws, and GS fast launch must be disabled.
* NOT_EOP doesn't work on GFX9 and older.
*/
5287
radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(not_eop));
5288
}
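/* Usage sketch (illustrative values, not from the original source): for an
* index buffer with 1000 16-bit indices starting at index_va_base, a draw
* with firstIndex == 100 and indexCount == 60 would pass
* index_va = index_va_base + 100 * 2, max_index_count = 1000 - 100 = 900 and
* index_count = 60; the 900 tells the hardware how many indices remain past
* index_va, so out-of-bounds fetches return 0 instead of reading garbage.
*/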
5289
5290
/* MUST inline this function to avoid massive perf loss in drawoverhead */
5291
ALWAYS_INLINE static void
5292
radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool indexed,
5293
uint32_t draw_count, uint64_t count_va, uint32_t stride)
5294
{
5295
struct radeon_cmdbuf *cs = cmd_buffer->cs;
5296
const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
5297
bool draw_id_enable = cmd_buffer->state.pipeline->graphics.uses_drawid;
5298
uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr;
5299
uint32_t vertex_offset_reg, start_instance_reg = 0, draw_id_reg = 0;
5300
bool predicating = cmd_buffer->state.predicating;
5301
assert(base_reg);
5302
5303
/* just reset draw state for vertex data */
5304
cmd_buffer->state.last_first_instance = -1;
5305
cmd_buffer->state.last_num_instances = -1;
5306
cmd_buffer->state.last_drawid = -1;
5307
cmd_buffer->state.last_vertex_offset = -1;
5308
5309
vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2;
5310
if (cmd_buffer->state.pipeline->graphics.uses_baseinstance)
5311
start_instance_reg = ((base_reg + (draw_id_enable ? 8 : 4)) - SI_SH_REG_OFFSET) >> 2;
5312
if (draw_id_enable)
5313
draw_id_reg = ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2;
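/* Descriptive note on the layout assumed above (added for clarity): the
* vertex-offset user SGPR lives at base_reg, the draw-id SGPR (when enabled)
* at base_reg + 4, and the base-instance SGPR right after it, so with
* draw_id_enable the order is vertex offset, draw id, start instance. The
* ">> 2" converts the byte offset from SI_SH_REG_OFFSET into the dword
* register index that the indirect draw packets expect.
*/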
5314
5315
if (draw_count == 1 && !count_va && !draw_id_enable) {
5316
radeon_emit(cs,
5317
PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, predicating));
5318
radeon_emit(cs, 0);
5319
radeon_emit(cs, vertex_offset_reg);
5320
radeon_emit(cs, start_instance_reg);
5321
radeon_emit(cs, di_src_sel);
5322
} else {
5323
radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8,
5324
predicating));
5325
radeon_emit(cs, 0);
5326
radeon_emit(cs, vertex_offset_reg);
5327
radeon_emit(cs, start_instance_reg);
5328
radeon_emit(cs, draw_id_reg | S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) |
5329
S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
5330
radeon_emit(cs, draw_count); /* count */
5331
radeon_emit(cs, count_va); /* count_addr */
5332
radeon_emit(cs, count_va >> 32);
5333
radeon_emit(cs, stride); /* stride */
5334
radeon_emit(cs, di_src_sel);
5335
5336
cmd_buffer->state.uses_draw_indirect_multi = true;
5337
}
5338
}
5339
5340
static inline void
5341
radv_emit_userdata_vertex_internal(struct radv_cmd_buffer *cmd_buffer,
5342
const struct radv_draw_info *info, const uint32_t vertex_offset)
5343
{
5344
struct radv_cmd_state *state = &cmd_buffer->state;
5345
struct radeon_cmdbuf *cs = cmd_buffer->cs;
5346
const bool uses_baseinstance = state->pipeline->graphics.uses_baseinstance;
5347
const bool uses_drawid = state->pipeline->graphics.uses_drawid;
5348
radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr,
5349
state->pipeline->graphics.vtx_emit_num);
5350
5351
radeon_emit(cs, vertex_offset);
5352
state->last_vertex_offset = vertex_offset;
5353
if (uses_drawid) {
5354
radeon_emit(cs, 0);
5355
state->last_drawid = 0;
5356
}
5357
if (uses_baseinstance) {
5358
radeon_emit(cs, info->first_instance);
5359
state->last_first_instance = info->first_instance;
5360
}
5361
}
5362
5363
ALWAYS_INLINE static void
5364
radv_emit_userdata_vertex(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
5365
const uint32_t vertex_offset)
5366
{
5367
const struct radv_cmd_state *state = &cmd_buffer->state;
5368
const bool uses_baseinstance = state->pipeline->graphics.uses_baseinstance;
5369
const bool uses_drawid = state->pipeline->graphics.uses_drawid;
5370
5371
/* this looks very dumb, but it allows the compiler to optimize better and yields
5372
* ~3-4% perf increase in drawoverhead
5373
*/
5374
if (vertex_offset != state->last_vertex_offset) {
5375
radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
5376
} else if (uses_drawid && 0 != state->last_drawid) {
5377
radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
5378
} else if (uses_baseinstance && info->first_instance != state->last_first_instance) {
5379
radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
5380
}
5381
}
5382
5383
ALWAYS_INLINE static void
5384
radv_emit_userdata_vertex_drawid(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_offset, uint32_t drawid)
5385
{
5386
struct radv_cmd_state *state = &cmd_buffer->state;
5387
struct radeon_cmdbuf *cs = cmd_buffer->cs;
5388
radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr, 1 + !!drawid);
5389
radeon_emit(cs, vertex_offset);
5390
state->last_vertex_offset = vertex_offset;
5391
if (drawid)
5392
radeon_emit(cs, drawid);
5393
5394
}
5395
5396
ALWAYS_INLINE static void
5397
radv_emit_draw_packets_indexed(struct radv_cmd_buffer *cmd_buffer,
5398
const struct radv_draw_info *info,
5399
uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *minfo,
5400
uint32_t stride,
5401
const int32_t *vertexOffset)
5402
5403
{
5404
struct radv_cmd_state *state = &cmd_buffer->state;
5405
struct radeon_cmdbuf *cs = cmd_buffer->cs;
5406
const int index_size = radv_get_vgt_index_size(state->index_type);
5407
unsigned i = 0;
5408
const bool uses_drawid = state->pipeline->graphics.uses_drawid;
5409
const bool can_eop = !uses_drawid && cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10;
5410
5411
if (uses_drawid) {
5412
if (vertexOffset) {
5413
radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
5414
vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
5415
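/* Note (added for clarity): MAX2 clamps the subtraction so that a
* firstIndex at or past max_index_count yields remaining_indexes == 0
* instead of underflowing the unsigned arithmetic.
*/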
const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
5416
5417
/* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
5418
if (!remaining_indexes &&
5419
cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
5420
continue;
5421
5422
if (i > 0)
5423
radeon_set_sh_reg(cs, state->pipeline->graphics.vtx_base_sgpr + sizeof(uint32_t), i);
5424
5425
const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
5426
5427
if (!state->subpass->view_mask) {
5428
radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
5429
} else {
5430
u_foreach_bit(view, state->subpass->view_mask) {
5431
radv_emit_view_index(cmd_buffer, view);
5432
5433
radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
5434
}
5435
}
5436
}
5437
} else {
5438
vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
5439
const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
5440
5441
/* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
5442
if (!remaining_indexes &&
5443
cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
5444
continue;
5445
5446
if (i > 0) {
5447
if (state->last_vertex_offset != draw->vertexOffset)
5448
radv_emit_userdata_vertex_drawid(cmd_buffer, draw->vertexOffset, i);
5449
else
5450
radeon_set_sh_reg(cs, state->pipeline->graphics.vtx_base_sgpr + sizeof(uint32_t), i);
5451
} else
5452
radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
5453
5454
const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
5455
5456
if (!state->subpass->view_mask) {
5457
radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
5458
} else {
5459
u_foreach_bit(view, state->subpass->view_mask) {
5460
radv_emit_view_index(cmd_buffer, view);
5461
5462
radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
5463
}
5464
}
5465
}
5466
}
5467
if (drawCount > 1) {
5468
state->last_drawid = drawCount - 1;
5469
}
5470
} else {
5471
if (vertexOffset) {
5472
if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX10) {
5473
/* GFX10 has a bug: when consecutive draw packets use NOT_EOP, the last draw
* (the one without NOT_EOP) must not have count == 0, so trim trailing
* zero-count draws.
*/
5476
while (drawCount > 1) {
5477
const VkMultiDrawIndexedInfoEXT *last = (const VkMultiDrawIndexedInfoEXT*)(((const uint8_t*)minfo) + (drawCount - 1) * stride);
5478
if (last->indexCount)
5479
break;
5480
drawCount--;
5481
}
5482
}
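/* Illustrative example of the trimming above (added note): with
* drawCount == 4, the first two entries having non-zero index counts and the
* last two having indexCount == 0, the loop reduces drawCount to 2, so the
* final emitted draw (which lacks NOT_EOP) has a non-zero index count.
*/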
5483
5484
radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
5485
vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
5486
const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
5487
5488
/* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
5489
if (!remaining_indexes &&
5490
cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
5491
continue;
5492
5493
const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
5494
5495
if (!state->subpass->view_mask) {
5496
radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && i < drawCount - 1);
5497
} else {
5498
u_foreach_bit(view, state->subpass->view_mask) {
5499
radv_emit_view_index(cmd_buffer, view);
5500
5501
radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
5502
}
5503
}
5504
}
5505
} else {
5506
vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
5507
const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
5508
5509
/* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
5510
if (!remaining_indexes &&
5511
cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
5512
continue;
5513
5514
const VkMultiDrawIndexedInfoEXT *next = (const VkMultiDrawIndexedInfoEXT*)(i < drawCount - 1 ? ((uint8_t*)draw + stride) : NULL);
5515
const bool offset_changes = next && next->vertexOffset != draw->vertexOffset;
5516
radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
5517
5518
const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
5519
5520
if (!state->subpass->view_mask) {
5521
radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && !offset_changes && i < drawCount - 1);
5522
} else {
5523
u_foreach_bit(view, state->subpass->view_mask) {
5524
radv_emit_view_index(cmd_buffer, view);
5525
5526
radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
5527
}
5528
}
5529
}
5530
}
5531
if (drawCount > 1) {
5532
state->last_drawid = drawCount - 1;
5533
}
5534
}
5535
}
5536
5537
ALWAYS_INLINE static void
5538
radv_emit_direct_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
5539
uint32_t drawCount, const VkMultiDrawInfoEXT *minfo,
5540
uint32_t use_opaque, uint32_t stride)
5541
{
5542
unsigned i = 0;
5543
const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
5544
const bool uses_drawid = cmd_buffer->state.pipeline->graphics.uses_drawid;
5545
uint32_t last_start = 0;
5546
5547
vk_foreach_multi_draw(draw, i, minfo, drawCount, stride) {
5548
if (!i)
5549
radv_emit_userdata_vertex(cmd_buffer, info, draw->firstVertex);
5550
else
5551
radv_emit_userdata_vertex_drawid(cmd_buffer, draw->firstVertex, uses_drawid ? i : 0);
5552
5553
if (!view_mask) {
5554
radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
5555
} else {
5556
u_foreach_bit(view, view_mask) {
5557
radv_emit_view_index(cmd_buffer, view);
5558
radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
5559
}
5560
}
5561
last_start = draw->firstVertex;
5562
}
5563
if (drawCount > 1) {
5564
struct radv_cmd_state *state = &cmd_buffer->state;
5565
state->last_vertex_offset = last_start;
5566
if (uses_drawid)
5567
state->last_drawid = drawCount - 1;
5568
}
5569
}
5570
5571
static void
5572
radv_emit_indirect_draw_packets(struct radv_cmd_buffer *cmd_buffer,
5573
const struct radv_draw_info *info)
5574
{
5575
const struct radv_cmd_state *state = &cmd_buffer->state;
5576
struct radeon_winsys *ws = cmd_buffer->device->ws;
5577
struct radeon_cmdbuf *cs = cmd_buffer->cs;
5578
const uint64_t va =
5579
radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
5580
const uint64_t count_va = info->count_buffer
5581
? radv_buffer_get_va(info->count_buffer->bo) +
5582
info->count_buffer->offset + info->count_buffer_offset
5583
: 0;
5584
5585
radv_cs_add_buffer(ws, cs, info->indirect->bo);
5586
5587
radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
5588
radeon_emit(cs, 1);
5589
radeon_emit(cs, va);
5590
radeon_emit(cs, va >> 32);
5591
5592
if (info->count_buffer) {
5593
radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
5594
}
5595
5596
if (!state->subpass->view_mask) {
5597
radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va,
5598
info->stride);
5599
} else {
5600
u_foreach_bit(i, state->subpass->view_mask)
5601
{
5602
radv_emit_view_index(cmd_buffer, i);
5603
5604
radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va,
5605
info->stride);
5606
}
5607
}
5608
}
5609
5610
/*
* Vega and Raven have a bug that triggers if there are multiple context
* register contexts active at the same time with different scissor values.
*
* There are two possible workarounds:
* 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
* there is only ever 1 active set of scissor values at the same time.
*
* 2) Whenever the hardware switches contexts we have to set the scissor
* registers again even if it is a no-op. That way the new context gets
* the correct scissor values.
*
* This implements option 2. radv_need_late_scissor_emission needs to
* return true on affected HW if radv_emit_all_graphics_states sets
* any context registers.
*/
5626
static bool
5627
radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
5628
const struct radv_draw_info *info)
5629
{
5630
struct radv_cmd_state *state = &cmd_buffer->state;
5631
5632
if (!cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
5633
return false;
5634
5635
if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
5636
return true;
5637
5638
uint64_t used_states =
5639
cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL;
5640
5641
/* Index, vertex and streamout buffers don't change context regs, and
5642
* pipeline is already handled.
5643
*/
5644
used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER |
5645
RADV_CMD_DIRTY_STREAMOUT_BUFFER | RADV_CMD_DIRTY_PIPELINE);
5646
5647
if (cmd_buffer->state.dirty & used_states)
5648
return true;
5649
5650
uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
5651
5652
if (info->indexed && state->dynamic.primitive_restart_enable &&
5653
primitive_reset_index != state->last_primitive_reset_index)
5654
return true;
5655
5656
return false;
5657
}
5658
5659
enum {
5660
ngg_cull_none = 0,
5661
ngg_cull_front_face = 1,
5662
ngg_cull_back_face = 2,
5663
ngg_cull_face_is_ccw = 4,
5664
ngg_cull_small_primitives = 8,
5665
};
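/* Example of a combined value (illustrative only): back-face culling with a
* counter-clockwise front face and small-primitive culling enabled would be
* encoded as ngg_cull_back_face | ngg_cull_face_is_ccw |
* ngg_cull_small_primitives == 0xe; radv_get_ngg_culling_settings() below
* additionally packs the small-primitive precision exponent into the top
* byte.
*/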
5666
5667
ALWAYS_INLINE static bool
5668
radv_skip_ngg_culling(bool has_tess, const unsigned vtx_cnt,
5669
bool indirect)
5670
{
5671
/* If we have to draw only a few vertices, we get better latency if
* we disable NGG culling.
*
* When tessellation is used, what matters is the number of tessellated
* vertices, so let's always assume it's not a small draw.
*/
5677
return !has_tess && !indirect && vtx_cnt < 512;
5678
}
5679
5680
ALWAYS_INLINE static uint32_t
5681
radv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted)
5682
{
5683
const struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
5684
const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
5685
5686
/* Cull every triangle when rasterizer discard is enabled. */
5687
if (d->rasterizer_discard_enable ||
5688
G_028810_DX_RASTERIZATION_KILL(cmd_buffer->state.pipeline->graphics.pa_cl_clip_cntl))
5689
return ngg_cull_front_face | ngg_cull_back_face;
5690
5691
uint32_t pa_su_sc_mode_cntl = cmd_buffer->state.pipeline->graphics.pa_su_sc_mode_cntl;
5692
uint32_t nggc_settings = ngg_cull_none;
5693
5694
/* The culling code needs to know whether face is CW or CCW. */
5695
bool ccw = (pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_FRONT_FACE)
5696
? d->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE
5697
: G_028814_FACE(pa_su_sc_mode_cntl) == 0;
5698
5699
/* Take inverted viewport into account. */
5700
ccw ^= vp_y_inverted;
5701
5702
if (ccw)
5703
nggc_settings |= ngg_cull_face_is_ccw;
5704
5705
/* Face culling settings. */
5706
if ((pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_CULL_MODE)
5707
? (d->cull_mode & VK_CULL_MODE_FRONT_BIT)
5708
: G_028814_CULL_FRONT(pa_su_sc_mode_cntl))
5709
nggc_settings |= ngg_cull_front_face;
5710
if ((pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_CULL_MODE)
5711
? (d->cull_mode & VK_CULL_MODE_BACK_BIT)
5712
: G_028814_CULL_BACK(pa_su_sc_mode_cntl))
5713
nggc_settings |= ngg_cull_back_face;
5714
5715
/* Small primitive culling is only valid when conservative overestimation is not used. */
5716
if (!pipeline->graphics.uses_conservative_overestimate) {
5717
nggc_settings |= ngg_cull_small_primitives;
5718
5719
/* small_prim_precision = num_samples / 2^subpixel_bits
5720
* num_samples is also always a power of two, so the small prim precision can only be
5721
* a power of two between 2^-2 and 2^-6, therefore it's enough to remember the exponent.
5722
*/
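/* Worked example (illustrative values, added for clarity): with 4x MSAA and
* the 1/256 subpixel precision used below, small_prim_precision = 4 / 256 =
* 2^-6, so small_prim_precision_log2 = log2(4) - log2(256) = 2 - 8 = -6, and
* only that exponent is packed into bits 24-31 of the SGPR value.
*/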
5723
unsigned subpixel_bits = 256;
5724
int32_t small_prim_precision_log2 = util_logbase2(pipeline->graphics.ms.num_samples) - util_logbase2(subpixel_bits);
5725
nggc_settings |= ((uint32_t) small_prim_precision_log2 << 24u);
5726
}
5727
5728
return nggc_settings;
5729
}
5730
5731
static void
5732
radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
5733
{
5734
struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
5735
const unsigned stage = pipeline->graphics.last_vgt_api_stage;
5736
const bool nggc_supported = pipeline->graphics.has_ngg_culling;
5737
5738
if (!nggc_supported && !cmd_buffer->state.last_nggc_settings) {
5739
/* Current shader doesn't support culling and culling was already disabled:
5740
* No further steps needed, just remember the SGPR's location is not set.
5741
*/
5742
cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
5743
return;
5744
}
5745
5746
/* Check dirty flags:
5747
* - Dirty pipeline: SGPR index may have changed (we have to re-emit if changed).
5748
* - Dirty dynamic flags: culling settings may have changed.
5749
*/
5750
const bool dirty =
5751
cmd_buffer->state.dirty &
5752
(RADV_CMD_DIRTY_PIPELINE |
5753
RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
5754
RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT);
5755
5756
/* Check small draw status:
5757
* For small draw calls, we disable culling by setting the SGPR to 0.
5758
*/
5759
const bool skip =
5760
radv_skip_ngg_culling(stage == MESA_SHADER_TESS_EVAL, draw_info->count, draw_info->indirect);
5761
5762
/* See if anything changed. */
5763
if (!dirty && skip == cmd_buffer->state.last_nggc_skip)
5764
return;
5765
5766
/* Remember small draw state. */
5767
cmd_buffer->state.last_nggc_skip = skip;
5768
const struct radv_shader_variant *v = pipeline->shaders[stage];
5769
assert(v->info.has_ngg_culling == nggc_supported);
5770
5771
/* Find the user SGPR. */
5772
const uint32_t base_reg = pipeline->user_data_0[stage];
5773
const int8_t nggc_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_CULLING_SETTINGS].sgpr_idx;
5774
assert(!nggc_supported || nggc_sgpr_idx != -1);
5775
5776
/* Get viewport transform. */
5777
float vp_scale[3], vp_translate[3];
5778
radv_get_viewport_xform(&cmd_buffer->state.dynamic.viewport.viewports[0], vp_scale, vp_translate);
5779
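/* Note (added for clarity): the comparison below is equivalent to checking
* vp_scale[1] < 0, i.e. whether the viewport transform flips Y (as with a
* negative-height viewport).
*/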
bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]);
5780
5781
/* Get current culling settings. */
5782
uint32_t nggc_settings = nggc_supported && !skip
5783
? radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted)
5784
: ngg_cull_none;
5785
5786
bool emit_viewport = nggc_settings &&
5787
(cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_VIEWPORT ||
5788
cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx ||
5789
!cmd_buffer->state.last_nggc_settings);
5790
5791
if (emit_viewport) {
5792
/* Correction for inverted Y */
5793
if (vp_y_inverted) {
5794
vp_scale[1] = -vp_scale[1];
5795
vp_translate[1] = -vp_translate[1];
5796
}
5797
5798
/* Correction for number of samples per pixel. */
5799
for (unsigned i = 0; i < 2; ++i) {
5800
vp_scale[i] *= (float) pipeline->graphics.ms.num_samples;
5801
vp_translate[i] *= (float) pipeline->graphics.ms.num_samples;
5802
}
5803
5804
uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])};
5805
const int8_t vp_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_VIEWPORT].sgpr_idx;
5806
assert(vp_sgpr_idx != -1);
5807
radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + vp_sgpr_idx * 4, 4);
5808
radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4);
5809
}
5810
5811
bool emit_settings = nggc_supported &&
5812
(cmd_buffer->state.last_nggc_settings != nggc_settings ||
5813
cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx);
5814
5815
/* This needs to be emitted when culling is turned on
5816
* and when it's already on but some settings change.
5817
*/
5818
if (emit_settings) {
5819
assert(nggc_sgpr_idx >= 0);
5820
radeon_set_sh_reg(cmd_buffer->cs, base_reg + nggc_sgpr_idx * 4, nggc_settings);
5821
}
5822
5823
/* These only need to be emitted when culling is turned on or off,
5824
* but not when it stays on and just some settings change.
5825
*/
5826
if (!!cmd_buffer->state.last_nggc_settings != !!nggc_settings) {
5827
const struct radv_physical_device *physical_device = cmd_buffer->device->physical_device;
5828
uint32_t rsrc2 = v->config.rsrc2;
5829
uint32_t oversub_pc_lines = physical_device->rad_info.pc_lines / 4;
5830
5831
if (nggc_settings) {
5832
/* Tweak the parameter cache oversubscription.
5833
* This allows the HW to launch more NGG workgroups than the pre-allocated parameter
5834
* cache would normally allow, yielding better perf when culling is on.
5835
*/
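/* For instance (illustrative numbers, not from the original source): if
* rad_info.pc_lines were 1024, the default computed above (pc_lines / 4)
* would allow 256 lines, while enabling culling raises the limit to
* pc_lines * 3 / 4 = 768.
*/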
5836
oversub_pc_lines = physical_device->rad_info.pc_lines * 3 / 4;
5837
} else {
5838
/* Allocate less LDS when culling is disabled. (But GS always needs it.) */
5839
if (stage != MESA_SHADER_GEOMETRY)
5840
rsrc2 = (rsrc2 & C_00B22C_LDS_SIZE) | S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling);
5841
}
5842
5843
/* When the pipeline is dirty and not yet emitted, don't write it here
5844
* because radv_emit_graphics_pipeline will overwrite this register.
5845
*/
5846
if (!(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) ||
5847
cmd_buffer->state.emitted_pipeline == pipeline) {
5848
radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
5849
}
5850
5851
/* Update parameter cache oversubscription setting. */
5852
radeon_set_uconfig_reg(cmd_buffer->cs, R_030980_GE_PC_ALLOC,
5853
S_030980_OVERSUB_EN(physical_device->rad_info.use_late_alloc) |
5854
S_030980_NUM_PC_LINES(oversub_pc_lines - 1));
5855
}
5856
5857
cmd_buffer->state.last_nggc_settings = nggc_settings;
5858
cmd_buffer->state.last_nggc_settings_sgpr_idx = nggc_sgpr_idx;
5859
}
5860
5861
static void
5862
radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
5863
{
5864
bool late_scissor_emission;
5865
5866
if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
5867
cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
5868
radv_emit_rbplus_state(cmd_buffer);
5869
5870
if ((cmd_buffer->device->instance->perftest_flags & RADV_PERFTEST_NGGC) &&
5871
cmd_buffer->state.pipeline->graphics.is_ngg)
5872
radv_emit_ngg_culling_state(cmd_buffer, info);
5873
5874
if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
5875
radv_emit_graphics_pipeline(cmd_buffer);
5876
5877
/* This should be before the cmd_buffer->state.dirty is cleared
5878
* (excluding RADV_CMD_DIRTY_PIPELINE) and after
5879
* cmd_buffer->state.context_roll_without_scissor_emitted is set. */
5880
late_scissor_emission = radv_need_late_scissor_emission(cmd_buffer, info);
5881
5882
if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
5883
radv_emit_framebuffer_state(cmd_buffer);
5884
5885
if (info->indexed) {
5886
if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
5887
radv_emit_index_buffer(cmd_buffer, info->indirect);
5888
} else {
5889
/* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
5890
* so the state must be re-emitted before the next indexed
5891
* draw.
5892
*/
5893
if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
5894
cmd_buffer->state.last_index_type = -1;
5895
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
5896
}
5897
}
5898
5899
radv_cmd_buffer_flush_dynamic_state(cmd_buffer);
5900
5901
radv_emit_draw_registers(cmd_buffer, info);
5902
5903
if (late_scissor_emission)
5904
radv_emit_scissor(cmd_buffer);
5905
}
5906
5907
/* MUST inline this function to avoid massive perf loss in drawoverhead */
5908
ALWAYS_INLINE static bool
5909
radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount)
5910
{
5911
const bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
5912
const bool pipeline_is_dirty = (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
5913
cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline;
5914
5915
ASSERTED const unsigned cdw_max =
5916
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1));
5917
5918
if (likely(!info->indirect)) {
5919
/* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
5920
* no workaround for indirect draws, but we can at least skip
5921
* direct draws.
5922
*/
5923
if (unlikely(!info->instance_count))
5924
return false;
5925
5926
/* Handle count == 0. */
5927
if (unlikely(!info->count && !info->strmout_buffer))
5928
return false;
5929
}
5930
5931
/* Need to apply this workaround early as it can set flush flags. */
5932
if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
5933
radv_emit_fb_mip_change_flush(cmd_buffer);
5934
5935
/* Use optimal packet order based on whether we need to sync the
5936
* pipeline.
5937
*/
5938
if (cmd_buffer->state.flush_bits &
5939
(RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
5940
RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
5941
/* If we have to wait for idle, set all states first, so that
* all SET packets are processed in parallel with previous draw
* calls. Then upload descriptors, set shader pointers, and
* draw, and prefetch at the end. This ensures that the time
* the CUs are idle is very short. (there are only SET_SH
* packets between the wait and the draw)
*/
5948
radv_emit_all_graphics_states(cmd_buffer, info);
5949
si_emit_cache_flush(cmd_buffer);
5950
/* <-- CUs are idle here --> */
5951
5952
radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
5953
} else {
5954
/* If we don't wait for idle, start prefetches first, then set
5955
* states, and draw at the end.
5956
*/
5957
si_emit_cache_flush(cmd_buffer);
5958
5959
if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
5960
/* Only prefetch the vertex shader and VBO descriptors
5961
* in order to start the draw as soon as possible.
5962
*/
5963
radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.pipeline, true);
5964
}
5965
5966
radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
5967
5968
radv_emit_all_graphics_states(cmd_buffer, info);
5969
}
5970
5971
radv_describe_draw(cmd_buffer);
5972
if (likely(!info->indirect)) {
5973
struct radv_cmd_state *state = &cmd_buffer->state;
5974
struct radeon_cmdbuf *cs = cmd_buffer->cs;
5975
assert(state->pipeline->graphics.vtx_base_sgpr);
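      /* NUM_INSTANCES is only re-emitted when the instance count changes;
       * state->last_num_instances caches the value from the previous direct draw. */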
      if (state->last_num_instances != info->instance_count) {
         radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
         radeon_emit(cs, info->instance_count);
         state->last_num_instances = info->instance_count;
      }
   }
   assert(cmd_buffer->cs->cdw <= cdw_max);

   return true;
}

static void
radv_after_draw(struct radv_cmd_buffer *cmd_buffer)
{
   const struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
   bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
   /* Start prefetches after the draw has been started. Both will
    * run in parallel, but starting the draw first is more
    * important.
    */
   if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
      radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.pipeline, false);
   }

   /* Workaround for a VGT hang when streamout is enabled.
    * It must be done after drawing.
    */
   if (cmd_buffer->state.streamout.streamout_enabled &&
       (rad_info->family == CHIP_HAWAII || rad_info->family == CHIP_TONGA ||
        rad_info->family == CHIP_FIJI)) {
      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
   }

   radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH);
}

void
radv_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount,
             uint32_t firstVertex, uint32_t firstInstance)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_draw_info info;

   info.count = vertexCount;
   info.instance_count = instanceCount;
   info.first_instance = firstInstance;
   info.strmout_buffer = NULL;
   info.indirect = NULL;
   info.indexed = false;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;
   const VkMultiDrawInfoEXT minfo = { firstVertex, vertexCount };
   radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, 0, 0);
   radv_after_draw(cmd_buffer);
}

void
radv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawInfoEXT *pVertexInfo,
                     uint32_t instanceCount, uint32_t firstInstance, uint32_t stride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_draw_info info;

   if (!drawCount)
      return;

   info.count = pVertexInfo->vertexCount;
   info.instance_count = instanceCount;
   info.first_instance = firstInstance;
   info.strmout_buffer = NULL;
   info.indirect = NULL;
   info.indexed = false;

   if (!radv_before_draw(cmd_buffer, &info, drawCount))
      return;
   radv_emit_direct_draw_packets(cmd_buffer, &info, drawCount, pVertexInfo, 0, stride);
   radv_after_draw(cmd_buffer);
}

void
radv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount,
                    uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_draw_info info;

   info.indexed = true;
   info.count = indexCount;
   info.instance_count = instanceCount;
   info.first_instance = firstInstance;
   info.strmout_buffer = NULL;
   info.indirect = NULL;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;
   const VkMultiDrawIndexedInfoEXT minfo = { firstIndex, indexCount, vertexOffset };
   radv_emit_draw_packets_indexed(cmd_buffer, &info, 1, &minfo, 0, NULL);
   radv_after_draw(cmd_buffer);
}

void radv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *pIndexInfo,
                                 uint32_t instanceCount, uint32_t firstInstance, uint32_t stride, const int32_t *pVertexOffset)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_draw_info info;

   if (!drawCount)
      return;

   const VkMultiDrawIndexedInfoEXT *minfo = pIndexInfo;
   info.indexed = true;
   info.count = minfo->indexCount;
   info.instance_count = instanceCount;
   info.first_instance = firstInstance;
   info.strmout_buffer = NULL;
   info.indirect = NULL;

   if (!radv_before_draw(cmd_buffer, &info, drawCount))
      return;
   radv_emit_draw_packets_indexed(cmd_buffer, &info, drawCount, pIndexInfo, stride, pVertexOffset);
   radv_after_draw(cmd_buffer);
}

void
radv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
                     uint32_t drawCount, uint32_t stride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
   struct radv_draw_info info;

   info.count = drawCount;
   info.indirect = buffer;
   info.indirect_offset = offset;
   info.stride = stride;
   info.strmout_buffer = NULL;
   info.count_buffer = NULL;
   info.indexed = false;
   info.instance_count = 0;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;
   radv_emit_indirect_draw_packets(cmd_buffer, &info);
   radv_after_draw(cmd_buffer);
}

void
radv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
                            uint32_t drawCount, uint32_t stride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
   struct radv_draw_info info;

   info.indexed = true;
   info.count = drawCount;
   info.indirect = buffer;
   info.indirect_offset = offset;
   info.stride = stride;
   info.count_buffer = NULL;
   info.strmout_buffer = NULL;
   info.instance_count = 0;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;
   radv_emit_indirect_draw_packets(cmd_buffer, &info);
   radv_after_draw(cmd_buffer);
}

void
radv_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
                          VkBuffer _countBuffer, VkDeviceSize countBufferOffset,
                          uint32_t maxDrawCount, uint32_t stride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
   RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
   struct radv_draw_info info;

   info.count = maxDrawCount;
   info.indirect = buffer;
   info.indirect_offset = offset;
   info.count_buffer = count_buffer;
   info.count_buffer_offset = countBufferOffset;
   info.stride = stride;
   info.strmout_buffer = NULL;
   info.indexed = false;
   info.instance_count = 0;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;
   radv_emit_indirect_draw_packets(cmd_buffer, &info);
   radv_after_draw(cmd_buffer);
}

void
radv_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
                                 VkDeviceSize offset, VkBuffer _countBuffer,
                                 VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
                                 uint32_t stride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
   RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
   struct radv_draw_info info;

   info.indexed = true;
   info.count = maxDrawCount;
   info.indirect = buffer;
   info.indirect_offset = offset;
   info.count_buffer = count_buffer;
   info.count_buffer_offset = countBufferOffset;
   info.stride = stride;
   info.strmout_buffer = NULL;
   info.instance_count = 0;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;
   radv_emit_indirect_draw_packets(cmd_buffer, &info);
   radv_after_draw(cmd_buffer);
}

struct radv_dispatch_info {
   /**
    * Determine the layout of the grid (in block units) to be used.
    */
   uint32_t blocks[3];

   /**
    * A starting offset for the grid. If unaligned is set, the offset
    * must still be aligned.
    */
   uint32_t offsets[3];
   /**
    * Whether it's an unaligned compute dispatch.
    */
   bool unaligned;

   /**
    * Indirect compute parameters resource.
    */
   struct radv_buffer *indirect;
   uint64_t indirect_offset;
};

static void
radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
                           const struct radv_dispatch_info *info)
{
   struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
   unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
   struct radeon_winsys *ws = cmd_buffer->device->ws;
   bool predicating = cmd_buffer->state.predicating;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_userdata_info *loc;

   radv_describe_dispatch(cmd_buffer, info->blocks[0], info->blocks[1], info->blocks[2]);

   loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);

   ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 25);

   if (compute_shader->info.wave_size == 32) {
      assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
      dispatch_initiator |= S_00B800_CS_W32_EN(1);
   }

   if (info->indirect) {
      uint64_t va = radv_buffer_get_va(info->indirect->bo);

      va += info->indirect->offset + info->indirect_offset;

      radv_cs_add_buffer(ws, cs, info->indirect->bo);

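      /* When the grid size is taken from an indirect buffer, copy the three
       * dispatch dimensions from memory into the AC_UD_CS_GRID_SIZE user SGPRs
       * with COPY_DATA so the compute shader still sees the grid size. */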
      if (loc->sgpr_idx != -1) {
         for (unsigned i = 0; i < 3; ++i) {
            radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
            radeon_emit(cs,
                        COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG));
            radeon_emit(cs, (va + 4 * i));
            radeon_emit(cs, (va + 4 * i) >> 32);
            radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4) >> 2) + i);
            radeon_emit(cs, 0);
         }
      }

      if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
         radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, predicating) | PKT3_SHADER_TYPE_S(1));
         radeon_emit(cs, va);
         radeon_emit(cs, va >> 32);
         radeon_emit(cs, dispatch_initiator);
      } else {
         radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));
         radeon_emit(cs, 1);
         radeon_emit(cs, va);
         radeon_emit(cs, va >> 32);

         radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) | PKT3_SHADER_TYPE_S(1));
         radeon_emit(cs, 0);
         radeon_emit(cs, dispatch_initiator);
      }
   } else {
      unsigned blocks[3] = {info->blocks[0], info->blocks[1], info->blocks[2]};
      unsigned offsets[3] = {info->offsets[0], info->offsets[1], info->offsets[2]};

      if (info->unaligned) {
         unsigned *cs_block_size = compute_shader->info.cs.block_size;
         unsigned remainder[3];

         /* If aligned, these should be an entire block size,
          * not 0.
          */
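         /* Worked example (editor's illustration): with blocks[0] == 130
          * threads and cs_block_size[0] == 64, align_u32_npot() rounds up to
          * 192, so remainder[0] = 130 + 64 - 192 = 2 threads in the partial
          * group, and round_up_u32() below yields 3 workgroups. For an
          * already-aligned count (e.g. 128) the remainder comes out as the
          * full block size of 64, as the comment above notes. */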
         remainder[0] = blocks[0] + cs_block_size[0] - align_u32_npot(blocks[0], cs_block_size[0]);
         remainder[1] = blocks[1] + cs_block_size[1] - align_u32_npot(blocks[1], cs_block_size[1]);
         remainder[2] = blocks[2] + cs_block_size[2] - align_u32_npot(blocks[2], cs_block_size[2]);

         blocks[0] = round_up_u32(blocks[0], cs_block_size[0]);
         blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
         blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);

         for (unsigned i = 0; i < 3; ++i) {
            assert(offsets[i] % cs_block_size[i] == 0);
            offsets[i] /= cs_block_size[i];
         }

         radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
                            S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
                            S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
                            S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));

         dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
      }

      if (loc->sgpr_idx != -1) {
         assert(loc->num_sgprs == 3);

         radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
         radeon_emit(cs, blocks[0]);
         radeon_emit(cs, blocks[1]);
         radeon_emit(cs, blocks[2]);
      }

      if (offsets[0] || offsets[1] || offsets[2]) {
         radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
         radeon_emit(cs, offsets[0]);
         radeon_emit(cs, offsets[1]);
         radeon_emit(cs, offsets[2]);

         /* The blocks in the packet are not counts but end values. */
         for (unsigned i = 0; i < 3; ++i)
            blocks[i] += offsets[i];
      } else {
         dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
      }

      radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) | PKT3_SHADER_TYPE_S(1));
      radeon_emit(cs, blocks[0]);
      radeon_emit(cs, blocks[1]);
      radeon_emit(cs, blocks[2]);
      radeon_emit(cs, dispatch_initiator);
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

static void
radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer,
                                       struct radv_pipeline *pipeline,
                                       VkPipelineBindPoint bind_point)
{
   radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT, pipeline, bind_point);
   radv_flush_constants(cmd_buffer,
                        bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
                           ? RADV_RT_STAGE_BITS
                           : VK_SHADER_STAGE_COMPUTE_BIT,
                        pipeline, bind_point);
}

static void
radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info,
              struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
{
   bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
   bool pipeline_is_dirty = pipeline && pipeline != cmd_buffer->state.emitted_compute_pipeline;
   bool cs_regalloc_hang = cmd_buffer->device->physical_device->rad_info.has_cs_regalloc_hang_bug &&
                           info->blocks[0] * info->blocks[1] * info->blocks[2] > 256;

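   /* On chips with the CS register-allocation hang bug, bracket large
    * dispatches (more than 256 workgroups here) with partial flushes: PS+CS
    * before the dispatch and, further down, another CS flush after it. */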
   if (cs_regalloc_hang)
      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
                                      RADV_CMD_FLAG_CS_PARTIAL_FLUSH;

   if (cmd_buffer->state.flush_bits &
       (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
        RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
      /* If we have to wait for idle, set all states first, so that
       * all SET packets are processed in parallel with previous draw
       * calls. Then upload descriptors, set shader pointers, and
       * dispatch, and prefetch at the end. This ensures that the
       * time the CUs are idle is very short. (there are only SET_SH
       * packets between the wait and the draw)
       */
      radv_emit_compute_pipeline(cmd_buffer, pipeline);
      si_emit_cache_flush(cmd_buffer);
      /* <-- CUs are idle here --> */

      radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point);

      radv_emit_dispatch_packets(cmd_buffer, pipeline, info);
      /* <-- CUs are busy here --> */

      /* Start prefetches after the dispatch has been started. Both
       * will run in parallel, but starting the dispatch first is
       * more important.
       */
      if (has_prefetch && pipeline_is_dirty) {
         radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_COMPUTE]);
      }
   } else {
      /* If we don't wait for idle, start prefetches first, then set
       * states, and dispatch at the end.
       */
      si_emit_cache_flush(cmd_buffer);

      if (has_prefetch && pipeline_is_dirty) {
         radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_COMPUTE]);
      }

      radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point);

      radv_emit_compute_pipeline(cmd_buffer, pipeline);
      radv_emit_dispatch_packets(cmd_buffer, pipeline, info);
   }

   if (pipeline_is_dirty) {
      /* Raytracing uses compute shaders but has separate bind points and pipelines.
       * So if we set compute userdata & shader registers we should dirty the raytracing
       * ones and the other way around.
       *
       * We only need to do this when the pipeline is dirty because when we switch between
       * the two we always need to switch pipelines.
       */
      radv_mark_descriptor_sets_dirty(cmd_buffer, bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
                                                     ? VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
                                                     : VK_PIPELINE_BIND_POINT_COMPUTE);
   }

   if (cs_regalloc_hang)
      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;

   radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH);
}

static void
radv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info)
{
   radv_dispatch(cmd_buffer, info, cmd_buffer->state.compute_pipeline,
                 VK_PIPELINE_BIND_POINT_COMPUTE);
}

void
radv_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t base_x, uint32_t base_y,
                     uint32_t base_z, uint32_t x, uint32_t y, uint32_t z)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_dispatch_info info = {0};

   info.blocks[0] = x;
   info.blocks[1] = y;
   info.blocks[2] = z;

   info.offsets[0] = base_x;
   info.offsets[1] = base_y;
   info.offsets[2] = base_z;
   radv_compute_dispatch(cmd_buffer, &info);
}

void
radv_CmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z)
{
   radv_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
}

void
radv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
   struct radv_dispatch_info info = {0};

   info.indirect = buffer;
   info.indirect_offset = offset;

   radv_compute_dispatch(cmd_buffer, &info);
}

void
radv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
{
   struct radv_dispatch_info info = {0};

   info.blocks[0] = x;
   info.blocks[1] = y;
   info.blocks[2] = z;
   info.unaligned = 1;

   radv_compute_dispatch(cmd_buffer, &info);
}

static void
radv_rt_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info)
{
   radv_dispatch(cmd_buffer, info, cmd_buffer->state.rt_pipeline,
                 VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
}

static bool
radv_rt_bind_tables(struct radv_cmd_buffer *cmd_buffer,
                    const VkStridedDeviceAddressRegionKHR *tables)
{
   struct radv_pipeline *pipeline = cmd_buffer->state.rt_pipeline;
   uint32_t base_reg;
   void *ptr;
   uint32_t *desc_ptr;
   uint32_t offset;

   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, 64, &offset, &ptr))
      return false;

   /* For the descriptor format. */
   assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);

   desc_ptr = ptr;
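   /* Build one 16-byte buffer descriptor per SBT region; the caller
    * (radv_CmdTraceRaysKHR below) passes them in raygen/miss/hit/callable
    * order. Each descriptor stores the region's device address and stride. */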
   for (unsigned i = 0; i < 4; ++i, desc_ptr += 4) {
      uint32_t rsrc_word3 =
         S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
         S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT) |
         S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED) | S_008F0C_RESOURCE_LEVEL(1);

      desc_ptr[0] = tables[i].deviceAddress;
      desc_ptr[1] = S_008F04_BASE_ADDRESS_HI(tables[i].deviceAddress >> 32) |
                    S_008F04_STRIDE(tables[i].stride);
      desc_ptr[2] = 0xffffffffu;
      desc_ptr[3] = rsrc_word3;
   }

   uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
   struct radv_userdata_info *loc =
      radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_SBT_DESCRIPTORS);
   if (loc->sgpr_idx == -1)
      return true;

   base_reg = pipeline->user_data_0[MESA_SHADER_COMPUTE];
   radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
                            false);
   return true;
}

void
radv_CmdTraceRaysKHR(VkCommandBuffer commandBuffer,
                     const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
                     const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
                     const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
                     const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
                     uint32_t width, uint32_t height, uint32_t depth)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_dispatch_info info = {0};

   info.blocks[0] = width;
   info.blocks[1] = height;
   info.blocks[2] = depth;
   info.unaligned = 1;

   const VkStridedDeviceAddressRegionKHR tables[] = {
      *pRaygenShaderBindingTable,
      *pMissShaderBindingTable,
      *pHitShaderBindingTable,
      *pCallableShaderBindingTable,
   };

   if (!radv_rt_bind_tables(cmd_buffer, tables)) {
      return;
   }

   radv_rt_dispatch(cmd_buffer, &info);
}

void
radv_cmd_buffer_end_render_pass(struct radv_cmd_buffer *cmd_buffer)
{
   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);

   cmd_buffer->state.pass = NULL;
   cmd_buffer->state.subpass = NULL;
   cmd_buffer->state.attachments = NULL;
   cmd_buffer->state.framebuffer = NULL;
   cmd_buffer->state.subpass_sample_locs = NULL;
}

void
radv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pSubpassEndInfo)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   radv_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);

   radv_cmd_buffer_end_subpass(cmd_buffer);

   radv_cmd_buffer_end_render_pass(cmd_buffer);
}

/*
 * For HTILE we have the following interesting clear words:
 *   0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
 *   0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
 *   0xfffffff0: Clear depth to 1.0
 *   0x00000000: Clear depth to 0.0
 */
static void
radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                      const VkImageSubresourceRange *range)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t htile_value = radv_get_htile_initial_value(cmd_buffer->device, image);
   VkClearDepthStencilValue value = {0};
   struct radv_barrier_data barrier = {0};

   barrier.layout_transitions.init_mask_ram = 1;
   radv_describe_layout_transition(cmd_buffer, &barrier);

   /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is
    * consistent in considering previous rendering work for WAW hazards. */
   state->flush_bits |=
      radv_src_access_flush(cmd_buffer, VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, image);

   if (image->planes[0].surface.has_stencil &&
       !(range->aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
      /* Flush caches before performing a separate aspect initialization because it's a
       * read-modify-write operation.
       */
      state->flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_SHADER_READ_BIT, image);
   }

   state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value);

   radv_set_ds_clear_metadata(cmd_buffer, image, range, value, range->aspectMask);

   if (radv_image_is_tc_compat_htile(image) && (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)) {
      /* Initialize the TC-compat metadata value to 0 because by
       * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
       * have to conditionally update its value when performing
       * a fast depth clear.
       */
      radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0);
   }
}

static void
radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                                   VkImageLayout src_layout, bool src_render_loop,
                                   VkImageLayout dst_layout, bool dst_render_loop,
                                   unsigned src_queue_mask, unsigned dst_queue_mask,
                                   const VkImageSubresourceRange *range,
                                   struct radv_sample_locations_state *sample_locs)
{
   struct radv_device *device = cmd_buffer->device;

   if (!radv_htile_enabled(image, range->baseMipLevel))
      return;

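   /* Three cases are handled below: initialize HTILE when coming from
    * UNDEFINED or when entering an HTILE-compressed layout from an
    * uncompressed one, and decompress (with DB flushes around it) when
    * leaving a compressed layout for an uncompressed one. */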
   if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
      radv_initialize_htile(cmd_buffer, image, range);
   } else if (!radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop,
                                               src_queue_mask) &&
              radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop,
                                              dst_queue_mask)) {
      radv_initialize_htile(cmd_buffer, image, range);
   } else if (radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop,
                                              src_queue_mask) &&
              !radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop,
                                               dst_queue_mask)) {
      cmd_buffer->state.flush_bits |=
         RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;

      radv_decompress_depth_stencil(cmd_buffer, image, range, sample_locs);

      cmd_buffer->state.flush_bits |=
         RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
   }
}

static uint32_t
radv_init_cmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                const VkImageSubresourceRange *range, uint32_t value)
{
   struct radv_barrier_data barrier = {0};

   barrier.layout_transitions.init_mask_ram = 1;
   radv_describe_layout_transition(cmd_buffer, &barrier);

   return radv_clear_cmask(cmd_buffer, image, range, value);
}

uint32_t
radv_init_fmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                const VkImageSubresourceRange *range)
{
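   /* Default FMASK values indexed by log2(sample count). For 4x (0xE4 ==
    * 0b11'10'01'00 per byte) and 8x (0x76543210) these are identity mappings,
    * i.e. sample i points at fragment i; the exact encodings for the other
    * sample counts are hardware-defined. */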
   static const uint32_t fmask_clear_values[4] = {0x00000000, 0x02020202, 0xE4E4E4E4, 0x76543210};
   uint32_t log2_samples = util_logbase2(image->info.samples);
   uint32_t value = fmask_clear_values[log2_samples];
   struct radv_barrier_data barrier = {0};

   barrier.layout_transitions.init_mask_ram = 1;
   radv_describe_layout_transition(cmd_buffer, &barrier);

   return radv_clear_fmask(cmd_buffer, image, range, value);
}

uint32_t
radv_init_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
              const VkImageSubresourceRange *range, uint32_t value)
{
   struct radv_barrier_data barrier = {0};
   uint32_t flush_bits = 0;
   unsigned size = 0;

   barrier.layout_transitions.init_mask_ram = 1;
   radv_describe_layout_transition(cmd_buffer, &barrier);

   flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value);

   if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX8) {
      /* When DCC is enabled with mipmaps, some levels might not
       * support fast clears and we have to initialize them as "fully
       * expanded".
       */
      /* Compute the size of all fast clearable DCC levels. */
      for (unsigned i = 0; i < image->planes[0].surface.num_meta_levels; i++) {
         struct legacy_surf_dcc_level *dcc_level = &image->planes[0].surface.u.legacy.color.dcc_level[i];
         unsigned dcc_fast_clear_size =
            dcc_level->dcc_slice_fast_clear_size * image->info.array_size;

         if (!dcc_fast_clear_size)
            break;

         size = dcc_level->dcc_offset + dcc_fast_clear_size;
      }

      /* Initialize the mipmap levels without DCC. */
      if (size != image->planes[0].surface.meta_size) {
         flush_bits |= radv_fill_buffer(cmd_buffer, image, image->bo,
                                        image->offset + image->planes[0].surface.meta_offset + size,
                                        image->planes[0].surface.meta_size - size, 0xffffffff);
      }
   }

   return flush_bits;
}

/**
 * Initialize DCC/FMASK/CMASK metadata for a color image.
 */
static void
radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                               VkImageLayout src_layout, bool src_render_loop,
                               VkImageLayout dst_layout, bool dst_render_loop,
                               unsigned src_queue_mask, unsigned dst_queue_mask,
                               const VkImageSubresourceRange *range)
{
   uint32_t flush_bits = 0;

   /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is
    * consistent in considering previous rendering work for WAW hazards.
    */
   cmd_buffer->state.flush_bits |=
      radv_src_access_flush(cmd_buffer, VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, image);

   if (radv_image_has_cmask(image)) {
      uint32_t value;

      if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
         /* TODO: Fix clearing CMASK layers on GFX9. */
         if (radv_image_is_tc_compat_cmask(image) ||
             (radv_image_has_fmask(image) &&
              radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, dst_layout,
                                         dst_render_loop, dst_queue_mask))) {
            value = 0xccccccccu;
         } else {
            value = 0xffffffffu;
         }
      } else {
         static const uint32_t cmask_clear_values[4] = {0xffffffff, 0xdddddddd, 0xeeeeeeee, 0xffffffff};
         uint32_t log2_samples = util_logbase2(image->info.samples);

         value = cmask_clear_values[log2_samples];
      }

      flush_bits |= radv_init_cmask(cmd_buffer, image, range, value);
   }

   if (radv_image_has_fmask(image)) {
      flush_bits |= radv_init_fmask(cmd_buffer, image, range);
   }

   if (radv_dcc_enabled(image, range->baseMipLevel)) {
      uint32_t value = 0xffffffffu; /* Fully expanded mode. */

      if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
                                     dst_layout, dst_render_loop, dst_queue_mask)) {
         value = 0u;
      }

      flush_bits |= radv_init_dcc(cmd_buffer, image, range, value);
   }

   if (radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)) {
      radv_update_fce_metadata(cmd_buffer, image, range, false);

      uint32_t color_values[2] = {0};
      radv_set_color_clear_metadata(cmd_buffer, image, range, color_values);
   }

   cmd_buffer->state.flush_bits |= flush_bits;
}

static void
radv_retile_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                       VkImageLayout src_layout, VkImageLayout dst_layout, unsigned dst_queue_mask)
{
   if (src_layout != VK_IMAGE_LAYOUT_PRESENT_SRC_KHR &&
       (dst_layout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR ||
        (dst_queue_mask & (1u << RADV_QUEUE_FOREIGN))))
      radv_retile_dcc(cmd_buffer, image);
}

static bool
radv_image_need_retile(const struct radv_image *image)
{
   return image->planes[0].surface.display_dcc_offset &&
          image->planes[0].surface.display_dcc_offset != image->planes[0].surface.meta_offset;
}

/**
 * Handle color image transitions for DCC/FMASK/CMASK.
 */
static void
radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                                   VkImageLayout src_layout, bool src_render_loop,
                                   VkImageLayout dst_layout, bool dst_render_loop,
                                   unsigned src_queue_mask, unsigned dst_queue_mask,
                                   const VkImageSubresourceRange *range)
{
   bool dcc_decompressed = false, fast_clear_flushed = false;

   if (!radv_image_has_cmask(image) && !radv_image_has_fmask(image) &&
       !radv_dcc_enabled(image, range->baseMipLevel))
      return;

   if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
      radv_init_color_image_metadata(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
                                     dst_render_loop, src_queue_mask, dst_queue_mask, range);

      if (radv_image_need_retile(image))
         radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
      return;
   }

   if (radv_dcc_enabled(image, range->baseMipLevel)) {
      if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
         cmd_buffer->state.flush_bits |= radv_init_dcc(cmd_buffer, image, range, 0xffffffffu);
      } else if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
                                            src_layout, src_render_loop, src_queue_mask) &&
                 !radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
                                             dst_layout, dst_render_loop, dst_queue_mask)) {
         radv_decompress_dcc(cmd_buffer, image, range);
         dcc_decompressed = true;
      } else if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
                                            src_layout, src_render_loop, src_queue_mask) &&
                 !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
                                             dst_layout, dst_render_loop, dst_queue_mask)) {
         radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
         fast_clear_flushed = true;
      }

      if (radv_image_need_retile(image))
         radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
   } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
      if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
                                     src_layout, src_render_loop, src_queue_mask) &&
          !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
                                      dst_layout, dst_render_loop, dst_queue_mask)) {
         radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
         fast_clear_flushed = true;
      }
   }

   /* MSAA color decompress. */
   if (radv_image_has_fmask(image) &&
       (image->usage & (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT)) &&
       radv_layout_fmask_compressed(cmd_buffer->device, image, src_layout, src_queue_mask) &&
       !radv_layout_fmask_compressed(cmd_buffer->device, image, dst_layout, dst_queue_mask)) {
      if (radv_dcc_enabled(image, range->baseMipLevel) &&
          !radv_image_use_dcc_image_stores(cmd_buffer->device, image) && !dcc_decompressed) {
         /* A DCC decompress is required before expanding FMASK
          * when DCC stores aren't supported to avoid being in
          * a state where DCC is compressed and the main
          * surface is uncompressed.
          */
         radv_decompress_dcc(cmd_buffer, image, range);
      } else if (!fast_clear_flushed) {
         /* An FMASK decompress is required before expanding
          * FMASK.
          */
         radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
      }

      struct radv_barrier_data barrier = {0};
      barrier.layout_transitions.fmask_color_expand = 1;
      radv_describe_layout_transition(cmd_buffer, &barrier);

      radv_expand_fmask_image_inplace(cmd_buffer, image, range);
   }
}

static void
radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                             VkImageLayout src_layout, bool src_render_loop,
                             VkImageLayout dst_layout, bool dst_render_loop, uint32_t src_family,
                             uint32_t dst_family, const VkImageSubresourceRange *range,
                             struct radv_sample_locations_state *sample_locs)
{
   if (image->exclusive && src_family != dst_family) {
      /* This is an acquire or a release operation and there will be
       * a corresponding release/acquire. Do the transition in the
       * most flexible queue. */

      assert(src_family == cmd_buffer->queue_family_index ||
             dst_family == cmd_buffer->queue_family_index);

      if (src_family == VK_QUEUE_FAMILY_EXTERNAL || src_family == VK_QUEUE_FAMILY_FOREIGN_EXT)
         return;

      if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER)
         return;

      if (cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
          (src_family == RADV_QUEUE_GENERAL || dst_family == RADV_QUEUE_GENERAL))
         return;
   }

   unsigned src_queue_mask =
      radv_image_queue_family_mask(image, src_family, cmd_buffer->queue_family_index);
   unsigned dst_queue_mask =
      radv_image_queue_family_mask(image, dst_family, cmd_buffer->queue_family_index);

   if (src_layout == dst_layout && src_render_loop == dst_render_loop && src_queue_mask == dst_queue_mask)
      return;

   if (vk_format_has_depth(image->vk_format)) {
      radv_handle_depth_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
                                         dst_render_loop, src_queue_mask, dst_queue_mask, range,
                                         sample_locs);
   } else {
      radv_handle_color_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
                                         dst_render_loop, src_queue_mask, dst_queue_mask, range);
   }
}

struct radv_barrier_info {
   enum rgp_barrier_reason reason;
   uint32_t eventCount;
   const VkEvent *pEvents;
   VkPipelineStageFlags srcStageMask;
   VkPipelineStageFlags dstStageMask;
};

static void
radv_barrier(struct radv_cmd_buffer *cmd_buffer, uint32_t memoryBarrierCount,
             const VkMemoryBarrier *pMemoryBarriers, uint32_t bufferMemoryBarrierCount,
             const VkBufferMemoryBarrier *pBufferMemoryBarriers, uint32_t imageMemoryBarrierCount,
             const VkImageMemoryBarrier *pImageMemoryBarriers, const struct radv_barrier_info *info)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   enum radv_cmd_flush_bits src_flush_bits = 0;
   enum radv_cmd_flush_bits dst_flush_bits = 0;

   radv_describe_barrier_start(cmd_buffer, info->reason);

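   /* For vkCmdWaitEvents, make the CP poll each event's memory location and
    * wait until it reads 1 (i.e. the event has been set) before continuing. */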
   for (unsigned i = 0; i < info->eventCount; ++i) {
      RADV_FROM_HANDLE(radv_event, event, info->pEvents[i]);
      uint64_t va = radv_buffer_get_va(event->bo);

      radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);

      ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);

      radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
      assert(cmd_buffer->cs->cdw <= cdw_max);
   }

   for (uint32_t i = 0; i < memoryBarrierCount; i++) {
      src_flush_bits |= radv_src_access_flush(cmd_buffer, pMemoryBarriers[i].srcAccessMask, NULL);
      dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pMemoryBarriers[i].dstAccessMask, NULL);
   }

   for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
      src_flush_bits |=
         radv_src_access_flush(cmd_buffer, pBufferMemoryBarriers[i].srcAccessMask, NULL);
      dst_flush_bits |=
         radv_dst_access_flush(cmd_buffer, pBufferMemoryBarriers[i].dstAccessMask, NULL);
   }

   for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
      RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);

      src_flush_bits |=
         radv_src_access_flush(cmd_buffer, pImageMemoryBarriers[i].srcAccessMask, image);
      dst_flush_bits |=
         radv_dst_access_flush(cmd_buffer, pImageMemoryBarriers[i].dstAccessMask, image);
   }

   /* The Vulkan spec 1.1.98 says:
    *
    * "An execution dependency with only
    *  VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT in the destination stage mask
    *  will only prevent that stage from executing in subsequently
    *  submitted commands. As this stage does not perform any actual
    *  execution, this is not observable - in effect, it does not delay
    *  processing of subsequent commands. Similarly an execution dependency
    *  with only VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT in the source stage mask
    *  will effectively not wait for any prior commands to complete."
    */
   if (info->dstStageMask != VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT)
      radv_stage_flush(cmd_buffer, info->srcStageMask);
   cmd_buffer->state.flush_bits |= src_flush_bits;

   for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
      RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);

      const struct VkSampleLocationsInfoEXT *sample_locs_info =
         vk_find_struct_const(pImageMemoryBarriers[i].pNext, SAMPLE_LOCATIONS_INFO_EXT);
      struct radv_sample_locations_state sample_locations = {0};

      if (sample_locs_info) {
         assert(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT);
         sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel;
         sample_locations.grid_size = sample_locs_info->sampleLocationGridSize;
         sample_locations.count = sample_locs_info->sampleLocationsCount;
         typed_memcpy(&sample_locations.locations[0], sample_locs_info->pSampleLocations,
                      sample_locs_info->sampleLocationsCount);
      }

      radv_handle_image_transition(
         cmd_buffer, image, pImageMemoryBarriers[i].oldLayout,
         false, /* Outside of a renderpass we are never in a renderloop */
         pImageMemoryBarriers[i].newLayout,
         false, /* Outside of a renderpass we are never in a renderloop */
         pImageMemoryBarriers[i].srcQueueFamilyIndex, pImageMemoryBarriers[i].dstQueueFamilyIndex,
         &pImageMemoryBarriers[i].subresourceRange, sample_locs_info ? &sample_locations : NULL);
   }

   /* Make sure CP DMA is idle because the driver might have performed a
    * DMA operation for copying or filling buffers/images.
    */
   if (info->srcStageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
      si_cp_dma_wait_for_idle(cmd_buffer);

   cmd_buffer->state.flush_bits |= dst_flush_bits;

   radv_describe_barrier_end(cmd_buffer);
}

void
radv_CmdPipelineBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask,
                        VkPipelineStageFlags destStageMask, VkBool32 byRegion,
                        uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers,
                        uint32_t bufferMemoryBarrierCount,
                        const VkBufferMemoryBarrier *pBufferMemoryBarriers,
                        uint32_t imageMemoryBarrierCount,
                        const VkImageMemoryBarrier *pImageMemoryBarriers)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_barrier_info info;

   info.reason = RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER;
   info.eventCount = 0;
   info.pEvents = NULL;
   info.srcStageMask = srcStageMask;
   info.dstStageMask = destStageMask;

   radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers, bufferMemoryBarrierCount,
                pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers, &info);
}

static void
write_event(struct radv_cmd_buffer *cmd_buffer, struct radv_event *event,
            VkPipelineStageFlags stageMask, unsigned value)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va = radv_buffer_get_va(event->bo);

   si_emit_cache_flush(cmd_buffer);

   radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);

   ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28);

   /* Flags that only require a top-of-pipe event. */
   VkPipelineStageFlags top_of_pipe_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

   /* Flags that only require a post-index-fetch event. */
   VkPipelineStageFlags post_index_fetch_flags =
      top_of_pipe_flags | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;

   /* Flags that only require signaling post PS. */
   VkPipelineStageFlags post_ps_flags =
      post_index_fetch_flags | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
      VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
      VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
      VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT |
      VK_PIPELINE_STAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR |
      VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;

   /* Flags that only require signaling post CS. */
   VkPipelineStageFlags post_cs_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

   /* Make sure CP DMA is idle because the driver might have performed a
    * DMA operation for copying or filling buffers/images.
    */
   if (stageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
      si_cp_dma_wait_for_idle(cmd_buffer);

   if (!(stageMask & ~top_of_pipe_flags)) {
      /* Just need to sync the PFP engine. */
      radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
      radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, value);
   } else if (!(stageMask & ~post_index_fetch_flags)) {
      /* Sync ME because PFP reads index and indirect buffers. */
      radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
      radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, value);
   } else {
      unsigned event_type;

      if (!(stageMask & ~post_ps_flags)) {
         /* Sync previous fragment shaders. */
         event_type = V_028A90_PS_DONE;
      } else if (!(stageMask & ~post_cs_flags)) {
         /* Sync previous compute shaders. */
         event_type = V_028A90_CS_DONE;
      } else {
         /* Otherwise, sync all prior GPU work. */
         event_type = V_028A90_BOTTOM_OF_PIPE_TS;
      }

      si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class,
                                 radv_cmd_buffer_uses_mec(cmd_buffer), event_type, 0,
                                 EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, value,
                                 cmd_buffer->gfx9_eop_bug_va);
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

void
radv_CmdSetEvent(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags stageMask)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_event, event, _event);

   write_event(cmd_buffer, event, stageMask, 1);
}

void
radv_CmdResetEvent(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags stageMask)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_event, event, _event);

   write_event(cmd_buffer, event, stageMask, 0);
}

void
radv_CmdWaitEvents(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents,
                   VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask,
                   uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers,
                   uint32_t bufferMemoryBarrierCount,
                   const VkBufferMemoryBarrier *pBufferMemoryBarriers,
                   uint32_t imageMemoryBarrierCount,
                   const VkImageMemoryBarrier *pImageMemoryBarriers)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_barrier_info info;

   info.reason = RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS;
   info.eventCount = eventCount;
   info.pEvents = pEvents;
   info.srcStageMask = 0;
   /* radv_barrier() reads dstStageMask, so don't leave it uninitialized. */
   info.dstStageMask = 0;

   radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers, bufferMemoryBarrierCount,
                pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers, &info);
}

void
radv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
{
   /* No-op */
}

/* VK_EXT_conditional_rendering */
void
radv_CmdBeginConditionalRenderingEXT(
   VkCommandBuffer commandBuffer,
   const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned pred_op = PREDICATION_OP_BOOL32;
   bool draw_visible = true;
   uint64_t va;

   va = radv_buffer_get_va(buffer->bo) + pConditionalRenderingBegin->offset;

   /* By default, if the 32-bit value at offset in buffer memory is zero,
    * then the rendering commands are discarded, otherwise they are
    * executed as normal. If the inverted flag is set, all commands are
    * discarded if the value is non zero.
    */
   if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
      draw_visible = false;
   }

   si_emit_cache_flush(cmd_buffer);

   if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL &&
       !cmd_buffer->device->physical_device->rad_info.has_32bit_predication) {
      uint64_t pred_value = 0, pred_va;
      unsigned pred_offset;

      /* From the Vulkan spec 1.1.107:
       *
       * "If the 32-bit value at offset in buffer memory is zero,
       *  then the rendering commands are discarded, otherwise they
       *  are executed as normal. If the value of the predicate in
       *  buffer memory changes while conditional rendering is
       *  active, the rendering commands may be discarded in an
       *  implementation-dependent way. Some implementations may
       *  latch the value of the predicate upon beginning conditional
       *  rendering while others may read it before every rendering
       *  command."
       *
       * But, the AMD hardware treats the predicate as a 64-bit
       * value which means we need a workaround in the driver.
       * Luckily, we are not required to support the case where the
       * value changes while predication is active.
       *
       * The workaround is as follows:
       * 1) allocate a 64-bit value in the upload BO and initialize it
       *    to 0
       * 2) copy the 32-bit predicate value to the upload BO
       * 3) use the new allocated VA address for predication
       *
       * Based on the conditionalrender demo, it's faster to do the
       * COPY_DATA in ME (+ sync PFP) instead of PFP.
       */
      radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset);

      pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;

      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                         COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, pred_va);
      radeon_emit(cs, pred_va >> 32);

      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
      radeon_emit(cs, 0);

      va = pred_va;
      pred_op = PREDICATION_OP_BOOL64;
   }

   /* Enable predication for this command buffer. */
   si_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
   cmd_buffer->state.predicating = true;

   /* Store conditional rendering user info. */
   cmd_buffer->state.predication_type = draw_visible;
   cmd_buffer->state.predication_op = pred_op;
   cmd_buffer->state.predication_va = va;
}

void
radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   /* Disable predication for this command buffer. */
   si_emit_set_predication_state(cmd_buffer, false, 0, 0);
   cmd_buffer->state.predicating = false;

   /* Reset conditional rendering user info. */
   cmd_buffer->state.predication_type = -1;
   cmd_buffer->state.predication_op = 0;
   cmd_buffer->state.predication_va = 0;
}

/* VK_EXT_transform_feedback */
void
radv_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, uint32_t firstBinding,
                                        uint32_t bindingCount, const VkBuffer *pBuffers,
                                        const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
   uint8_t enabled_mask = 0;

   assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
   for (uint32_t i = 0; i < bindingCount; i++) {
      uint32_t idx = firstBinding + i;

      sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
      sb[idx].offset = pOffsets[i];

      if (!pSizes || pSizes[i] == VK_WHOLE_SIZE) {
         sb[idx].size = sb[idx].buffer->size - sb[idx].offset;
      } else {
         sb[idx].size = pSizes[i];
      }

      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, sb[idx].buffer->bo);

      enabled_mask |= 1 << idx;
   }

   cmd_buffer->state.streamout.enabled_mask |= enabled_mask;

   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
}

static void
radv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
   radeon_emit(cs, S_028B94_STREAMOUT_0_EN(so->streamout_enabled) | S_028B94_RAST_STREAM(0) |
                      S_028B94_STREAMOUT_1_EN(so->streamout_enabled) |
                      S_028B94_STREAMOUT_2_EN(so->streamout_enabled) |
                      S_028B94_STREAMOUT_3_EN(so->streamout_enabled));
   radeon_emit(cs, so->hw_enabled_mask & so->enabled_stream_buffers_mask);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

static void
radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   bool old_streamout_enabled = so->streamout_enabled;
   uint32_t old_hw_enabled_mask = so->hw_enabled_mask;

   so->streamout_enabled = enable;

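   /* hw_enabled_mask replicates the 4-bit buffer-enable mask once per vertex
    * stream, matching the four per-stream fields of the buffer-config register
    * written right after VGT_STRMOUT_CONFIG above. */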
so->hw_enabled_mask = so->enabled_mask | (so->enabled_mask << 4) | (so->enabled_mask << 8) |
7356
(so->enabled_mask << 12);
7357
7358
if (!cmd_buffer->device->physical_device->use_ngg_streamout &&
7359
((old_streamout_enabled != so->streamout_enabled) ||
7360
(old_hw_enabled_mask != so->hw_enabled_mask)))
7361
radv_emit_streamout_enable(cmd_buffer);
7362
7363
if (cmd_buffer->device->physical_device->use_ngg_streamout) {
7364
cmd_buffer->gds_needed = true;
7365
cmd_buffer->gds_oa_needed = true;
7366
}
7367
}
7368
7369
static void
7370
radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
7371
{
7372
struct radeon_cmdbuf *cs = cmd_buffer->cs;
7373
unsigned reg_strmout_cntl;
7374
7375
/* The register is at different places on different ASICs. */
7376
if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
7377
reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
7378
radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
7379
} else {
7380
reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
7381
radeon_set_config_reg(cs, reg_strmout_cntl, 0);
7382
}
7383
7384
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
7385
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
7386
7387
radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
7388
radeon_emit(cs,
7389
WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
7390
radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
7391
radeon_emit(cs, 0);
7392
radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
7393
radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
7394
radeon_emit(cs, 4); /* poll interval */
7395
}
7396
7397
static void
radv_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                          uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                          const VkDeviceSize *pCounterBufferOffsets)

{
   struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radv_flush_vgt_streamout(cmd_buffer);

   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
   u_foreach_bit(i, so->enabled_mask)
   {
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      /* AMD GCN binds streamout buffers as shader resources.
       * VGT only counts primitives and tells the shader through
       * SGPRs what to do.
       */
      radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
      radeon_emit(cs, sb[i].size >> 2);     /* BUFFER_SIZE (in DW) */
      radeon_emit(cs, so->stride_in_dw[i]); /* VTX_STRIDE (in DW) */

      cmd_buffer->state.context_roll_without_scissor_emitted = true;

      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
         /* The array of counter buffers is optional. */
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t va = radv_buffer_get_va(buffer->bo);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += buffer->offset + counter_buffer_offset;

         /* Append */
         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) |   /* offset in bytes */
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
         radeon_emit(cs, 0);        /* unused */
         radeon_emit(cs, 0);        /* unused */
         radeon_emit(cs, va);       /* src address lo */
         radeon_emit(cs, va >> 32); /* src address hi */

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      } else {
         /* Start from the beginning. */
         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) |      /* offset in bytes */
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
         radeon_emit(cs, 0); /* unused */
         radeon_emit(cs, 0); /* unused */
         radeon_emit(cs, 0); /* unused */
         radeon_emit(cs, 0); /* unused */
      }
   }

   radv_set_streamout_enable(cmd_buffer, true);
}

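/* NGG streamout (GFX10+) keeps the per-target write offsets in GDS rather
 * than in VGT registers. After idling any in-flight streamout work, the
 * initial offset of each enabled target is DMA'd into GDS dword i, either
 * from the counter buffer (append) or as an immediate zero.
 */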
static void
gfx10_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                           uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                           const VkDeviceSize *pCounterBufferOffsets)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   unsigned last_target = util_last_bit(so->enabled_mask) - 1;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);

   /* Sync because the next streamout operation will overwrite GDS and we
    * have to make sure it's idle.
    * TODO: Improve by tracking if there is a streamout operation in
    * flight.
    */
   cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
   si_emit_cache_flush(cmd_buffer);

   u_foreach_bit(i, so->enabled_mask)
   {
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      bool append =
         counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx];
      uint64_t va = 0;

      if (append) {
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += radv_buffer_get_va(buffer->bo);
         va += buffer->offset + counter_buffer_offset;

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      }

      radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
      radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
                         S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, 4 * i); /* destination in GDS */
      radeon_emit(cs, 0);
      radeon_emit(cs, S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target));
   }

   radv_set_streamout_enable(cmd_buffer, true);
}

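/* Entry point for vkCmdBeginTransformFeedbackEXT. Dispatches to the NGG/GDS
 * path or the legacy VGT path depending on the device. A typical caller-side
 * sequence (illustrative only; the buffer names are the application's own)
 * looks like:
 *
 *    vkCmdBindTransformFeedbackBuffersEXT(cmd, 0, 1, &xfb_buffer, &offset, &size);
 *    vkCmdBeginTransformFeedbackEXT(cmd, 0, 1, &counter_buffer, &counter_offset);
 *    vkCmdDraw(cmd, vertex_count, 1, 0, 0);
 *    vkCmdEndTransformFeedbackEXT(cmd, 0, 1, &counter_buffer, &counter_offset);
 */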
void
radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
                                  uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                                  const VkDeviceSize *pCounterBufferOffsets)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   if (cmd_buffer->device->physical_device->use_ngg_streamout) {
      gfx10_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount,
                                 pCounterBuffers, pCounterBufferOffsets);
   } else {
      radv_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
                                pCounterBufferOffsets);
   }
}

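/* Legacy counterpart of vkCmdEndTransformFeedbackEXT: optionally store the
 * final BUFFER_FILLED_SIZE into the counter buffer so a later Begin can
 * resume from it, then disable each target by zeroing its buffer size.
 */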
static void
radv_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                        uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                        const VkDeviceSize *pCounterBufferOffsets)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radv_flush_vgt_streamout(cmd_buffer);

   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
   u_foreach_bit(i, so->enabled_mask)
   {
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
         /* The array of counter buffers is optional. */
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t va = radv_buffer_get_va(buffer->bo);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += buffer->offset + counter_buffer_offset;

         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
                            STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
         radeon_emit(cs, va);       /* dst address lo */
         radeon_emit(cs, va >> 32); /* dst address hi */
         radeon_emit(cs, 0);        /* unused */
         radeon_emit(cs, 0);        /* unused */

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      }

      /* Deactivate transform feedback by zeroing the buffer size.
       * The counters (primitives generated, primitives emitted) may
       * be enabled even if there is no buffer bound. This ensures
       * that the primitives-emitted query won't increment.
       */
      radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);

      cmd_buffer->state.context_roll_without_scissor_emitted = true;
   }

   radv_set_streamout_enable(cmd_buffer, false);
}

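/* NGG/GDS counterpart of the above: the filled size is read out of GDS with
 * a PS_DONE EOP event (EOP_DATA_SEL_GDS) that writes it to the counter
 * buffer through the TC L2 cache.
 */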
static void
gfx10_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                         uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                         const VkDeviceSize *pCounterBufferOffsets)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);

   u_foreach_bit(i, so->enabled_mask)
   {
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
         /* The array of counter buffers is optional. */
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t va = radv_buffer_get_va(buffer->bo);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += buffer->offset + counter_buffer_offset;

         si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class,
                                    radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_PS_DONE, 0,
                                    EOP_DST_SEL_TC_L2, EOP_DATA_SEL_GDS, va, EOP_DATA_GDS(i, 1), 0);

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      }
   }

   radv_set_streamout_enable(cmd_buffer, false);
}

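/* Entry point for vkCmdEndTransformFeedbackEXT; mirrors the NGG vs. legacy
 * dispatch in radv_CmdBeginTransformFeedbackEXT above.
 */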
void
radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
                                uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                                const VkDeviceSize *pCounterBufferOffsets)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   if (cmd_buffer->device->physical_device->use_ngg_streamout) {
      gfx10_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
                               pCounterBufferOffsets);
   } else {
      radv_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
                              pCounterBufferOffsets);
   }
}

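/* vkCmdDrawIndirectByteCountEXT: the vertex count is not known on the CPU.
 * The draw instead consumes the byte count previously written by
 * vkCmdEndTransformFeedbackEXT (passed through info.strmout_buffer), and the
 * USE_OPAQUE draw mode lets the hardware derive the vertex count from that
 * byte count and vertexStride.
 */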
void
radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount,
                                 uint32_t firstInstance, VkBuffer _counterBuffer,
                                 VkDeviceSize counterBufferOffset, uint32_t counterOffset,
                                 uint32_t vertexStride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
   struct radv_draw_info info;

   info.count = 0;
   info.instance_count = instanceCount;
   info.first_instance = firstInstance;
   info.strmout_buffer = counterBuffer;
   info.strmout_buffer_offset = counterBufferOffset;
   info.stride = vertexStride;
   info.indexed = false;
   info.indirect = NULL;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;
   struct VkMultiDrawInfoEXT minfo = { 0, 0 };
   radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, S_0287F0_USE_OPAQUE(1), 0);
   radv_after_draw(cmd_buffer);
}

/* VK_AMD_buffer_marker */
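/* Write a 32-bit marker to dstBuffer at dstOffset once the requested pipeline
 * stage has drained: a TOP_OF_PIPE-only wait turns into an immediate CP
 * COPY_DATA write, anything later uses a bottom-of-pipe EOP event carrying
 * the marker value.
 */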
void
radv_CmdWriteBufferMarkerAMD(VkCommandBuffer commandBuffer, VkPipelineStageFlagBits pipelineStage,
                             VkBuffer dstBuffer, VkDeviceSize dstOffset, uint32_t marker)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, dstBuffer);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va = radv_buffer_get_va(buffer->bo) + dstOffset;

   si_emit_cache_flush(cmd_buffer);

   ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 12);

   if (!(pipelineStage & ~VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT)) {
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                         COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, marker);
      radeon_emit(cs, 0);
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
   } else {
      si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class,
                                 radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS,
                                 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, marker,
                                 cmd_buffer->gfx9_eop_bug_va);
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}