GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/freedreno/vulkan/tu_cmd_buffer.c
/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include "adreno_pm4.xml.h"
#include "adreno_common.xml.h"

#include "vk_format.h"
#include "vk_util.h"

#include "tu_cs.h"

void
tu6_emit_event_write(struct tu_cmd_buffer *cmd,
                     struct tu_cs *cs,
                     enum vgt_event_type event)
{
   bool need_seqno = false;
   switch (event) {
   case CACHE_FLUSH_TS:
   case WT_DONE_TS:
   case RB_DONE_TS:
   case PC_CCU_FLUSH_DEPTH_TS:
   case PC_CCU_FLUSH_COLOR_TS:
   case PC_CCU_RESOLVE_TS:
      need_seqno = true;
      break;
   default:
      break;
   }

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1);
   tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event));
   if (need_seqno) {
      tu_cs_emit_qw(cs, global_iova(cmd, seqno_dummy));
      tu_cs_emit(cs, 0);
   }
}
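
/* Illustrative note (added by the editor, not part of the upstream source):
 * callers use this helper for both plain events and timestamp events, e.g.
 *
 *    tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);       (1-dword packet)
 *    tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);  (4 dwords)
 *
 * For the *_TS events listed in the switch above, the extra dwords make the
 * CP write a dummy sequence number into the command buffer's global BO.
 */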

static void
tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
                 struct tu_cs *cs,
                 enum tu_cmd_flush_bits flushes)
{
   /* Experiments show that invalidating CCU while it still has data in it
    * doesn't work, so make sure to always flush before invalidating in case
    * any data remains that hasn't yet been made available through a barrier.
    * However it does seem to work for UCHE.
    */
   if (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR |
                  TU_CMD_FLAG_CCU_INVALIDATE_COLOR))
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_COLOR_TS);
   if (flushes & (TU_CMD_FLAG_CCU_FLUSH_DEPTH |
                  TU_CMD_FLAG_CCU_INVALIDATE_DEPTH))
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_DEPTH_TS);
   if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_COLOR)
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_COLOR);
   if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_DEPTH)
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_DEPTH);
   if (flushes & TU_CMD_FLAG_CACHE_FLUSH)
      tu6_emit_event_write(cmd_buffer, cs, CACHE_FLUSH_TS);
   if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE)
      tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE);
   if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
      tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
   if (flushes & TU_CMD_FLAG_WAIT_FOR_IDLE)
      tu_cs_emit_wfi(cs);
   if (flushes & TU_CMD_FLAG_WAIT_FOR_ME)
      tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
}

/* "Normal" cache flushes that don't require any special handling */

static void
tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer,
                    struct tu_cs *cs)
{
   tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.cache.flush_bits);
   cmd_buffer->state.cache.flush_bits = 0;
}

/* Renderpass cache flushes */

void
tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
                               struct tu_cs *cs)
{
   if (!cmd_buffer->state.renderpass_cache.flush_bits)
      return;
   tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.renderpass_cache.flush_bits);
   cmd_buffer->state.renderpass_cache.flush_bits = 0;
}

/* Cache flushes for things that use the color/depth read/write path (i.e.
 * blits and draws). This deals with changing CCU state as well as the usual
 * cache flushing.
 */

void
tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                        struct tu_cs *cs,
                        enum tu_cmd_ccu_state ccu_state)
{
   enum tu_cmd_flush_bits flushes = cmd_buffer->state.cache.flush_bits;

   assert(ccu_state != TU_CMD_CCU_UNKNOWN);

   /* Changing CCU state must involve invalidating the CCU. In sysmem mode,
    * the CCU may also contain data that we haven't flushed out yet, so we
    * also need to flush. Also, in order to program RB_CCU_CNTL, we need to
    * emit a WFI as it isn't pipelined.
    */
   if (ccu_state != cmd_buffer->state.ccu_state) {
      if (cmd_buffer->state.ccu_state != TU_CMD_CCU_GMEM) {
         flushes |=
            TU_CMD_FLAG_CCU_FLUSH_COLOR |
            TU_CMD_FLAG_CCU_FLUSH_DEPTH;
         cmd_buffer->state.cache.pending_flush_bits &= ~(
            TU_CMD_FLAG_CCU_FLUSH_COLOR |
            TU_CMD_FLAG_CCU_FLUSH_DEPTH);
      }
      flushes |=
         TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
         TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
         TU_CMD_FLAG_WAIT_FOR_IDLE;
      cmd_buffer->state.cache.pending_flush_bits &= ~(
         TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
         TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
         TU_CMD_FLAG_WAIT_FOR_IDLE);
   }

   tu6_emit_flushes(cmd_buffer, cs, flushes);
   cmd_buffer->state.cache.flush_bits = 0;

   if (ccu_state != cmd_buffer->state.ccu_state) {
      struct tu_physical_device *phys_dev = cmd_buffer->device->physical_device;
      tu_cs_emit_regs(cs,
                      A6XX_RB_CCU_CNTL(.color_offset =
                                          ccu_state == TU_CMD_CCU_GMEM ?
                                          phys_dev->ccu_offset_gmem :
                                          phys_dev->ccu_offset_bypass,
                                       .gmem = ccu_state == TU_CMD_CCU_GMEM));
      cmd_buffer->state.ccu_state = ccu_state;
   }
}
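
/* Illustrative summary (editor's note, not in the upstream source): the
 * transition handling above reduces to roughly
 *
 *    GMEM -> SYSMEM          : CCU invalidate (color+depth) + WFI
 *    SYSMEM/UNKNOWN -> GMEM  : CCU flush + invalidate (color+depth) + WFI
 *    no state change         : only the already-pending flush_bits
 *
 * followed, when the state changes, by reprogramming RB_CCU_CNTL with either
 * the GMEM or the bypass color offset.
 */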

static void
tu6_emit_zs(struct tu_cmd_buffer *cmd,
            const struct tu_subpass *subpass,
            struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   const uint32_t a = subpass->depth_stencil_attachment.attachment;
   if (a == VK_ATTACHMENT_UNUSED) {
      tu_cs_emit_regs(cs,
                      A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE),
                      A6XX_RB_DEPTH_BUFFER_PITCH(0),
                      A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
                      A6XX_RB_DEPTH_BUFFER_BASE(0),
                      A6XX_RB_DEPTH_BUFFER_BASE_GMEM(0));

      tu_cs_emit_regs(cs,
                      A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));

      tu_cs_emit_regs(cs,
                      A6XX_GRAS_LRZ_BUFFER_BASE(0),
                      A6XX_GRAS_LRZ_BUFFER_PITCH(0),
                      A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));

      tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0));

      return;
   }

   const struct tu_image_view *iview = fb->attachments[a].attachment;
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];
   enum a6xx_depth_format fmt = tu6_pipe2depth(attachment->format);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6);
   tu_cs_emit(cs, A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt).value);
   tu_cs_image_ref(cs, iview, 0);
   tu_cs_emit(cs, attachment->gmem_offset);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3);
   tu_cs_image_flag_ref(cs, iview, 0);

   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_BUFFER_BASE(.bo = iview->image->bo,
                      .bo_offset = iview->image->bo_offset + iview->image->lrz_offset),
                   A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = iview->image->lrz_pitch),
                   A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE());

   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
       attachment->format == VK_FORMAT_S8_UINT) {

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_INFO, 6);
      tu_cs_emit(cs, A6XX_RB_STENCIL_INFO(.separate_stencil = true).value);
      if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         tu_cs_image_stencil_ref(cs, iview, 0);
         tu_cs_emit(cs, attachment->gmem_offset_stencil);
      } else {
         tu_cs_image_ref(cs, iview, 0);
         tu_cs_emit(cs, attachment->gmem_offset);
      }
   } else {
      tu_cs_emit_regs(cs,
                      A6XX_RB_STENCIL_INFO(0));
   }
}

static void
tu6_emit_mrt(struct tu_cmd_buffer *cmd,
             const struct tu_subpass *subpass,
             struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   for (uint32_t i = 0; i < subpass->color_count; ++i) {
      uint32_t a = subpass->color_attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      const struct tu_image_view *iview = fb->attachments[a].attachment;

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6);
      tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
      tu_cs_image_ref(cs, iview, 0);
      tu_cs_emit(cs, cmd->state.pass->attachments[a].gmem_offset);

      tu_cs_emit_regs(cs,
                      A6XX_SP_FS_MRT_REG(i, .dword = iview->SP_FS_MRT_REG));

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER_ADDR(i), 3);
      tu_cs_image_flag_ref(cs, iview, 0);
   }

   tu_cs_emit_regs(cs,
                   A6XX_RB_SRGB_CNTL(.dword = subpass->srgb_cntl));
   tu_cs_emit_regs(cs,
                   A6XX_SP_SRGB_CNTL(.dword = subpass->srgb_cntl));

   unsigned layers = MAX2(fb->layers, util_logbase2(subpass->multiview_mask) + 1);
   tu_cs_emit_regs(cs, A6XX_GRAS_MAX_LAYER_INDEX(layers - 1));
}

void
tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits vk_samples)
{
   const enum a3xx_msaa_samples samples = tu_msaa_samples(vk_samples);
   bool msaa_disable = samples == MSAA_ONE;

   tu_cs_emit_regs(cs,
                   A6XX_SP_TP_RAS_MSAA_CNTL(samples),
                   A6XX_SP_TP_DEST_MSAA_CNTL(.samples = samples,
                                             .msaa_disable = msaa_disable));

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_RAS_MSAA_CNTL(samples),
                   A6XX_GRAS_DEST_MSAA_CNTL(.samples = samples,
                                            .msaa_disable = msaa_disable));

   tu_cs_emit_regs(cs,
                   A6XX_RB_RAS_MSAA_CNTL(samples),
                   A6XX_RB_DEST_MSAA_CNTL(.samples = samples,
                                          .msaa_disable = msaa_disable));

   tu_cs_emit_regs(cs,
                   A6XX_RB_MSAA_CNTL(samples));
}

static void
tu6_emit_bin_size(struct tu_cs *cs,
                  uint32_t bin_w, uint32_t bin_h, uint32_t flags)
{
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_BIN_CONTROL(.binw = bin_w,
                                         .binh = bin_h,
                                         .dword = flags));

   tu_cs_emit_regs(cs,
                   A6XX_RB_BIN_CONTROL(.binw = bin_w,
                                       .binh = bin_h,
                                       .dword = flags));

   /* no flag for RB_BIN_CONTROL2... */
   tu_cs_emit_regs(cs,
                   A6XX_RB_BIN_CONTROL2(.binw = bin_w,
                                        .binh = bin_h));
}

static void
tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
                     const struct tu_subpass *subpass,
                     struct tu_cs *cs,
                     bool binning)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   /* doesn't RB_RENDER_CNTL set differently for binning pass: */
   bool no_track = !cmd->device->physical_device->info->a6xx.has_cp_reg_write;
   uint32_t cntl = 0;
   cntl |= A6XX_RB_RENDER_CNTL_UNK4;
   if (binning) {
      if (no_track)
         return;
      cntl |= A6XX_RB_RENDER_CNTL_BINNING;
   } else {
      uint32_t mrts_ubwc_enable = 0;
      for (uint32_t i = 0; i < subpass->color_count; ++i) {
         uint32_t a = subpass->color_attachments[i].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         const struct tu_image_view *iview = fb->attachments[a].attachment;
         if (iview->ubwc_enabled)
            mrts_ubwc_enable |= 1 << i;
      }

      cntl |= A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable);

      const uint32_t a = subpass->depth_stencil_attachment.attachment;
      if (a != VK_ATTACHMENT_UNUSED) {
         const struct tu_image_view *iview = fb->attachments[a].attachment;
         if (iview->ubwc_enabled)
            cntl |= A6XX_RB_RENDER_CNTL_FLAG_DEPTH;
      }

      if (no_track) {
         tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CNTL, 1);
         tu_cs_emit(cs, cntl);
         return;
      }

      /* In the !binning case, we need to set RB_RENDER_CNTL in the draw_cs
       * in order to set it correctly for the different subpasses. However,
       * that means the packets we're emitting also happen during binning. So
       * we need to guard the write on !BINNING at CP execution time.
       */
      tu_cs_reserve(cs, 3 + 4);
      tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
      tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
                     CP_COND_REG_EXEC_0_GMEM | CP_COND_REG_EXEC_0_SYSMEM);
      tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(4));
   }

   tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
   tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL));
   tu_cs_emit(cs, REG_A6XX_RB_RENDER_CNTL);
   tu_cs_emit(cs, cntl);
}

static void
tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;
   const VkRect2D *render_area = &cmd->state.render_area;

   /* Avoid assertion fails with an empty render area at (0, 0) where the
    * subtraction below wraps around. Empty render areas should be forced to
    * the sysmem path by use_sysmem_rendering(). It's not even clear whether
    * an empty scissor here works, and the blob seems to force sysmem too as
    * it sets something wrong (non-empty) for the scissor.
    */
   if (render_area->extent.width == 0 ||
       render_area->extent.height == 0)
      return;

   uint32_t x1 = render_area->offset.x;
   uint32_t y1 = render_area->offset.y;
   uint32_t x2 = x1 + render_area->extent.width - 1;
   uint32_t y2 = y1 + render_area->extent.height - 1;

   if (align) {
      x1 = x1 & ~(phys_dev->info->gmem_align_w - 1);
      y1 = y1 & ~(phys_dev->info->gmem_align_h - 1);
      x2 = ALIGN_POT(x2 + 1, phys_dev->info->gmem_align_w) - 1;
      y2 = ALIGN_POT(y2 + 1, phys_dev->info->gmem_align_h) - 1;
   }

   tu_cs_emit_regs(cs,
                   A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
                   A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
}
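
/* Worked example (editor's note, not in the upstream source; the alignment
 * value is hypothetical): with gmem_align_w = 16 and a render area of
 * offset.x = 35, extent.width = 65, the unaligned scissor is x1 = 35,
 * x2 = 99; with align = true this becomes x1 = 35 & ~15 = 32 and
 * x2 = ALIGN_POT(100, 16) - 1 = 111, i.e. the blit scissor only grows outward
 * to the GMEM tile alignment.
 */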

void
tu6_emit_window_scissor(struct tu_cs *cs,
                        uint32_t x1,
                        uint32_t y1,
                        uint32_t x2,
                        uint32_t y2)
{
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1),
                   A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2));

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = x1, .y = y1),
                   A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = x2, .y = y2));
}

void
tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1)
{
   tu_cs_emit_regs(cs,
                   A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1));

   tu_cs_emit_regs(cs,
                   A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1));

   tu_cs_emit_regs(cs,
                   A6XX_SP_WINDOW_OFFSET(.x = x1, .y = y1));

   tu_cs_emit_regs(cs,
                   A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1));
}

static void
tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
{
   uint32_t enable_mask;
   switch (id) {
   case TU_DRAW_STATE_PROGRAM:
   case TU_DRAW_STATE_VI:
   case TU_DRAW_STATE_FS_CONST:
   /* The blob seems to not enable this (DESC_SETS_LOAD) for binning, even
    * when resources would actually be used in the binning shader.
    * Presumably the overhead of prefetching the resources isn't
    * worth it.
    */
   case TU_DRAW_STATE_DESC_SETS_LOAD:
      enable_mask = CP_SET_DRAW_STATE__0_GMEM |
                    CP_SET_DRAW_STATE__0_SYSMEM;
      break;
   case TU_DRAW_STATE_PROGRAM_BINNING:
   case TU_DRAW_STATE_VI_BINNING:
      enable_mask = CP_SET_DRAW_STATE__0_BINNING;
      break;
   case TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM:
      enable_mask = CP_SET_DRAW_STATE__0_GMEM;
      break;
   case TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM:
      enable_mask = CP_SET_DRAW_STATE__0_SYSMEM;
      break;
   default:
      enable_mask = CP_SET_DRAW_STATE__0_GMEM |
                    CP_SET_DRAW_STATE__0_SYSMEM |
                    CP_SET_DRAW_STATE__0_BINNING;
      break;
   }

   STATIC_ASSERT(TU_DRAW_STATE_COUNT <= 32);

   /* We need to reload the descriptors every time the descriptor sets
    * change. However, the commands we send only depend on the pipeline
    * because the whole point is to cache descriptors which are used by the
    * pipeline. There's a problem here, in that the firmware has an
    * "optimization" which skips executing groups that are set to the same
    * value as the last draw. This means that if the descriptor sets change
    * but not the pipeline, we'd try to re-execute the same buffer which
    * the firmware would ignore and we wouldn't pre-load the new
    * descriptors. Set the DIRTY bit to avoid this optimization
    */
   if (id == TU_DRAW_STATE_DESC_SETS_LOAD)
      enable_mask |= CP_SET_DRAW_STATE__0_DIRTY;

   tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(state.size) |
                  enable_mask |
                  CP_SET_DRAW_STATE__0_GROUP_ID(id) |
                  COND(!state.size, CP_SET_DRAW_STATE__0_DISABLE));
   tu_cs_emit_qw(cs, state.iova);
}

static bool
use_hw_binning(struct tu_cmd_buffer *cmd)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   /* XFB commands are emitted for BINNING || SYSMEM, which makes it incompatible
    * with non-hw binning GMEM rendering. This is required because some of the
    * XFB commands need to only be executed once.
    */
   if (cmd->state.xfb_used)
      return true;

   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN))
      return false;

   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN))
      return true;

   return (fb->tile_count.width * fb->tile_count.height) > 2;
}

static bool
use_sysmem_rendering(struct tu_cmd_buffer *cmd)
{
   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
      return true;

   /* can't fit attachments into gmem */
   if (!cmd->state.pass->gmem_pixels)
      return true;

   if (cmd->state.framebuffer->layers > 1)
      return true;

   /* Use sysmem for empty render areas */
   if (cmd->state.render_area.extent.width == 0 ||
       cmd->state.render_area.extent.height == 0)
      return true;

   if (cmd->state.has_tess)
      return true;

   return false;
}

static void
tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
                     struct tu_cs *cs,
                     uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM));

   const uint32_t x1 = fb->tile0.width * tx;
   const uint32_t y1 = fb->tile0.height * ty;
   const uint32_t x2 = x1 + fb->tile0.width - 1;
   const uint32_t y2 = y1 + fb->tile0.height - 1;
   tu6_emit_window_scissor(cs, x1, y1, x2, y2);
   tu6_emit_window_offset(cs, x1, y1);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));

   if (use_hw_binning(cmd)) {
      tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);

      tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
      tu_cs_emit(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5_OFFSET, 4);
      tu_cs_emit(cs, fb->pipe_sizes[pipe] |
                     CP_SET_BIN_DATA5_0_VSC_N(slot));
      tu_cs_emit(cs, pipe * cmd->vsc_draw_strm_pitch);
      tu_cs_emit(cs, pipe * 4);
      tu_cs_emit(cs, pipe * cmd->vsc_prim_strm_pitch);

      tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
      tu_cs_emit(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
      tu_cs_emit(cs, 0x0);
   } else {
      tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
      tu_cs_emit(cs, 0x1);

      tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
      tu_cs_emit(cs, 0x0);
   }
}
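
/* Worked example (editor's note, not in the upstream source; numbers are
 * hypothetical): for tile (tx, ty) = (2, 1) with tile0 = 96x96, the code above
 * programs a window scissor of (192, 96)..(287, 191) and a window offset of
 * (192, 96), then either points the CP at this pipe/slot of the visibility
 * stream (hw binning) or overrides visibility so every bin is drawn in full
 * (no hw binning).
 */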

static void
tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        uint32_t layer_mask,
                        uint32_t a,
                        uint32_t gmem_a)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   struct tu_image_view *dst = fb->attachments[a].attachment;
   struct tu_image_view *src = fb->attachments[gmem_a].attachment;

   tu_resolve_sysmem(cmd, cs, src, dst, layer_mask, fb->layers, &cmd->state.render_area);
}

static void
tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         const struct tu_subpass *subpass)
{
   if (subpass->resolve_attachments) {
      /* From the documentation for vkCmdNextSubpass, section 7.4 "Render Pass
       * Commands":
       *
       * End-of-subpass multisample resolves are treated as color
       * attachment writes for the purposes of synchronization.
       * This applies to resolve operations for both color and
       * depth/stencil attachments. That is, they are considered to
       * execute in the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT
       * pipeline stage and their writes are synchronized with
       * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT. Synchronization between
       * rendering within a subpass and any resolve operations at the end
       * of the subpass occurs automatically, without need for explicit
       * dependencies or pipeline barriers. However, if the resolve
       * attachment is also used in a different subpass, an explicit
       * dependency is needed.
       *
       * We use the CP_BLIT path for sysmem resolves, which is really a
       * transfer command, so we have to manually flush similar to the gmem
       * resolve case. However, a flush afterwards isn't needed because of the
       * last sentence and the fact that we're in sysmem mode.
       */
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      if (subpass->resolve_depth_stencil)
         tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);

      tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

      /* Wait for the flushes to land before using the 2D engine */
      tu_cs_emit_wfi(cs);

      for (unsigned i = 0; i < subpass->resolve_count; i++) {
         uint32_t a = subpass->resolve_attachments[i].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);

         tu6_emit_sysmem_resolve(cmd, cs, subpass->multiview_mask, a, gmem_a);
      }
   }
}

static void
tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const struct tu_render_pass *pass = cmd->state.pass;
   const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];

   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0x0);

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE));

   tu6_emit_blit_scissor(cmd, cs, true);

   for (uint32_t a = 0; a < pass->attachment_count; ++a) {
      if (pass->attachments[a].gmem_offset >= 0)
         tu_store_gmem_attachment(cmd, cs, a, a);
   }

   if (subpass->resolve_attachments) {
      for (unsigned i = 0; i < subpass->resolve_count; i++) {
         uint32_t a = subpass->resolve_attachments[i].attachment;
         if (a != VK_ATTACHMENT_UNUSED) {
            uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
            tu_store_gmem_attachment(cmd, cs, a, gmem_a);
         }
      }
   }
}

static void
tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
   tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
                  CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
                  CP_SET_DRAW_STATE__0_GROUP_ID(0));
   tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
   tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));

   cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
}

static void
tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   struct tu_device *dev = cmd->device;
   const struct tu_physical_device *phys_dev = dev->physical_device;

   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
         .vs_state = true,
         .hs_state = true,
         .ds_state = true,
         .gs_state = true,
         .fs_state = true,
         .cs_state = true,
         .gfx_ibo = true,
         .cs_ibo = true,
         .gfx_shared_const = true,
         .cs_shared_const = true,
         .gfx_bindless = 0x1f,
         .cs_bindless = 0x1f));

   tu_cs_emit_wfi(cs);

   cmd->state.cache.pending_flush_bits &=
      ~(TU_CMD_FLAG_WAIT_FOR_IDLE | TU_CMD_FLAG_CACHE_INVALIDATE);

   tu_cs_emit_regs(cs,
                   A6XX_RB_CCU_CNTL(.color_offset = phys_dev->ccu_offset_bypass));
   cmd->state.ccu_state = TU_CMD_CCU_SYSMEM;
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E04, 0x00100000);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_FLOAT_CNTL, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE00, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_PERFCTR_ENABLE, 0x3f);
   tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_UNKNOWN_B605, 0x44);
   tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_UNKNOWN_B600, 0x100000);
   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0);

   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9600, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8600, 0x880);
   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE04, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE03, 0x00000410);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_SHARED_CONSTS, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
   tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, 4);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, 0x0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A9A8, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_MODE_CONTROL,
                        A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);

   /* TODO: set A6XX_VFD_ADD_OFFSET_INSTANCE and fix ir3 to avoid adding base instance */
   tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
   tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x1f);

   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0);

   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881B, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881C, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881D, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_88F0, 0);

   tu_cs_emit_regs(cs, A6XX_VPC_POINT_COORD_INVERT(false));
   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9300, 0);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true));

   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B183, 0);

   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8099, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A0, 2);
   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80AF, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9602, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9E72, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B309, 0x000000a2);
   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);

   tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000);

   tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x0000001f);

   tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL()); /* always disable alpha test */
   tu_cs_emit_regs(cs, A6XX_RB_DITHER_CNTL()); /* always disable dithering */

   tu_disable_draw_states(cmd, cs);

   tu_cs_emit_regs(cs,
                   A6XX_SP_TP_BORDER_COLOR_BASE_ADDR(.bo = &dev->global_bo,
                                                     .bo_offset = gb_offset(bcolor_builtin)));
   tu_cs_emit_regs(cs,
                   A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR(.bo = &dev->global_bo,
                                                        .bo_offset = gb_offset(bcolor_builtin)));

   /* VSC buffers:
    * use vsc pitches from the largest values used so far with this device
    * if there hasn't been overflow, there will already be a scratch bo
    * allocated for these sizes
    *
    * if overflow is detected, the stream size is increased by 2x
    */
   mtx_lock(&dev->mutex);

   struct tu6_global *global = dev->global_bo.map;

   uint32_t vsc_draw_overflow = global->vsc_draw_overflow;
   uint32_t vsc_prim_overflow = global->vsc_prim_overflow;

   if (vsc_draw_overflow >= dev->vsc_draw_strm_pitch)
      dev->vsc_draw_strm_pitch = (dev->vsc_draw_strm_pitch - VSC_PAD) * 2 + VSC_PAD;

   if (vsc_prim_overflow >= dev->vsc_prim_strm_pitch)
      dev->vsc_prim_strm_pitch = (dev->vsc_prim_strm_pitch - VSC_PAD) * 2 + VSC_PAD;

   cmd->vsc_prim_strm_pitch = dev->vsc_prim_strm_pitch;
   cmd->vsc_draw_strm_pitch = dev->vsc_draw_strm_pitch;

   mtx_unlock(&dev->mutex);

   struct tu_bo *vsc_bo;
   uint32_t size0 = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES +
                    cmd->vsc_draw_strm_pitch * MAX_VSC_PIPES;

   tu_get_scratch_bo(dev, size0 + MAX_VSC_PIPES * 4, &vsc_bo);

   tu_cs_emit_regs(cs,
                   A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = vsc_bo, .bo_offset = size0));
   tu_cs_emit_regs(cs,
                   A6XX_VSC_PRIM_STRM_ADDRESS(.bo = vsc_bo));
   tu_cs_emit_regs(cs,
                   A6XX_VSC_DRAW_STRM_ADDRESS(.bo = vsc_bo,
                                              .bo_offset = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES));

   tu_cs_sanity_check(cs);
}
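
/* Editor's note (not in the upstream source): the VSC overflow handling above
 * grows a stream pitch p to (p - VSC_PAD) * 2 + VSC_PAD, i.e. the usable bytes
 * per pipe double while the padding stays constant. For example, if VSC_PAD
 * were 0x40 and the current pitch 0x440, the next recorded command buffer
 * would use a pitch of 0x840.
 */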

static void
update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   tu_cs_emit_regs(cs,
                   A6XX_VSC_BIN_SIZE(.width = fb->tile0.width,
                                     .height = fb->tile0.height));

   tu_cs_emit_regs(cs,
                   A6XX_VSC_BIN_COUNT(.nx = fb->tile_count.width,
                                      .ny = fb->tile_count.height));

   tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
   tu_cs_emit_array(cs, fb->pipe_config, 32);

   tu_cs_emit_regs(cs,
                   A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch),
                   A6XX_VSC_PRIM_STRM_LIMIT(cmd->vsc_prim_strm_pitch - VSC_PAD));

   tu_cs_emit_regs(cs,
                   A6XX_VSC_DRAW_STRM_PITCH(cmd->vsc_draw_strm_pitch),
                   A6XX_VSC_DRAW_STRM_LIMIT(cmd->vsc_draw_strm_pitch - VSC_PAD));
}

static void
emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   const uint32_t used_pipe_count =
      fb->pipe_count.width * fb->pipe_count.height;

   for (int i = 0; i < used_pipe_count; i++) {
      tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
      tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
                     CP_COND_WRITE5_0_WRITE_MEMORY);
      tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i)));
      tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
      tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_draw_strm_pitch - VSC_PAD));
      tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
      tu_cs_emit_qw(cs, global_iova(cmd, vsc_draw_overflow));
      tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_draw_strm_pitch));

      tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
      tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
                     CP_COND_WRITE5_0_WRITE_MEMORY);
      tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i)));
      tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
      tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_prim_strm_pitch - VSC_PAD));
      tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
      tu_cs_emit_qw(cs, global_iova(cmd, vsc_prim_overflow));
      tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_prim_strm_pitch));
   }

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
}
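
/* Editor's note (not in the upstream source): each CP_COND_WRITE5 above asks
 * the CP to compare VSC_{DRAW,PRIM}_STRM_SIZE_REG(i) against pitch - VSC_PAD
 * and, if the used size reached that limit, to write the current pitch into
 * global->vsc_{draw,prim}_overflow. tu6_init_hw() reads those fields back when
 * the next command buffer is recorded and grows the affected pitch.
 */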

static void
tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING));

   tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
   tu_cs_emit(cs, 0x1);

   tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
   tu_cs_emit(cs, 0x1);

   tu_cs_emit_wfi(cs);

   tu_cs_emit_regs(cs,
                   A6XX_VFD_MODE_CNTL(.binning_pass = true));

   update_vsc_pipe(cmd, cs);

   tu_cs_emit_regs(cs,
                   A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->info->a6xx.magic.PC_UNKNOWN_9805));

   tu_cs_emit_regs(cs,
                   A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->info->a6xx.magic.SP_UNKNOWN_A0F8));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, UNK_2C);

   tu_cs_emit_regs(cs,
                   A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0));

   tu_cs_emit_regs(cs,
                   A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0));

   /* emit IB to binning drawcmds: */
   tu_cs_emit_call(cs, &cmd->draw_cs);

   /* switching from binning pass to GMEM pass will cause a switch from
    * PROGRAM_BINNING to PROGRAM, which invalidates const state (XS_CONST states)
    * so make sure these states are re-emitted
    * (eventually these states shouldn't exist at all with shader prologue)
    * only VS and GS are invalidated, as FS isn't emitted in binning pass,
    * and we don't use HW binning when tessellation is used
    */
   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
   tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
                  CP_SET_DRAW_STATE__0_DISABLE |
                  CP_SET_DRAW_STATE__0_GROUP_ID(TU_DRAW_STATE_SHADER_GEOM_CONST));
   tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
   tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, UNK_2D);

   /* This flush is probably required because the VSC, which produces the
    * visibility stream, is a client of UCHE, whereas the CP needs to read the
    * visibility stream (without caching) to do draw skipping. The
    * WFI+WAIT_FOR_ME combination guarantees that the binning commands
    * submitted are finished before reading the VSC regs (in
    * emit_vsc_overflow_test) or the VSC_DATA buffer directly (implicitly as
    * part of draws).
    */
   tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS);

   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);

   emit_vsc_overflow_test(cmd, cs);

   tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
   tu_cs_emit(cs, 0x0);

   tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
   tu_cs_emit(cs, 0x0);
}

static struct tu_draw_state
tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
                          const struct tu_subpass *subpass,
                          bool gmem)
{
   /* note: we can probably emit input attachments just once for the whole
    * renderpass; this would avoid emitting both sysmem/gmem versions
    *
    * emit two texture descriptors for each input, as a workaround for
    * d24s8/d32s8, which can be sampled as both float (depth) and integer (stencil)
    * tu_shader lowers uint input attachment loads to use the 2nd descriptor
    * in the pair
    * TODO: a smarter workaround
    */

   if (!subpass->input_count)
      return (struct tu_draw_state) {};

   struct tu_cs_memory texture;
   VkResult result = tu_cs_alloc(&cmd->sub_cs, subpass->input_count * 2,
                                 A6XX_TEX_CONST_DWORDS, &texture);
   if (result != VK_SUCCESS) {
      cmd->record_result = result;
      return (struct tu_draw_state) {};
   }

   for (unsigned i = 0; i < subpass->input_count * 2; i++) {
      uint32_t a = subpass->input_attachments[i / 2].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      struct tu_image_view *iview =
         cmd->state.framebuffer->attachments[a].attachment;
      const struct tu_render_pass_attachment *att =
         &cmd->state.pass->attachments[a];
      uint32_t *dst = &texture.map[A6XX_TEX_CONST_DWORDS * i];
      uint32_t gmem_offset = att->gmem_offset;
      uint32_t cpp = att->cpp;

      memcpy(dst, iview->descriptor, A6XX_TEX_CONST_DWORDS * 4);

      if (i % 2 == 1 && att->format == VK_FORMAT_D24_UNORM_S8_UINT) {
         /* note this works because spec says fb and input attachments
          * must use identity swizzle
          */
         dst[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
            A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
            A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
         if (!cmd->device->physical_device->info->a6xx.has_z24uint_s8uint) {
            dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_8_8_8_UINT) |
               A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_W) |
               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
         } else {
            dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_Z24_UINT_S8_UINT) |
               A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_Y) |
               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
         }
      }

      if (i % 2 == 1 && att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         dst[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
         dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_UINT);
         dst[2] &= ~(A6XX_TEX_CONST_2_PITCHALIGN__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
         dst[2] |= A6XX_TEX_CONST_2_PITCH(iview->stencil_PITCH << 6);
         dst[3] = 0;
         dst[4] = iview->stencil_base_addr;
         dst[5] = (dst[5] & 0xffff) | iview->stencil_base_addr >> 32;

         cpp = att->samples;
         gmem_offset = att->gmem_offset_stencil;
      }

      if (!gmem)
         continue;

      /* patched for gmem */
      dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
      dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
      dst[2] =
         A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
         A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * cpp);
      dst[3] = 0;
      dst[4] = cmd->device->physical_device->gmem_base + gmem_offset;
      dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
      for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
         dst[i] = 0;
   }

   struct tu_cs cs;
   struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &cs, 9);

   tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                   CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                   CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
                   CP_LOAD_STATE6_0_NUM_UNIT(subpass->input_count * 2));
   tu_cs_emit_qw(&cs, texture.iova);

   tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));

   tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_COUNT(subpass->input_count * 2));

   assert(cs.cur == cs.end); /* validate draw state size */

   return ds;
}

static void
tu_set_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_subpass *subpass)
{
   struct tu_cs *cs = &cmd->draw_cs;

   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 6);
   tu_cs_emit_draw_state(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
                         tu_emit_input_attachments(cmd, subpass, true));
   tu_cs_emit_draw_state(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
                         tu_emit_input_attachments(cmd, subpass, false));
}

static void
tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
                         const VkRenderPassBeginInfo *info)
{
   struct tu_cs *cs = &cmd->draw_cs;

   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);

   tu6_emit_blit_scissor(cmd, cs, true);

   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
      tu_load_gmem_attachment(cmd, cs, i, false);

   tu6_emit_blit_scissor(cmd, cs, false);

   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
      tu_clear_gmem_attachment(cmd, cs, i, info);

   tu_cond_exec_end(cs);

   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);

   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
      tu_clear_sysmem_attachment(cmd, cs, i, info);

   tu_cond_exec_end(cs);
}

static void
tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   assert(fb->width > 0 && fb->height > 0);
   tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
   tu6_emit_window_offset(cs, 0, 0);

   tu6_emit_bin_size(cs, 0, 0, 0xc00000); /* 0xc00000 = BYPASS? */

   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));

   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0x0);

   tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);

   /* enable stream-out, with sysmem there is only one pass: */
   tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));

   tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
   tu_cs_emit(cs, 0x1);

   tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
   tu_cs_emit(cs, 0x0);

   tu_cs_sanity_check(cs);
}

static void
tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* Do any resolves of the last subpass. These are handled in the
    * tile_store_ib in the gmem path.
    */
   tu6_emit_sysmem_resolves(cmd, cs, cmd->state.subpass);

   tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);

   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0x0);

   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);

   tu_cs_sanity_check(cs);
}

static void
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;

   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);

   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0x0);

   tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM);

   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   if (use_hw_binning(cmd)) {
      /* enable stream-out during binning pass: */
      tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));

      tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
                        A6XX_RB_BIN_CONTROL_BINNING_PASS | 0x6000000);

      tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true);

      tu6_emit_binning_pass(cmd, cs);

      /* and disable stream-out for draw pass: */
      tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true));

      tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
                        A6XX_RB_BIN_CONTROL_USE_VIZ | 0x6000000);

      tu_cs_emit_regs(cs,
                      A6XX_VFD_MODE_CNTL(0));

      tu_cs_emit_regs(cs, A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->info->a6xx.magic.PC_UNKNOWN_9805));

      tu_cs_emit_regs(cs, A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->info->a6xx.magic.SP_UNKNOWN_A0F8));

      tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
      tu_cs_emit(cs, 0x1);
   } else {
      /* no binning pass, so enable stream-out for draw pass: */
      tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));

      tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height, 0x6000000);
   }

   tu_cs_sanity_check(cs);
}

static void
tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_call(cs, &cmd->draw_cs);

   if (use_hw_binning(cmd)) {
      tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
      tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
   }

   tu_cs_emit_ib(cs, &cmd->state.tile_store_ib);

   tu_cs_sanity_check(cs);
}

static void
tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_LRZ_CNTL(0));

   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);

   tu6_emit_event_write(cmd, cs, PC_CCU_RESOLVE_TS);

   tu_cs_sanity_check(cs);
}

static void
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   tu6_tile_render_begin(cmd, &cmd->cs);

   uint32_t pipe = 0;
   for (uint32_t py = 0; py < fb->pipe_count.height; py++) {
      for (uint32_t px = 0; px < fb->pipe_count.width; px++, pipe++) {
         uint32_t tx1 = px * fb->pipe0.width;
         uint32_t ty1 = py * fb->pipe0.height;
         uint32_t tx2 = MIN2(tx1 + fb->pipe0.width, fb->tile_count.width);
         uint32_t ty2 = MIN2(ty1 + fb->pipe0.height, fb->tile_count.height);
         uint32_t slot = 0;
         for (uint32_t ty = ty1; ty < ty2; ty++) {
            for (uint32_t tx = tx1; tx < tx2; tx++, slot++) {
               tu6_emit_tile_select(cmd, &cmd->cs, tx, ty, pipe, slot);
               tu6_render_tile(cmd, &cmd->cs);
            }
         }
      }
   }

   tu6_tile_render_end(cmd, &cmd->cs);
}
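
/* Worked example (editor's note, not in the upstream source; numbers are
 * hypothetical): with pipe0 = 2x2 tiles and tile_count = 3x3, pipe (px, py) =
 * (1, 0) covers tx = 2..2 and ty = 0..1, so the loop above emits slots 0 and 1
 * for that pipe, selecting each tile with tu6_emit_tile_select() and then
 * replaying the draw IB via tu6_render_tile().
 */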

static void
tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
{
   tu6_sysmem_render_begin(cmd, &cmd->cs);

   tu_cs_emit_call(&cmd->cs, &cmd->draw_cs);

   tu6_sysmem_render_end(cmd, &cmd->cs);
}

static void
tu_cmd_prepare_tile_store_ib(struct tu_cmd_buffer *cmd)
{
   const uint32_t tile_store_space = 7 + (35 * 2) * cmd->state.pass->attachment_count;
   struct tu_cs sub_cs;

   VkResult result =
      tu_cs_begin_sub_stream(&cmd->sub_cs, tile_store_space, &sub_cs);
   if (result != VK_SUCCESS) {
      cmd->record_result = result;
      return;
   }

   /* emit to tile-store sub_cs */
   tu6_emit_tile_store(cmd, &sub_cs);

   cmd->state.tile_store_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs);
}

static VkResult
tu_create_cmd_buffer(struct tu_device *device,
                     struct tu_cmd_pool *pool,
                     VkCommandBufferLevel level,
                     VkCommandBuffer *pCommandBuffer)
{
   struct tu_cmd_buffer *cmd_buffer;

   cmd_buffer = vk_object_zalloc(&device->vk, NULL, sizeof(*cmd_buffer),
                                 VK_OBJECT_TYPE_COMMAND_BUFFER);
   if (cmd_buffer == NULL)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   cmd_buffer->device = device;
   cmd_buffer->pool = pool;
   cmd_buffer->level = level;

   if (pool) {
      list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
      cmd_buffer->queue_family_index = pool->queue_family_index;

   } else {
      /* Init the pool_link so we can safely call list_del when we destroy
       * the command buffer
       */
      list_inithead(&cmd_buffer->pool_link);
      cmd_buffer->queue_family_index = TU_QUEUE_GENERAL;
   }

   tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
   tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
   tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
   tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048);

   *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer);

   return VK_SUCCESS;
}

static void
tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
{
   list_del(&cmd_buffer->pool_link);

   tu_cs_finish(&cmd_buffer->cs);
   tu_cs_finish(&cmd_buffer->draw_cs);
   tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
   tu_cs_finish(&cmd_buffer->sub_cs);

   vk_object_free(&cmd_buffer->device->vk, &cmd_buffer->pool->alloc, cmd_buffer);
}

static VkResult
tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
{
   cmd_buffer->record_result = VK_SUCCESS;

   tu_cs_reset(&cmd_buffer->cs);
   tu_cs_reset(&cmd_buffer->draw_cs);
   tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
   tu_cs_reset(&cmd_buffer->sub_cs);

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
      memset(&cmd_buffer->descriptors[i].push_set, 0, sizeof(cmd_buffer->descriptors[i].push_set));
   }

   cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;

   return cmd_buffer->record_result;
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_AllocateCommandBuffers(VkDevice _device,
                          const VkCommandBufferAllocateInfo *pAllocateInfo,
                          VkCommandBuffer *pCommandBuffers)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_cmd_pool, pool, pAllocateInfo->commandPool);

   VkResult result = VK_SUCCESS;
   uint32_t i;

   for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {

      if (!list_is_empty(&pool->free_cmd_buffers)) {
         struct tu_cmd_buffer *cmd_buffer = list_first_entry(
            &pool->free_cmd_buffers, struct tu_cmd_buffer, pool_link);

         list_del(&cmd_buffer->pool_link);
         list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);

         result = tu_reset_cmd_buffer(cmd_buffer);
         cmd_buffer->level = pAllocateInfo->level;
         vk_object_base_reset(&cmd_buffer->base);

         pCommandBuffers[i] = tu_cmd_buffer_to_handle(cmd_buffer);
      } else {
         result = tu_create_cmd_buffer(device, pool, pAllocateInfo->level,
                                       &pCommandBuffers[i]);
      }
      if (result != VK_SUCCESS)
         break;
   }

   if (result != VK_SUCCESS) {
      tu_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i,
                            pCommandBuffers);

      /* From the Vulkan 1.0.66 spec:
       *
       * "vkAllocateCommandBuffers can be used to create multiple
       * command buffers. If the creation of any of those command
       * buffers fails, the implementation must destroy all
       * successfully created command buffer objects from this
       * command, set all entries of the pCommandBuffers array to
       * NULL and return the error."
       */
      memset(pCommandBuffers, 0,
             sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
   }

   return result;
}

VKAPI_ATTR void VKAPI_CALL
tu_FreeCommandBuffers(VkDevice device,
                      VkCommandPool commandPool,
                      uint32_t commandBufferCount,
                      const VkCommandBuffer *pCommandBuffers)
{
   for (uint32_t i = 0; i < commandBufferCount; i++) {
      TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, pCommandBuffers[i]);

      if (cmd_buffer) {
         if (cmd_buffer->pool) {
            list_del(&cmd_buffer->pool_link);
            list_addtail(&cmd_buffer->pool_link,
                         &cmd_buffer->pool->free_cmd_buffers);
         } else
            tu_cmd_buffer_destroy(cmd_buffer);
      }
   }
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_ResetCommandBuffer(VkCommandBuffer commandBuffer,
                      VkCommandBufferResetFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
   return tu_reset_cmd_buffer(cmd_buffer);
}

/* Initialize the cache, assuming all necessary flushes have happened but *not*
 * invalidations.
 */
static void
tu_cache_init(struct tu_cache_state *cache)
{
   cache->flush_bits = 0;
   cache->pending_flush_bits = TU_CMD_FLAG_ALL_INVALIDATE;
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                      const VkCommandBufferBeginInfo *pBeginInfo)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
   VkResult result = VK_SUCCESS;

   if (cmd_buffer->status != TU_CMD_BUFFER_STATUS_INITIAL) {
      /* If the command buffer has already been reset with
       * vkResetCommandBuffer, no need to do it again.
       */
      result = tu_reset_cmd_buffer(cmd_buffer);
      if (result != VK_SUCCESS)
         return result;
   }

   memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
   cmd_buffer->state.index_size = 0xff; /* dirty restart index */

   cmd_buffer->state.last_vs_params.first_instance = -1;
   cmd_buffer->state.last_vs_params.params_offset = -1;
   cmd_buffer->state.last_vs_params.vertex_offset = -1;

   tu_cache_init(&cmd_buffer->state.cache);
   tu_cache_init(&cmd_buffer->state.renderpass_cache);
   cmd_buffer->usage_flags = pBeginInfo->flags;

   tu_cs_begin(&cmd_buffer->cs);
   tu_cs_begin(&cmd_buffer->draw_cs);
   tu_cs_begin(&cmd_buffer->draw_epilogue_cs);

   /* setup initial configuration into command buffer */
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
      switch (cmd_buffer->queue_family_index) {
      case TU_QUEUE_GENERAL:
         tu6_init_hw(cmd_buffer, &cmd_buffer->cs);
         break;
      default:
         break;
      }
   } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
      assert(pBeginInfo->pInheritanceInfo);

      vk_foreach_struct(ext, pBeginInfo->pInheritanceInfo) {
         switch (ext->sType) {
         case VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT: {
            const VkCommandBufferInheritanceConditionalRenderingInfoEXT *cond_rend = (void *) ext;
            cmd_buffer->state.predication_active = cond_rend->conditionalRenderingEnable;
            break;
         default:
            break;
         }
         }
      }

      if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
         cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
         cmd_buffer->state.subpass =
            &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
      } else {
         /* When executing in the middle of another command buffer, the CCU
          * state is unknown.
          */
         cmd_buffer->state.ccu_state = TU_CMD_CCU_UNKNOWN;
      }
   }

   cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING;

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
                        uint32_t firstBinding,
                        uint32_t bindingCount,
                        const VkBuffer *pBuffers,
                        const VkDeviceSize *pOffsets)
{
   tu_CmdBindVertexBuffers2EXT(commandBuffer, firstBinding, bindingCount,
                               pBuffers, pOffsets, NULL, NULL);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer,
                            uint32_t firstBinding,
                            uint32_t bindingCount,
                            const VkBuffer* pBuffers,
                            const VkDeviceSize* pOffsets,
                            const VkDeviceSize* pSizes,
                            const VkDeviceSize* pStrides)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs cs;
   /* TODO: track a "max_vb" value for the cmdbuf to save a bit of memory */
   cmd->state.vertex_buffers.iova = tu_cs_draw_state(&cmd->sub_cs, &cs, 4 * MAX_VBS).iova;

   for (uint32_t i = 0; i < bindingCount; i++) {
      if (pBuffers[i] == VK_NULL_HANDLE) {
         cmd->state.vb[firstBinding + i].base = 0;
         cmd->state.vb[firstBinding + i].size = 0;
      } else {
         struct tu_buffer *buf = tu_buffer_from_handle(pBuffers[i]);
         cmd->state.vb[firstBinding + i].base = tu_buffer_iova(buf) + pOffsets[i];
         cmd->state.vb[firstBinding + i].size = pSizes ? pSizes[i] : (buf->size - pOffsets[i]);
      }

      if (pStrides)
         cmd->state.vb[firstBinding + i].stride = pStrides[i];
   }

   for (uint32_t i = 0; i < MAX_VBS; i++) {
      tu_cs_emit_regs(&cs,
                      A6XX_VFD_FETCH_BASE(i, .qword = cmd->state.vb[i].base),
                      A6XX_VFD_FETCH_SIZE(i, cmd->state.vb[i].size));
   }

   cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;

   if (pStrides) {
      cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].iova =
         tu_cs_draw_state(&cmd->sub_cs, &cs, 2 * MAX_VBS).iova;

      for (uint32_t i = 0; i < MAX_VBS; i++)
         tu_cs_emit_regs(&cs, A6XX_VFD_FETCH_STRIDE(i, cmd->state.vb[i].stride));

      cmd->state.dirty |= TU_CMD_DIRTY_VB_STRIDE;
   }
}
1610
1611
VKAPI_ATTR void VKAPI_CALL
1612
tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
1613
VkBuffer buffer,
1614
VkDeviceSize offset,
1615
VkIndexType indexType)
1616
{
1617
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1618
TU_FROM_HANDLE(tu_buffer, buf, buffer);
1619
1620
1621
1622
uint32_t index_size, index_shift, restart_index;
1623
1624
switch (indexType) {
1625
case VK_INDEX_TYPE_UINT16:
1626
index_size = INDEX4_SIZE_16_BIT;
1627
index_shift = 1;
1628
restart_index = 0xffff;
1629
break;
1630
case VK_INDEX_TYPE_UINT32:
1631
index_size = INDEX4_SIZE_32_BIT;
1632
index_shift = 2;
1633
restart_index = 0xffffffff;
1634
break;
1635
case VK_INDEX_TYPE_UINT8_EXT:
1636
index_size = INDEX4_SIZE_8_BIT;
1637
index_shift = 0;
1638
restart_index = 0xff;
1639
break;
1640
default:
1641
unreachable("invalid VkIndexType");
1642
}
1643
1644
/* initialize/update the restart index */
1645
if (cmd->state.index_size != index_size)
1646
tu_cs_emit_regs(&cmd->draw_cs, A6XX_PC_RESTART_INDEX(restart_index));
1647
1648
assert(buf->size >= offset);
1649
1650
cmd->state.index_va = buf->bo->iova + buf->bo_offset + offset;
1651
cmd->state.max_index_count = (buf->size - offset) >> index_shift;
1652
cmd->state.index_size = index_size;
1653
}
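/* Worked example (arbitrary numbers): for a 256-byte buffer bound at offset 0
 * with VK_INDEX_TYPE_UINT16, index_shift is 1, so max_index_count above is
 * 256 >> 1 = 128 indices, and the primitive restart index is programmed as
 * 0xffff via A6XX_PC_RESTART_INDEX (when the index size changes).
 */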
1654
1655
VKAPI_ATTR void VKAPI_CALL
1656
tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
1657
VkPipelineBindPoint pipelineBindPoint,
1658
VkPipelineLayout _layout,
1659
uint32_t firstSet,
1660
uint32_t descriptorSetCount,
1661
const VkDescriptorSet *pDescriptorSets,
1662
uint32_t dynamicOffsetCount,
1663
const uint32_t *pDynamicOffsets)
1664
{
1665
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1666
TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout);
1667
unsigned dyn_idx = 0;
1668
1669
struct tu_descriptor_state *descriptors_state =
1670
tu_get_descriptors_state(cmd, pipelineBindPoint);
1671
1672
for (unsigned i = 0; i < descriptorSetCount; ++i) {
1673
unsigned idx = i + firstSet;
1674
TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]);
1675
1676
descriptors_state->sets[idx] = set;
1677
1678
for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
1679
/* update the contents of the dynamic descriptor set */
1680
unsigned src_idx = j;
1681
unsigned dst_idx = j + layout->set[idx].dynamic_offset_start;
1682
assert(dyn_idx < dynamicOffsetCount);
1683
1684
uint32_t *dst =
1685
&descriptors_state->dynamic_descriptors[dst_idx * A6XX_TEX_CONST_DWORDS];
1686
uint32_t *src =
1687
&set->dynamic_descriptors[src_idx * A6XX_TEX_CONST_DWORDS];
1688
uint32_t offset = pDynamicOffsets[dyn_idx];
1689
1690
/* Patch the storage/uniform descriptors right away. */
1691
if (layout->set[idx].layout->dynamic_ubo & (1 << j)) {
1692
/* Note: we can assume here that the addition won't roll over and
1693
* change the SIZE field.
1694
*/
1695
uint64_t va = src[0] | ((uint64_t)src[1] << 32);
1696
va += offset;
1697
dst[0] = va;
1698
dst[1] = va >> 32;
1699
} else {
1700
memcpy(dst, src, A6XX_TEX_CONST_DWORDS * 4);
1701
/* Note: A6XX_IBO_5_DEPTH is always 0 */
1702
uint64_t va = dst[4] | ((uint64_t)dst[5] << 32);
1703
va += offset;
1704
dst[4] = va;
1705
dst[5] = va >> 32;
1706
}
1707
}
1708
}
1709
assert(dyn_idx == dynamicOffsetCount);
1710
1711
uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg, hlsq_invalidate_value;
1712
uint64_t addr[MAX_SETS + 1] = {};
1713
struct tu_cs *cs, state_cs;
1714
1715
for (uint32_t i = 0; i < MAX_SETS; i++) {
1716
struct tu_descriptor_set *set = descriptors_state->sets[i];
1717
if (set)
1718
addr[i] = set->va | 3;
1719
}
1720
1721
if (layout->dynamic_offset_count) {
1722
/* allocate and fill out dynamic descriptor set */
1723
struct tu_cs_memory dynamic_desc_set;
1724
VkResult result = tu_cs_alloc(&cmd->sub_cs, layout->dynamic_offset_count,
1725
A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
1726
if (result != VK_SUCCESS) {
1727
cmd->record_result = result;
1728
return;
1729
}
1730
1731
memcpy(dynamic_desc_set.map, descriptors_state->dynamic_descriptors,
1732
layout->dynamic_offset_count * A6XX_TEX_CONST_DWORDS * 4);
1733
addr[MAX_SETS] = dynamic_desc_set.iova | 3;
1734
}
1735
1736
if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
1737
sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0);
1738
hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0);
1739
hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_GFX_BINDLESS(0x1f);
1740
1741
cmd->state.desc_sets = tu_cs_draw_state(&cmd->sub_cs, &state_cs, 24);
1742
cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS;
1743
cs = &state_cs;
1744
} else {
1745
assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE);
1746
1747
sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0);
1748
hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
1749
hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_CS_BINDLESS(0x1f);
1750
1751
cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;
1752
cs = &cmd->cs;
1753
}
1754
1755
tu_cs_emit_pkt4(cs, sp_bindless_base_reg, 10);
1756
tu_cs_emit_array(cs, (const uint32_t*) addr, 10);
1757
tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg, 10);
1758
tu_cs_emit_array(cs, (const uint32_t*) addr, 10);
1759
tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(.dword = hlsq_invalidate_value));
1760
1761
if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
1762
assert(cs->cur == cs->end); /* validate draw state size */
1763
/* note: this also avoids emitting draw states before renderpass clears,
1764
* which may use the 3D clear path (for MSAA cases)
1765
*/
1766
if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
1767
tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
1768
tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets);
1769
}
1770
}
1771
}
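/* Note on the bindless bases above (derived from the code, illustrative):
 * addr[] holds MAX_SETS + 1 qwords, one per descriptor set plus a final
 * slot for the dynamic-descriptor copy, and 10 dwords are written starting
 * at SP_BINDLESS_BASE / HLSQ_BINDLESS_BASE, i.e. five 64-bit bases, which
 * implies MAX_SETS == 4 here.
 */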
1772
1773
VKAPI_ATTR void VKAPI_CALL
1774
tu_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer,
1775
VkPipelineBindPoint pipelineBindPoint,
1776
VkPipelineLayout _layout,
1777
uint32_t _set,
1778
uint32_t descriptorWriteCount,
1779
const VkWriteDescriptorSet *pDescriptorWrites)
1780
{
1781
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1782
TU_FROM_HANDLE(tu_pipeline_layout, pipe_layout, _layout);
1783
struct tu_descriptor_set_layout *layout = pipe_layout->set[_set].layout;
1784
struct tu_descriptor_set *set =
1785
&tu_get_descriptors_state(cmd, pipelineBindPoint)->push_set;
1786
1787
struct tu_cs_memory set_mem;
1788
VkResult result = tu_cs_alloc(&cmd->sub_cs,
1789
DIV_ROUND_UP(layout->size, A6XX_TEX_CONST_DWORDS * 4),
1790
A6XX_TEX_CONST_DWORDS, &set_mem);
1791
if (result != VK_SUCCESS) {
1792
cmd->record_result = result;
1793
return;
1794
}
1795
1796
/* preserve previous content if the layout is the same: */
1797
if (set->layout == layout)
1798
memcpy(set_mem.map, set->mapped_ptr, layout->size);
1799
1800
set->layout = layout;
1801
set->mapped_ptr = set_mem.map;
1802
set->va = set_mem.iova;
1803
1804
tu_update_descriptor_sets(cmd->device, tu_descriptor_set_to_handle(set),
1805
descriptorWriteCount, pDescriptorWrites, 0, NULL);
1806
1807
tu_CmdBindDescriptorSets(commandBuffer, pipelineBindPoint, _layout, _set,
1808
1, (VkDescriptorSet[]) { tu_descriptor_set_to_handle(set) },
1809
0, NULL);
1810
}
1811
1812
VKAPI_ATTR void VKAPI_CALL
1813
tu_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer,
1814
VkDescriptorUpdateTemplate descriptorUpdateTemplate,
1815
VkPipelineLayout _layout,
1816
uint32_t _set,
1817
const void* pData)
1818
{
1819
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1820
TU_FROM_HANDLE(tu_pipeline_layout, pipe_layout, _layout);
1821
TU_FROM_HANDLE(tu_descriptor_update_template, templ, descriptorUpdateTemplate);
1822
struct tu_descriptor_set_layout *layout = pipe_layout->set[_set].layout;
1823
struct tu_descriptor_set *set =
1824
&tu_get_descriptors_state(cmd, templ->bind_point)->push_set;
1825
1826
struct tu_cs_memory set_mem;
1827
VkResult result = tu_cs_alloc(&cmd->sub_cs,
1828
DIV_ROUND_UP(layout->size, A6XX_TEX_CONST_DWORDS * 4),
1829
A6XX_TEX_CONST_DWORDS, &set_mem);
1830
if (result != VK_SUCCESS) {
1831
cmd->record_result = result;
1832
return;
1833
}
1834
1835
/* preserve previous content if the layout is the same: */
1836
if (set->layout == layout)
1837
memcpy(set_mem.map, set->mapped_ptr, layout->size);
1838
1839
set->layout = layout;
1840
set->mapped_ptr = set_mem.map;
1841
set->va = set_mem.iova;
1842
1843
tu_update_descriptor_set_with_template(cmd->device, set, descriptorUpdateTemplate, pData);
1844
1845
tu_CmdBindDescriptorSets(commandBuffer, templ->bind_point, _layout, _set,
1846
1, (VkDescriptorSet[]) { tu_descriptor_set_to_handle(set) },
1847
0, NULL);
1848
}
1849
1850
VKAPI_ATTR void VKAPI_CALL
1851
tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
1852
uint32_t firstBinding,
1853
uint32_t bindingCount,
1854
const VkBuffer *pBuffers,
1855
const VkDeviceSize *pOffsets,
1856
const VkDeviceSize *pSizes)
1857
{
1858
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1859
struct tu_cs *cs = &cmd->draw_cs;
1860
1861
/* Using COND_REG_EXEC for xfb commands matches the blob behavior;
* presumably there isn't any benefit to using a draw state when the
* condition is (SYSMEM | BINNING).
*/
1865
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1866
CP_COND_REG_EXEC_0_SYSMEM |
1867
CP_COND_REG_EXEC_0_BINNING);
1868
1869
for (uint32_t i = 0; i < bindingCount; i++) {
1870
TU_FROM_HANDLE(tu_buffer, buf, pBuffers[i]);
1871
uint64_t iova = buf->bo->iova + pOffsets[i];
1872
uint32_t size = buf->bo->size - pOffsets[i];
1873
uint32_t idx = i + firstBinding;
1874
1875
if (pSizes && pSizes[i] != VK_WHOLE_SIZE)
1876
size = pSizes[i];
1877
1878
/* BUFFER_BASE is 32-byte aligned, add remaining offset to BUFFER_OFFSET */
1879
uint32_t offset = iova & 0x1f;
1880
iova &= ~(uint64_t) 0x1f;
1881
1882
tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_BASE(idx), 3);
1883
tu_cs_emit_qw(cs, iova);
1884
tu_cs_emit(cs, size + offset);
1885
1886
cmd->state.streamout_offset[idx] = offset;
1887
}
1888
1889
tu_cond_exec_end(cs);
1890
}
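/* Worked example (arbitrary address): if buf->bo->iova + pOffsets[i] is
 * 0x100002c, the low 5 bits (0x0c) are carried in streamout_offset[] and
 * applied to VPC_SO_BUFFER_OFFSET when transform feedback begins, while
 * BUFFER_BASE gets the 32-byte aligned 0x1000020; the emitted size is bumped
 * by the same 0x0c so the end of the buffer is unchanged.
 */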
1891
1892
VKAPI_ATTR void VKAPI_CALL
1893
tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
1894
uint32_t firstCounterBuffer,
1895
uint32_t counterBufferCount,
1896
const VkBuffer *pCounterBuffers,
1897
const VkDeviceSize *pCounterBufferOffsets)
1898
{
1899
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1900
struct tu_cs *cs = &cmd->draw_cs;
1901
1902
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1903
CP_COND_REG_EXEC_0_SYSMEM |
1904
CP_COND_REG_EXEC_0_BINNING);
1905
1906
/* TODO: only update offset for active buffers */
1907
for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++)
1908
tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_OFFSET(i, cmd->state.streamout_offset[i]));
1909
1910
for (uint32_t i = 0; i < (pCounterBuffers ? counterBufferCount : 0); i++) {
1911
uint32_t idx = firstCounterBuffer + i;
1912
uint32_t offset = cmd->state.streamout_offset[idx];
1913
uint64_t counter_buffer_offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0u;
1914
1915
if (!pCounterBuffers[i])
1916
continue;
1917
1918
TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);
1919
1920
tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
1921
tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
1922
CP_MEM_TO_REG_0_UNK31 |
1923
CP_MEM_TO_REG_0_CNT(1));
1924
tu_cs_emit_qw(cs, buf->bo->iova + counter_buffer_offset);
1925
1926
if (offset) {
1927
tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
1928
tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
1929
CP_REG_RMW_0_SRC1_ADD);
1930
tu_cs_emit_qw(cs, 0xffffffff);
1931
tu_cs_emit_qw(cs, offset);
1932
}
1933
}
1934
1935
tu_cond_exec_end(cs);
1936
}
1937
1938
VKAPI_ATTR void VKAPI_CALL
1939
tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
1940
uint32_t firstCounterBuffer,
1941
uint32_t counterBufferCount,
1942
const VkBuffer *pCounterBuffers,
1943
const VkDeviceSize *pCounterBufferOffsets)
1944
{
1945
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1946
struct tu_cs *cs = &cmd->draw_cs;
1947
1948
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1949
CP_COND_REG_EXEC_0_SYSMEM |
1950
CP_COND_REG_EXEC_0_BINNING);
1951
1952
/* TODO: only flush buffers that need to be flushed */
1953
for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
1954
/* note: FLUSH_BASE is always the same, so it could go in init_hw()? */
1955
tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);
1956
tu_cs_emit_qw(cs, global_iova(cmd, flush_base[i]));
1957
tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i);
1958
}
1959
1960
for (uint32_t i = 0; i < (pCounterBuffers ? counterBufferCount : 0); i++) {
1961
uint32_t idx = firstCounterBuffer + i;
1962
uint32_t offset = cmd->state.streamout_offset[idx];
1963
uint64_t counter_buffer_offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0u;
1964
1965
if (!pCounterBuffers[i])
1966
continue;
1967
1968
TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);
1969
1970
/* VPC_SO_FLUSH_BASE holds a dword count, but the counter buffer stores bytes, hence the SHIFT_BY_2 (multiply by 4) when reading it back below */
1971
tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
1972
tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
1973
CP_MEM_TO_REG_0_SHIFT_BY_2 |
1974
0x40000 | /* ??? */
1975
CP_MEM_TO_REG_0_UNK31 |
1976
CP_MEM_TO_REG_0_CNT(1));
1977
tu_cs_emit_qw(cs, global_iova(cmd, flush_base[idx]));
1978
1979
if (offset) {
1980
tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
1981
tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
1982
CP_REG_RMW_0_SRC1_ADD);
1983
tu_cs_emit_qw(cs, 0xffffffff);
1984
tu_cs_emit_qw(cs, -offset);
1985
}
1986
1987
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1988
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
1989
CP_REG_TO_MEM_0_CNT(1));
1990
tu_cs_emit_qw(cs, buf->bo->iova + counter_buffer_offset);
1991
}
1992
1993
tu_cond_exec_end(cs);
1994
1995
cmd->state.xfb_used = true;
1996
}
1997
1998
VKAPI_ATTR void VKAPI_CALL
1999
tu_CmdPushConstants(VkCommandBuffer commandBuffer,
2000
VkPipelineLayout layout,
2001
VkShaderStageFlags stageFlags,
2002
uint32_t offset,
2003
uint32_t size,
2004
const void *pValues)
2005
{
2006
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2007
memcpy((void*) cmd->push_constants + offset, pValues, size);
2008
cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS;
2009
}
2010
2011
/* Flush everything which has been made available but we haven't actually
2012
* flushed yet.
2013
*/
2014
static void
2015
tu_flush_all_pending(struct tu_cache_state *cache)
2016
{
2017
cache->flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
2018
cache->pending_flush_bits &= ~TU_CMD_FLAG_ALL_FLUSH;
2019
}
2020
2021
VKAPI_ATTR VkResult VKAPI_CALL
2022
tu_EndCommandBuffer(VkCommandBuffer commandBuffer)
2023
{
2024
TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2025
2026
/* We currently flush CCU at the end of the command buffer, like
2027
* what the blob does. There's implicit synchronization around every
2028
* vkQueueSubmit, but the kernel only flushes the UCHE, and we don't
2029
* know yet if this command buffer will be the last in the submit so we
2030
* have to defensively flush everything else.
2031
*
2032
* TODO: We could definitely do better than this, since these flushes
2033
* aren't required by Vulkan, but we'd need kernel support to do that.
2034
* Ideally, we'd like the kernel to flush everything afterwards, so that we
2035
* wouldn't have to do any flushes here, and when submitting multiple
2036
* command buffers there wouldn't be any unnecessary flushes in between.
2037
*/
2038
if (cmd_buffer->state.pass) {
2039
tu_flush_all_pending(&cmd_buffer->state.renderpass_cache);
2040
tu_emit_cache_flush_renderpass(cmd_buffer, &cmd_buffer->draw_cs);
2041
} else {
2042
tu_flush_all_pending(&cmd_buffer->state.cache);
2043
cmd_buffer->state.cache.flush_bits |=
2044
TU_CMD_FLAG_CCU_FLUSH_COLOR |
2045
TU_CMD_FLAG_CCU_FLUSH_DEPTH;
2046
tu_emit_cache_flush(cmd_buffer, &cmd_buffer->cs);
2047
}
2048
2049
tu_cs_end(&cmd_buffer->cs);
2050
tu_cs_end(&cmd_buffer->draw_cs);
2051
tu_cs_end(&cmd_buffer->draw_epilogue_cs);
2052
2053
cmd_buffer->status = TU_CMD_BUFFER_STATUS_EXECUTABLE;
2054
2055
return cmd_buffer->record_result;
2056
}
2057
2058
static struct tu_cs
2059
tu_cmd_dynamic_state(struct tu_cmd_buffer *cmd, uint32_t id, uint32_t size)
2060
{
2061
struct tu_cs cs;
2062
2063
assert(id < ARRAY_SIZE(cmd->state.dynamic_state));
2064
cmd->state.dynamic_state[id] = tu_cs_draw_state(&cmd->sub_cs, &cs, size);
2065
2066
/* note: this also avoids emitting draw states before renderpass clears,
2067
* which may use the 3D clear path (for MSAA cases)
2068
*/
2069
if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)
2070
return cs;
2071
2072
tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
2073
tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DYNAMIC + id, cmd->state.dynamic_state[id]);
2074
2075
return cs;
2076
}
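/* Sizing note (illustrative): callers must fill the returned cs with exactly
 * "size" dwords. tu_CmdSetBlendConstants below, for instance, requests 5
 * dwords and emits a 1-dword pkt4 header plus the 4 blend-constant dwords,
 * exactly filling the allocation.
 */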
2077
2078
VKAPI_ATTR void VKAPI_CALL
2079
tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
2080
VkPipelineBindPoint pipelineBindPoint,
2081
VkPipeline _pipeline)
2082
{
2083
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2084
TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
2085
2086
if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) {
2087
cmd->state.compute_pipeline = pipeline;
2088
tu_cs_emit_state_ib(&cmd->cs, pipeline->program.state);
2089
return;
2090
}
2091
2092
assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS);
2093
2094
cmd->state.pipeline = pipeline;
2095
cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS | TU_CMD_DIRTY_LRZ;
2096
2097
/* note: this also avoids emitting draw states before renderpass clears,
2098
* which may use the 3D clear path (for MSAA cases)
2099
*/
2100
if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
2101
struct tu_cs *cs = &cmd->draw_cs;
2102
uint32_t mask = ~pipeline->dynamic_state_mask & BITFIELD_MASK(TU_DYNAMIC_STATE_COUNT);
2103
2104
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (7 + util_bitcount(mask)));
2105
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
2106
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
2107
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
2108
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI, pipeline->vi.state);
2109
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state);
2110
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state);
2111
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_BLEND, pipeline->blend_state);
2112
2113
u_foreach_bit(i, mask)
2114
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i, pipeline->dynamic_state[i]);
2115
}
2116
2117
/* The vertex_buffers draw state always contains all the currently
* bound vertex buffers. Update its size so that only the VBs actually
* used by the pipeline are emitted.
* Note: there is a HW optimization which avoids fully re-executing the
* draw state when only its size changes.
*/
2123
if (cmd->state.vertex_buffers.size != pipeline->num_vbs * 4) {
2124
cmd->state.vertex_buffers.size = pipeline->num_vbs * 4;
2125
cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
2126
}
2127
2128
if ((pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE)) &&
2129
cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].size != pipeline->num_vbs * 2) {
2130
cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].size = pipeline->num_vbs * 2;
2131
cmd->state.dirty |= TU_CMD_DIRTY_VB_STRIDE;
2132
}
2133
2134
#define UPDATE_REG(X, Y) { \
2135
/* note: would be better to have pipeline bits already masked */ \
2136
uint32_t pipeline_bits = pipeline->X & pipeline->X##_mask; \
2137
if ((cmd->state.X & pipeline->X##_mask) != pipeline_bits) { \
2138
cmd->state.X &= ~pipeline->X##_mask; \
2139
cmd->state.X |= pipeline_bits; \
2140
cmd->state.dirty |= TU_CMD_DIRTY_##Y; \
2141
} \
2142
if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_##Y))) \
2143
cmd->state.dirty &= ~TU_CMD_DIRTY_##Y; \
2144
}
2145
2146
/* These registers can have bits set from both pipeline and dynamic state;
* this updates the bits set by the pipeline. If the pipeline doesn't use
* a dynamic state for the register, the relevant dirty bit is cleared to
* avoid overriding the non-dynamic state with a stale dynamic state on
* the next draw.
*/
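/* Roughly, UPDATE_REG(gras_su_cntl, GRAS_SU_CNTL) below compares the
 * pipeline-owned bits (cmd->state.gras_su_cntl & pipeline->gras_su_cntl_mask)
 * against the pipeline's masked gras_su_cntl value, merges the pipeline's
 * bits in and sets TU_CMD_DIRTY_GRAS_SU_CNTL if they differ, then clears that
 * dirty bit again when TU_DYNAMIC_STATE_GRAS_SU_CNTL isn't in the pipeline's
 * dynamic_state_mask.
 */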
2152
UPDATE_REG(gras_su_cntl, GRAS_SU_CNTL);
2153
UPDATE_REG(rb_depth_cntl, RB_DEPTH_CNTL);
2154
UPDATE_REG(rb_stencil_cntl, RB_STENCIL_CNTL);
2155
#undef UPDATE_REG
2156
2157
if (pipeline->rb_depth_cntl_disable)
2158
cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
2159
}
2160
2161
VKAPI_ATTR void VKAPI_CALL
2162
tu_CmdSetViewport(VkCommandBuffer commandBuffer,
2163
uint32_t firstViewport,
2164
uint32_t viewportCount,
2165
const VkViewport *pViewports)
2166
{
2167
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2168
struct tu_cs cs;
2169
2170
memcpy(&cmd->state.viewport[firstViewport], pViewports, viewportCount * sizeof(*pViewports));
2171
cmd->state.max_viewport = MAX2(cmd->state.max_viewport, firstViewport + viewportCount);
2172
2173
cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * cmd->state.max_viewport);
2174
tu6_emit_viewport(&cs, cmd->state.viewport, cmd->state.max_viewport);
2175
}
2176
2177
VKAPI_ATTR void VKAPI_CALL
2178
tu_CmdSetScissor(VkCommandBuffer commandBuffer,
2179
uint32_t firstScissor,
2180
uint32_t scissorCount,
2181
const VkRect2D *pScissors)
2182
{
2183
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2184
struct tu_cs cs;
2185
2186
memcpy(&cmd->state.scissor[firstScissor], pScissors, scissorCount * sizeof(*pScissors));
2187
cmd->state.max_scissor = MAX2(cmd->state.max_scissor, firstScissor + scissorCount);
2188
2189
cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * cmd->state.max_scissor);
2190
tu6_emit_scissor(&cs, cmd->state.scissor, cmd->state.max_scissor);
2191
}
2192
2193
VKAPI_ATTR void VKAPI_CALL
2194
tu_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
2195
{
2196
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2197
2198
cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;
2199
cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(lineWidth / 2.0f);
2200
2201
cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
2202
}
2203
2204
VKAPI_ATTR void VKAPI_CALL
2205
tu_CmdSetDepthBias(VkCommandBuffer commandBuffer,
2206
float depthBiasConstantFactor,
2207
float depthBiasClamp,
2208
float depthBiasSlopeFactor)
2209
{
2210
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2211
struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BIAS, 4);
2212
2213
tu6_emit_depth_bias(&cs, depthBiasConstantFactor, depthBiasClamp, depthBiasSlopeFactor);
2214
}
2215
2216
VKAPI_ATTR void VKAPI_CALL
2217
tu_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
2218
const float blendConstants[4])
2219
{
2220
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2221
struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5);
2222
2223
tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4);
2224
tu_cs_emit_array(&cs, (const uint32_t *) blendConstants, 4);
2225
}
2226
2227
VKAPI_ATTR void VKAPI_CALL
2228
tu_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2229
float minDepthBounds,
2230
float maxDepthBounds)
2231
{
2232
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2233
struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3);
2234
2235
tu_cs_emit_regs(&cs,
2236
A6XX_RB_Z_BOUNDS_MIN(minDepthBounds),
2237
A6XX_RB_Z_BOUNDS_MAX(maxDepthBounds));
2238
}
2239
2240
void
2241
update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask)
2242
{
2243
if (face & VK_STENCIL_FACE_FRONT_BIT)
2244
*value = (*value & 0xff00) | (mask & 0xff);
2245
if (face & VK_STENCIL_FACE_BACK_BIT)
2246
*value = (*value & 0xff) | (mask & 0xff) << 8;
2247
}
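/* Layout assumed above (derived from the masking): the front-face value is
 * packed into bits [7:0] and the back-face value into bits [15:8] of a
 * single dword, which is how the RB_STENCILREF / RB_STENCILMASK /
 * RB_STENCILWRMASK registers written below consume it.
 */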
2248
2249
VKAPI_ATTR void VKAPI_CALL
2250
tu_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
2251
VkStencilFaceFlags faceMask,
2252
uint32_t compareMask)
2253
{
2254
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2255
struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2);
2256
2257
update_stencil_mask(&cmd->state.dynamic_stencil_mask, faceMask, compareMask);
2258
2259
tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.dword = cmd->state.dynamic_stencil_mask));
2260
}
2261
2262
VKAPI_ATTR void VKAPI_CALL
2263
tu_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
2264
VkStencilFaceFlags faceMask,
2265
uint32_t writeMask)
2266
{
2267
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2268
struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2);
2269
2270
update_stencil_mask(&cmd->state.dynamic_stencil_wrmask, faceMask, writeMask);
2271
2272
tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = cmd->state.dynamic_stencil_wrmask));
2273
2274
cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
2275
}
2276
2277
VKAPI_ATTR void VKAPI_CALL
2278
tu_CmdSetStencilReference(VkCommandBuffer commandBuffer,
2279
VkStencilFaceFlags faceMask,
2280
uint32_t reference)
2281
{
2282
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2283
struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2);
2284
2285
update_stencil_mask(&cmd->state.dynamic_stencil_ref, faceMask, reference);
2286
2287
tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.dword = cmd->state.dynamic_stencil_ref));
2288
}
2289
2290
VKAPI_ATTR void VKAPI_CALL
2291
tu_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,
2292
const VkSampleLocationsInfoEXT* pSampleLocationsInfo)
2293
{
2294
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2295
struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS, 9);
2296
2297
assert(pSampleLocationsInfo);
2298
2299
tu6_emit_sample_locations(&cs, pSampleLocationsInfo);
2300
}
2301
2302
VKAPI_ATTR void VKAPI_CALL
2303
tu_CmdSetCullModeEXT(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)
2304
{
2305
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2306
2307
cmd->state.gras_su_cntl &=
2308
~(A6XX_GRAS_SU_CNTL_CULL_FRONT | A6XX_GRAS_SU_CNTL_CULL_BACK);
2309
2310
if (cullMode & VK_CULL_MODE_FRONT_BIT)
2311
cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_FRONT;
2312
if (cullMode & VK_CULL_MODE_BACK_BIT)
2313
cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_BACK;
2314
2315
cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
2316
}
2317
2318
VKAPI_ATTR void VKAPI_CALL
2319
tu_CmdSetFrontFaceEXT(VkCommandBuffer commandBuffer, VkFrontFace frontFace)
2320
{
2321
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2322
2323
cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_FRONT_CW;
2324
2325
if (frontFace == VK_FRONT_FACE_CLOCKWISE)
2326
cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW;
2327
2328
cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
2329
}
2330
2331
VKAPI_ATTR void VKAPI_CALL
2332
tu_CmdSetPrimitiveTopologyEXT(VkCommandBuffer commandBuffer,
2333
VkPrimitiveTopology primitiveTopology)
2334
{
2335
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2336
2337
cmd->state.primtype = tu6_primtype(primitiveTopology);
2338
}
2339
2340
VKAPI_ATTR void VKAPI_CALL
2341
tu_CmdSetViewportWithCountEXT(VkCommandBuffer commandBuffer,
2342
uint32_t viewportCount,
2343
const VkViewport* pViewports)
2344
{
2345
tu_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
2346
}
2347
2348
VKAPI_ATTR void VKAPI_CALL
2349
tu_CmdSetScissorWithCountEXT(VkCommandBuffer commandBuffer,
2350
uint32_t scissorCount,
2351
const VkRect2D* pScissors)
2352
{
2353
tu_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);
2354
}
2355
2356
VKAPI_ATTR void VKAPI_CALL
2357
tu_CmdSetDepthTestEnableEXT(VkCommandBuffer commandBuffer,
2358
VkBool32 depthTestEnable)
2359
{
2360
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2361
2362
cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_ENABLE;
2363
2364
if (depthTestEnable)
2365
cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_ENABLE;
2366
2367
cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
2368
}
2369
2370
VKAPI_ATTR void VKAPI_CALL
2371
tu_CmdSetDepthWriteEnableEXT(VkCommandBuffer commandBuffer,
2372
VkBool32 depthWriteEnable)
2373
{
2374
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2375
2376
cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
2377
2378
if (depthWriteEnable)
2379
cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
2380
2381
cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
2382
}
2383
2384
VKAPI_ATTR void VKAPI_CALL
2385
tu_CmdSetDepthCompareOpEXT(VkCommandBuffer commandBuffer,
2386
VkCompareOp depthCompareOp)
2387
{
2388
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2389
2390
cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK;
2391
2392
cmd->state.rb_depth_cntl |=
2393
A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(depthCompareOp));
2394
2395
cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
2396
}
2397
2398
VKAPI_ATTR void VKAPI_CALL
2399
tu_CmdSetDepthBoundsTestEnableEXT(VkCommandBuffer commandBuffer,
2400
VkBool32 depthBoundsTestEnable)
2401
{
2402
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2403
2404
cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;
2405
2406
if (depthBoundsTestEnable)
2407
cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;
2408
2409
cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
2410
}
2411
2412
VKAPI_ATTR void VKAPI_CALL
2413
tu_CmdSetStencilTestEnableEXT(VkCommandBuffer commandBuffer,
2414
VkBool32 stencilTestEnable)
2415
{
2416
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2417
2418
cmd->state.rb_stencil_cntl &= ~(
2419
A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
2420
A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
2421
A6XX_RB_STENCIL_CONTROL_STENCIL_READ);
2422
2423
if (stencilTestEnable) {
2424
cmd->state.rb_stencil_cntl |=
2425
A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
2426
A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
2427
A6XX_RB_STENCIL_CONTROL_STENCIL_READ;
2428
}
2429
2430
cmd->state.dirty |= TU_CMD_DIRTY_RB_STENCIL_CNTL;
2431
}
2432
2433
VKAPI_ATTR void VKAPI_CALL
2434
tu_CmdSetStencilOpEXT(VkCommandBuffer commandBuffer,
2435
VkStencilFaceFlags faceMask,
2436
VkStencilOp failOp,
2437
VkStencilOp passOp,
2438
VkStencilOp depthFailOp,
2439
VkCompareOp compareOp)
2440
{
2441
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2442
2443
if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
2444
cmd->state.rb_stencil_cntl &= ~(
2445
A6XX_RB_STENCIL_CONTROL_FUNC__MASK |
2446
A6XX_RB_STENCIL_CONTROL_FAIL__MASK |
2447
A6XX_RB_STENCIL_CONTROL_ZPASS__MASK |
2448
A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK);
2449
2450
cmd->state.rb_stencil_cntl |=
2451
A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(compareOp)) |
2452
A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(failOp)) |
2453
A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(passOp)) |
2454
A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(depthFailOp));
2455
}
2456
2457
if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
2458
cmd->state.rb_stencil_cntl &= ~(
2459
A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK |
2460
A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK |
2461
A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK |
2462
A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK);
2463
2464
cmd->state.rb_stencil_cntl |=
2465
A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(compareOp)) |
2466
A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(failOp)) |
2467
A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(passOp)) |
2468
A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(depthFailOp));
2469
}
2470
2471
cmd->state.dirty |= TU_CMD_DIRTY_RB_STENCIL_CNTL;
2472
}
2473
2474
static void
2475
tu_flush_for_access(struct tu_cache_state *cache,
2476
enum tu_cmd_access_mask src_mask,
2477
enum tu_cmd_access_mask dst_mask)
2478
{
2479
enum tu_cmd_flush_bits flush_bits = 0;
2480
2481
if (src_mask & TU_ACCESS_HOST_WRITE) {
2482
/* Host writes are always visible to CP, so only invalidate GPU caches */
2483
cache->pending_flush_bits |= TU_CMD_FLAG_GPU_INVALIDATE;
2484
}
2485
2486
if (src_mask & TU_ACCESS_SYSMEM_WRITE) {
2487
/* Invalidate CP and 2D engine (make it do WFI + WFM if necessary) as
2488
* well.
2489
*/
2490
cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE;
2491
}
2492
2493
if (src_mask & TU_ACCESS_CP_WRITE) {
2494
/* Flush the CP write queue. However a WFI shouldn't be necessary as
2495
* WAIT_MEM_WRITES should cover it.
2496
*/
2497
cache->pending_flush_bits |=
2498
TU_CMD_FLAG_WAIT_MEM_WRITES |
2499
TU_CMD_FLAG_GPU_INVALIDATE |
2500
TU_CMD_FLAG_WAIT_FOR_ME;
2501
}
2502
2503
#define SRC_FLUSH(domain, flush, invalidate) \
2504
if (src_mask & TU_ACCESS_##domain##_WRITE) { \
2505
cache->pending_flush_bits |= TU_CMD_FLAG_##flush | \
2506
(TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \
2507
}
2508
2509
SRC_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)
2510
SRC_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2511
SRC_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2512
2513
#undef SRC_FLUSH
2514
2515
#define SRC_INCOHERENT_FLUSH(domain, flush, invalidate) \
2516
if (src_mask & TU_ACCESS_##domain##_INCOHERENT_WRITE) { \
2517
flush_bits |= TU_CMD_FLAG_##flush; \
2518
cache->pending_flush_bits |= \
2519
(TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \
2520
}
2521
2522
SRC_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2523
SRC_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2524
2525
#undef SRC_INCOHERENT_FLUSH
2526
2527
/* Treat host & sysmem write accesses the same, since the kernel implicitly
2528
* drains the queue before signalling completion to the host.
2529
*/
2530
if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE |
2531
TU_ACCESS_HOST_READ | TU_ACCESS_HOST_WRITE)) {
2532
flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
2533
}
2534
2535
#define DST_FLUSH(domain, flush, invalidate) \
2536
if (dst_mask & (TU_ACCESS_##domain##_READ | \
2537
TU_ACCESS_##domain##_WRITE)) { \
2538
flush_bits |= cache->pending_flush_bits & \
2539
(TU_CMD_FLAG_##invalidate | \
2540
(TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \
2541
}
2542
2543
DST_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)
2544
DST_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2545
DST_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2546
2547
#undef DST_FLUSH
2548
2549
#define DST_INCOHERENT_FLUSH(domain, flush, invalidate) \
2550
if (dst_mask & (TU_ACCESS_##domain##_INCOHERENT_READ | \
2551
TU_ACCESS_##domain##_INCOHERENT_WRITE)) { \
2552
flush_bits |= TU_CMD_FLAG_##invalidate | \
2553
(cache->pending_flush_bits & \
2554
(TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \
2555
}
2556
2557
DST_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
2558
DST_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
2559
2560
#undef DST_INCOHERENT_FLUSH
2561
2562
if (dst_mask & TU_ACCESS_WFI_READ) {
2563
flush_bits |= cache->pending_flush_bits &
2564
(TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_WAIT_FOR_IDLE);
2565
}
2566
2567
if (dst_mask & TU_ACCESS_WFM_READ) {
2568
flush_bits |= cache->pending_flush_bits &
2569
(TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_WAIT_FOR_ME);
2570
}
2571
2572
cache->flush_bits |= flush_bits;
2573
cache->pending_flush_bits &= ~flush_bits;
2574
}
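/* Worked example (illustrative): for a barrier from an incoherent CCU color
 * write to a UCHE (shader) read, SRC_INCOHERENT_FLUSH adds CCU_FLUSH_COLOR to
 * flush_bits immediately and queues the remaining invalidates as pending;
 * DST_FLUSH(UCHE, ...) then pulls CACHE_INVALIDATE out of the pending bits,
 * so the barrier ends up emitting a CCU color flush plus a UCHE invalidate,
 * while e.g. the CCU depth invalidate stays pending for a later barrier.
 */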
2575
2576
static enum tu_cmd_access_mask
2577
vk2tu_access(VkAccessFlags flags, bool gmem)
2578
{
2579
enum tu_cmd_access_mask mask = 0;
2580
2581
/* If the GPU writes a buffer that is then read by an indirect draw
2582
* command, we theoretically need to emit a WFI to wait for any cache
2583
* flushes, and then a WAIT_FOR_ME to wait on the CP for the WFI to
2584
* complete. Waiting for the WFI to complete is performed as part of the
2585
* draw by the firmware, so we just need to execute the WFI.
2586
*
2587
* Transform feedback counters are read via CP_MEM_TO_REG, which implicitly
2588
* does CP_WAIT_FOR_ME, but we still need a WFI if the GPU writes it.
2589
*
2590
* Currently we read the draw predicate using CP_MEM_TO_MEM, which
2591
* also implicitly does CP_WAIT_FOR_ME. However CP_DRAW_PRED_SET does *not*
2592
* implicitly do CP_WAIT_FOR_ME, it seems to only wait for counters to
2593
* complete since it's written for DX11 where you can only predicate on the
2594
* result of a query object. So if we implement 64-bit comparisons in the
2595
* future, or if CP_DRAW_PRED_SET grows the capability to do 32-bit
2596
* comparisons, then this will have to be dealt with.
2597
*/
2598
if (flags &
2599
(VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
2600
VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT |
2601
VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT |
2602
VK_ACCESS_MEMORY_READ_BIT)) {
2603
mask |= TU_ACCESS_WFI_READ;
2604
}
2605
2606
if (flags &
2607
(VK_ACCESS_INDIRECT_COMMAND_READ_BIT | /* Read performed by CP */
2608
VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT | /* Read performed by CP */
2609
VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | /* Read performed by CP */
2610
VK_ACCESS_MEMORY_READ_BIT)) {
2611
mask |= TU_ACCESS_SYSMEM_READ;
2612
}
2613
2614
if (flags &
2615
(VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT |
2616
VK_ACCESS_MEMORY_WRITE_BIT)) {
2617
mask |= TU_ACCESS_CP_WRITE;
2618
}
2619
2620
if (flags &
2621
(VK_ACCESS_HOST_READ_BIT |
2622
VK_ACCESS_MEMORY_WRITE_BIT)) {
2623
mask |= TU_ACCESS_HOST_READ;
2624
}
2625
2626
if (flags &
2627
(VK_ACCESS_HOST_WRITE_BIT |
2628
VK_ACCESS_MEMORY_WRITE_BIT)) {
2629
mask |= TU_ACCESS_HOST_WRITE;
2630
}
2631
2632
if (flags &
2633
(VK_ACCESS_INDEX_READ_BIT | /* Read performed by PC, I think */
2634
VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | /* Read performed by VFD */
2635
VK_ACCESS_UNIFORM_READ_BIT | /* Read performed by SP */
2636
/* TODO: Is there a no-cache bit for textures so that we can ignore
2637
* these?
2638
*/
2639
VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | /* Read performed by TP */
2640
VK_ACCESS_SHADER_READ_BIT | /* Read performed by SP/TP */
2641
VK_ACCESS_MEMORY_READ_BIT)) {
2642
mask |= TU_ACCESS_UCHE_READ;
2643
}
2644
2645
if (flags &
2646
(VK_ACCESS_SHADER_WRITE_BIT | /* Write performed by SP */
2647
VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | /* Write performed by VPC */
2648
VK_ACCESS_MEMORY_WRITE_BIT)) {
2649
mask |= TU_ACCESS_UCHE_WRITE;
2650
}
2651
2652
/* When using GMEM, the CCU is always flushed automatically to GMEM, and
2653
* then GMEM is flushed to sysmem. Furthermore, we already had to flush any
2654
* previous writes in sysmem mode when transitioning to GMEM. Therefore we
2655
* can ignore CCU and pretend that color attachments and transfers use
2656
* sysmem directly.
2657
*/
2658
2659
if (flags &
2660
(VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
2661
VK_ACCESS_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT |
2662
VK_ACCESS_MEMORY_READ_BIT)) {
2663
if (gmem)
2664
mask |= TU_ACCESS_SYSMEM_READ;
2665
else
2666
mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_READ;
2667
}
2668
2669
if (flags &
2670
(VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
2671
VK_ACCESS_MEMORY_READ_BIT)) {
2672
if (gmem)
2673
mask |= TU_ACCESS_SYSMEM_READ;
2674
else
2675
mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_READ;
2676
}
2677
2678
if (flags &
2679
(VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
2680
VK_ACCESS_MEMORY_WRITE_BIT)) {
2681
if (gmem) {
2682
mask |= TU_ACCESS_SYSMEM_WRITE;
2683
} else {
2684
mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
2685
}
2686
}
2687
2688
if (flags &
2689
(VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
2690
VK_ACCESS_MEMORY_WRITE_BIT)) {
2691
if (gmem) {
2692
mask |= TU_ACCESS_SYSMEM_WRITE;
2693
} else {
2694
mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
2695
}
2696
}
2697
2698
/* When the dst access is a transfer read/write, it seems we sometimes need
2699
* to insert a WFI after any flushes, to guarantee that the flushes finish
2700
* before the 2D engine starts. However the opposite (i.e. a WFI after
2701
* CP_BLIT and before any subsequent flush) does not seem to be needed, and
2702
* the blob doesn't emit such a WFI.
2703
*/
2704
2705
if (flags &
2706
(VK_ACCESS_TRANSFER_WRITE_BIT |
2707
VK_ACCESS_MEMORY_WRITE_BIT)) {
2708
if (gmem) {
2709
mask |= TU_ACCESS_SYSMEM_WRITE;
2710
} else {
2711
mask |= TU_ACCESS_CCU_COLOR_WRITE;
2712
}
2713
mask |= TU_ACCESS_WFI_READ;
2714
}
2715
2716
if (flags &
2717
(VK_ACCESS_TRANSFER_READ_BIT | /* Access performed by TP */
2718
VK_ACCESS_MEMORY_READ_BIT)) {
2719
mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_WFI_READ;
2720
}
2721
2722
return mask;
2723
}
2724
2725
2726
VKAPI_ATTR void VKAPI_CALL
2727
tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
2728
uint32_t commandBufferCount,
2729
const VkCommandBuffer *pCmdBuffers)
2730
{
2731
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2732
VkResult result;
2733
2734
assert(commandBufferCount > 0);
2735
2736
/* Emit any pending flushes. */
2737
if (cmd->state.pass) {
2738
tu_flush_all_pending(&cmd->state.renderpass_cache);
2739
tu_emit_cache_flush_renderpass(cmd, &cmd->draw_cs);
2740
} else {
2741
tu_flush_all_pending(&cmd->state.cache);
2742
tu_emit_cache_flush(cmd, &cmd->cs);
2743
}
2744
2745
for (uint32_t i = 0; i < commandBufferCount; i++) {
2746
TU_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]);
2747
2748
if (secondary->usage_flags &
2749
VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
2750
assert(tu_cs_is_empty(&secondary->cs));
2751
2752
result = tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
2753
if (result != VK_SUCCESS) {
2754
cmd->record_result = result;
2755
break;
2756
}
2757
2758
result = tu_cs_add_entries(&cmd->draw_epilogue_cs,
2759
&secondary->draw_epilogue_cs);
2760
if (result != VK_SUCCESS) {
2761
cmd->record_result = result;
2762
break;
2763
}
2764
2765
if (secondary->state.has_tess)
2766
cmd->state.has_tess = true;
2767
if (secondary->state.has_subpass_predication)
2768
cmd->state.has_subpass_predication = true;
2769
} else {
2770
assert(tu_cs_is_empty(&secondary->draw_cs));
2771
assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
2772
2773
tu_cs_add_entries(&cmd->cs, &secondary->cs);
2774
}
2775
2776
cmd->state.index_size = secondary->state.index_size; /* for restart index update */
2777
}
2778
cmd->state.dirty = ~0u; /* TODO: set dirty only what needs to be */
2779
2780
if (cmd->state.pass) {
2781
/* After a secondary command buffer is executed, LRZ is not valid
2782
* until it is cleared again.
2783
*/
2784
cmd->state.lrz.valid = false;
2785
}
2786
2787
/* After executing secondary command buffers, there may have been arbitrary
2788
* flushes executed, so when we encounter a pipeline barrier with a
2789
* srcMask, we have to assume that we need to invalidate. Therefore we need
2790
* to re-initialize the cache with all pending invalidate bits set.
2791
*/
2792
if (cmd->state.pass) {
2793
tu_cache_init(&cmd->state.renderpass_cache);
2794
} else {
2795
tu_cache_init(&cmd->state.cache);
2796
}
2797
}
2798
2799
VKAPI_ATTR VkResult VKAPI_CALL
2800
tu_CreateCommandPool(VkDevice _device,
2801
const VkCommandPoolCreateInfo *pCreateInfo,
2802
const VkAllocationCallbacks *pAllocator,
2803
VkCommandPool *pCmdPool)
2804
{
2805
TU_FROM_HANDLE(tu_device, device, _device);
2806
struct tu_cmd_pool *pool;
2807
2808
pool = vk_object_alloc(&device->vk, pAllocator, sizeof(*pool),
2809
VK_OBJECT_TYPE_COMMAND_POOL);
2810
if (pool == NULL)
2811
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
2812
2813
if (pAllocator)
2814
pool->alloc = *pAllocator;
2815
else
2816
pool->alloc = device->vk.alloc;
2817
2818
list_inithead(&pool->cmd_buffers);
2819
list_inithead(&pool->free_cmd_buffers);
2820
2821
pool->queue_family_index = pCreateInfo->queueFamilyIndex;
2822
2823
*pCmdPool = tu_cmd_pool_to_handle(pool);
2824
2825
return VK_SUCCESS;
2826
}
2827
2828
VKAPI_ATTR void VKAPI_CALL
2829
tu_DestroyCommandPool(VkDevice _device,
2830
VkCommandPool commandPool,
2831
const VkAllocationCallbacks *pAllocator)
2832
{
2833
TU_FROM_HANDLE(tu_device, device, _device);
2834
TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2835
2836
if (!pool)
2837
return;
2838
2839
list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2840
&pool->cmd_buffers, pool_link)
2841
{
2842
tu_cmd_buffer_destroy(cmd_buffer);
2843
}
2844
2845
list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2846
&pool->free_cmd_buffers, pool_link)
2847
{
2848
tu_cmd_buffer_destroy(cmd_buffer);
2849
}
2850
2851
vk_object_free(&device->vk, pAllocator, pool);
2852
}
2853
2854
VKAPI_ATTR VkResult VKAPI_CALL
2855
tu_ResetCommandPool(VkDevice device,
2856
VkCommandPool commandPool,
2857
VkCommandPoolResetFlags flags)
2858
{
2859
TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2860
VkResult result;
2861
2862
list_for_each_entry(struct tu_cmd_buffer, cmd_buffer, &pool->cmd_buffers,
2863
pool_link)
2864
{
2865
result = tu_reset_cmd_buffer(cmd_buffer);
2866
if (result != VK_SUCCESS)
2867
return result;
2868
}
2869
2870
return VK_SUCCESS;
2871
}
2872
2873
VKAPI_ATTR void VKAPI_CALL
2874
tu_TrimCommandPool(VkDevice device,
2875
VkCommandPool commandPool,
2876
VkCommandPoolTrimFlags flags)
2877
{
2878
TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
2879
2880
if (!pool)
2881
return;
2882
2883
list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
2884
&pool->free_cmd_buffers, pool_link)
2885
{
2886
tu_cmd_buffer_destroy(cmd_buffer);
2887
}
2888
}
2889
2890
static void
2891
tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
2892
const struct tu_subpass_barrier *barrier,
2893
bool external)
2894
{
2895
/* Note: we don't know until the end of the subpass whether we'll use
2896
* sysmem, so assume sysmem here to be safe.
2897
*/
2898
struct tu_cache_state *cache =
2899
external ? &cmd_buffer->state.cache : &cmd_buffer->state.renderpass_cache;
2900
enum tu_cmd_access_mask src_flags =
2901
vk2tu_access(barrier->src_access_mask, false);
2902
enum tu_cmd_access_mask dst_flags =
2903
vk2tu_access(barrier->dst_access_mask, false);
2904
2905
if (barrier->incoherent_ccu_color)
2906
src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
2907
if (barrier->incoherent_ccu_depth)
2908
src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
2909
2910
tu_flush_for_access(cache, src_flags, dst_flags);
2911
}
2912
2913
VKAPI_ATTR void VKAPI_CALL
2914
tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
2915
const VkRenderPassBeginInfo *pRenderPassBegin,
2916
const VkSubpassBeginInfo *pSubpassBeginInfo)
2917
{
2918
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2919
TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass);
2920
TU_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer);
2921
2922
cmd->state.pass = pass;
2923
cmd->state.subpass = pass->subpasses;
2924
cmd->state.framebuffer = fb;
2925
cmd->state.render_area = pRenderPassBegin->renderArea;
2926
2927
tu_cmd_prepare_tile_store_ib(cmd);
2928
2929
/* Note: because this is external, any flushes will happen before draw_cs
2930
* gets called. However deferred flushes could have to happen later as part
2931
* of the subpass.
2932
*/
2933
tu_subpass_barrier(cmd, &pass->subpasses[0].start_barrier, true);
2934
cmd->state.renderpass_cache.pending_flush_bits =
2935
cmd->state.cache.pending_flush_bits;
2936
cmd->state.renderpass_cache.flush_bits = 0;
2937
2938
/* Track LRZ valid state */
2939
uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
2940
if (a != VK_ATTACHMENT_UNUSED) {
2941
const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
2942
struct tu_image *image = fb->attachments[a].attachment->image;
2943
/* if the image has LRZ and this isn't a stencil-only clear: */
2944
if (image->lrz_height &&
2945
(att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT))) {
2946
cmd->state.lrz.image = image;
2947
cmd->state.lrz.valid = true;
2948
cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
2949
2950
tu6_clear_lrz(cmd, &cmd->cs, image, &pRenderPassBegin->pClearValues[a]);
2951
tu6_emit_event_write(cmd, &cmd->cs, PC_CCU_FLUSH_COLOR_TS);
2952
} else {
2953
cmd->state.lrz.valid = false;
2954
}
2955
cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
2956
}
2957
2958
tu_emit_renderpass_begin(cmd, pRenderPassBegin);
2959
2960
tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
2961
tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
2962
if (cmd->state.subpass->samples)
2963
tu6_emit_msaa(&cmd->draw_cs, cmd->state.subpass->samples);
2964
tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false);
2965
2966
tu_set_input_attachments(cmd, cmd->state.subpass);
2967
}
2968
2969
VKAPI_ATTR void VKAPI_CALL
2970
tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
2971
const VkSubpassBeginInfo *pSubpassBeginInfo,
2972
const VkSubpassEndInfo *pSubpassEndInfo)
2973
{
2974
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2975
const struct tu_render_pass *pass = cmd->state.pass;
2976
struct tu_cs *cs = &cmd->draw_cs;
2977
2978
const struct tu_subpass *subpass = cmd->state.subpass++;
2979
2980
/* Track LRZ valid state
2981
*
2982
* TODO: Improve this tracking to keep the state of past depth/stencil images,
* so that if they become active again, we can reuse their old state.
2984
*/
2985
cmd->state.lrz.valid = false;
2986
cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
2987
2988
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2989
2990
if (subpass->resolve_attachments) {
2991
tu6_emit_blit_scissor(cmd, cs, true);
2992
2993
for (unsigned i = 0; i < subpass->resolve_count; i++) {
2994
uint32_t a = subpass->resolve_attachments[i].attachment;
2995
if (a == VK_ATTACHMENT_UNUSED)
2996
continue;
2997
2998
uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
2999
3000
tu_store_gmem_attachment(cmd, cs, a, gmem_a);
3001
3002
if (pass->attachments[a].gmem_offset < 0)
3003
continue;
3004
3005
/* TODO:
3006
* check if the resolved attachment is needed by later subpasses,
3007
* if it is, we should be doing a GMEM->GMEM resolve instead of GMEM->MEM->GMEM.
3008
*/
3009
tu_finishme("missing GMEM->GMEM resolve path\n");
3010
tu_load_gmem_attachment(cmd, cs, a, true);
3011
}
3012
}
3013
3014
tu_cond_exec_end(cs);
3015
3016
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
3017
3018
tu6_emit_sysmem_resolves(cmd, cs, subpass);
3019
3020
tu_cond_exec_end(cs);
3021
3022
/* Handle dependencies for the next subpass */
3023
tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false);
3024
3025
/* emit mrt/zs/msaa/ubwc state for the subpass that is starting */
3026
tu6_emit_zs(cmd, cmd->state.subpass, cs);
3027
tu6_emit_mrt(cmd, cmd->state.subpass, cs);
3028
if (cmd->state.subpass->samples)
3029
tu6_emit_msaa(cs, cmd->state.subpass->samples);
3030
tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false);
3031
3032
tu_set_input_attachments(cmd, cmd->state.subpass);
3033
}
3034
3035
static uint32_t
3036
tu6_user_consts_size(const struct tu_pipeline *pipeline,
3037
struct tu_descriptor_state *descriptors_state,
3038
gl_shader_stage type)
3039
{
3040
const struct tu_program_descriptor_linkage *link =
3041
&pipeline->program.link[type];
3042
const struct ir3_ubo_analysis_state *state = &link->const_state.ubo_state;
3043
uint32_t dwords = 0;
3044
3045
if (link->push_consts.count > 0) {
3046
unsigned num_units = link->push_consts.count;
3047
dwords += 4 + num_units * 4;
3048
}
3049
3050
for (uint32_t i = 0; i < state->num_enabled; i++) {
3051
uint32_t size = state->range[i].end - state->range[i].start;
3052
3053
size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
3054
3055
if (size == 0)
3056
continue;
3057
3058
if (!state->range[i].ubo.bindless)
3059
continue;
3060
3061
uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
3062
descriptors_state->dynamic_descriptors :
3063
descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
3064
unsigned block = state->range[i].ubo.block;
3065
uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
3066
uint32_t desc_size = (desc[1] >> A6XX_UBO_1_SIZE__SHIFT) * 16;
3067
desc_size = desc_size > state->range[i].start ?
3068
desc_size - state->range[i].start : 0;
3069
3070
if (desc_size < size) {
3071
uint32_t zero_size = size - desc_size;
3072
dwords += 4 + zero_size / 4;
3073
size = desc_size;
3074
}
3075
3076
if (size > 0) {
3077
dwords += 4;
3078
}
3079
}
3080
3081
return dwords;
3082
}
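/* Accounting note: the "4 + ..." terms above mirror what
 * tu6_emit_user_consts() writes: each CP_LOAD_STATE6 packet costs a 1-dword
 * pkt7 header plus the 3 CP_LOAD_STATE6_{0,1,2} dwords, followed by any
 * inline payload, so the two functions must stay in sync for the sub-stream
 * sizing in tu6_emit_consts() to be exact.
 */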
3083
3084
static void
3085
tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
3086
struct tu_descriptor_state *descriptors_state,
3087
gl_shader_stage type,
3088
uint32_t *push_constants)
3089
{
3090
const struct tu_program_descriptor_linkage *link =
3091
&pipeline->program.link[type];
3092
const struct ir3_const_state *const_state = &link->const_state;
3093
const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
3094
3095
if (link->push_consts.count > 0) {
3096
unsigned num_units = link->push_consts.count;
3097
unsigned offset = link->push_consts.lo;
3098
tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_units * 4);
3099
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
3100
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3101
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3102
CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
3103
CP_LOAD_STATE6_0_NUM_UNIT(num_units));
3104
tu_cs_emit(cs, 0);
3105
tu_cs_emit(cs, 0);
3106
for (unsigned i = 0; i < num_units * 4; i++)
3107
tu_cs_emit(cs, push_constants[i + offset * 4]);
3108
}
3109
3110
for (uint32_t i = 0; i < state->num_enabled; i++) {
3111
uint32_t size = state->range[i].end - state->range[i].start;
3112
uint32_t offset = state->range[i].start;
3113
3114
/* and even if the start of the const buffer is before
3115
* first_immediate, the end may not be:
3116
*/
3117
size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
3118
3119
if (size == 0)
3120
continue;
3121
3122
/* things should be aligned to vec4: */
3123
debug_assert((state->range[i].offset % 16) == 0);
3124
debug_assert((size % 16) == 0);
3125
debug_assert((offset % 16) == 0);
3126
3127
/* Dig out the descriptor from the descriptor state and read the VA from
3128
* it. All our UBOs are bindless with the exception of the NIR
3129
* constant_data, which is uploaded once in the pipeline.
3130
*/
3131
if (!state->range[i].ubo.bindless) {
3132
assert(state->range[i].ubo.block == const_state->constant_data_ubo);
3133
continue;
3134
}
3135
3136
uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
3137
descriptors_state->dynamic_descriptors :
3138
descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
3139
unsigned block = state->range[i].ubo.block;
3140
uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
3141
uint64_t va = desc[0] | ((uint64_t)(desc[1] & A6XX_UBO_1_BASE_HI__MASK) << 32);
3142
uint32_t desc_size = (desc[1] >> A6XX_UBO_1_SIZE__SHIFT) * 16;
3143
desc_size = desc_size > state->range[i].start ?
3144
desc_size - state->range[i].start : 0;
3145
3146
/* Handle null UBO descriptors and out-of-range UBO reads by filling the
3147
* rest with 0, simulating what reading with ldc would do. This behavior
3148
* is required by VK_EXT_robustness2.
3149
*/
3150
if (desc_size < size) {
3151
uint32_t zero_size = size - desc_size;
3152
uint32_t zero_offset = state->range[i].offset + desc_size;
3153
tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + zero_size / 4);
3154
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(zero_offset / 16) |
3155
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3156
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3157
CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
3158
CP_LOAD_STATE6_0_NUM_UNIT(zero_size / 16));
3159
tu_cs_emit_qw(cs, 0);
3160
for (unsigned i = 0; i < zero_size / 4; i++) {
3161
tu_cs_emit(cs, 0);
3162
}
3163
size = desc_size;
3164
}
3165
3166
if (size > 0) {
3167
assert(va);
3168
tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
3169
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
3170
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3171
CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
3172
CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
3173
CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
3174
tu_cs_emit_qw(cs, va + offset);
3175
}
3176
}
3177
}
3178
3179
static struct tu_draw_state
3180
tu6_emit_consts(struct tu_cmd_buffer *cmd,
3181
const struct tu_pipeline *pipeline,
3182
struct tu_descriptor_state *descriptors_state,
3183
gl_shader_stage type)
3184
{
3185
uint32_t dwords = tu6_user_consts_size(pipeline, descriptors_state, type);
3186
if (dwords == 0)
3187
return (struct tu_draw_state) {};
3188
3189
struct tu_cs cs;
3190
tu_cs_begin_sub_stream(&cmd->sub_cs, dwords, &cs);
3191
3192
tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);
3193
3194
return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
3195
}
3196
3197
static struct tu_draw_state
3198
tu6_emit_consts_geom(struct tu_cmd_buffer *cmd,
3199
const struct tu_pipeline *pipeline,
3200
struct tu_descriptor_state *descriptors_state)
3201
{
3202
uint32_t dwords = 0;
3203
3204
for (uint32_t type = MESA_SHADER_VERTEX; type < MESA_SHADER_FRAGMENT; type++)
3205
dwords += tu6_user_consts_size(pipeline, descriptors_state, type);
3206
3207
if (dwords == 0)
3208
return (struct tu_draw_state) {};
3209
3210
struct tu_cs cs;
3211
tu_cs_begin_sub_stream(&cmd->sub_cs, dwords, &cs);
3212
3213
for (uint32_t type = MESA_SHADER_VERTEX; type < MESA_SHADER_FRAGMENT; type++)
3214
tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);
3215
3216
return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
3217
}
3218
3219
static uint64_t
3220
get_tess_param_bo_size(const struct tu_pipeline *pipeline,
3221
uint32_t draw_count)
3222
{
3223
/* TODO: For indirect draws, we can't compute the BO size ahead of time.
3224
* Still not sure what to do here, so just allocate a reasonably large
3225
* BO and hope for the best for now. */
3226
if (!draw_count)
3227
draw_count = 2048;
3228
3229
/* The tess param BO is pipeline->tess.param_stride bytes per patch,
* which includes both the per-vertex outputs and the per-patch outputs;
* build_primitive_map in ir3 calculates this stride.
*/
3233
uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
3234
uint32_t num_patches = draw_count / verts_per_patch;
3235
return num_patches * pipeline->tess.param_stride;
3236
}
3237
3238
static uint64_t
3239
get_tess_factor_bo_size(const struct tu_pipeline *pipeline,
3240
uint32_t draw_count)
3241
{
3242
/* TODO: For indirect draws, we can't compute the BO size ahead of time.
3243
* Still not sure what to do here, so just allocate a reasonably large
3244
* BO and hope for the best for now. */
3245
if (!draw_count)
3246
draw_count = 2048;
3247
3248
/* Each distinct patch gets its own tess factor output. */
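/* Illustrative example: for IR3_TESS_TRIANGLES (factor_stride = 20 below) and
 * primtype == DI_PT_PATCHES0 + 3, a direct draw of 300 vertices works out to
 * 100 patches and therefore 100 * 20 = 2000 bytes of tess factor space.
 */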
3249
uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
3250
uint32_t num_patches = draw_count / verts_per_patch;
3251
uint32_t factor_stride;
3252
switch (pipeline->tess.patch_type) {
3253
case IR3_TESS_ISOLINES:
3254
factor_stride = 12;
3255
break;
3256
case IR3_TESS_TRIANGLES:
3257
factor_stride = 20;
3258
break;
3259
case IR3_TESS_QUADS:
3260
factor_stride = 28;
3261
break;
3262
default:
3263
unreachable("bad tessmode");
3264
}
3265
return factor_stride * num_patches;
3266
}
3267
3268
static VkResult
3269
tu6_emit_tess_consts(struct tu_cmd_buffer *cmd,
3270
uint32_t draw_count,
3271
const struct tu_pipeline *pipeline,
3272
struct tu_draw_state *state,
3273
uint64_t *factor_iova)
3274
{
3275
struct tu_cs cs;
3276
VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 16, &cs);
3277
if (result != VK_SUCCESS)
3278
return result;
3279
3280
const struct tu_program_descriptor_linkage *hs_link =
3281
&pipeline->program.link[MESA_SHADER_TESS_CTRL];
3282
bool hs_uses_bo = pipeline->tess.hs_bo_regid < hs_link->constlen;
3283
3284
const struct tu_program_descriptor_linkage *ds_link =
3285
&pipeline->program.link[MESA_SHADER_TESS_EVAL];
3286
bool ds_uses_bo = pipeline->tess.ds_bo_regid < ds_link->constlen;
3287
3288
uint64_t tess_factor_size = get_tess_factor_bo_size(pipeline, draw_count);
3289
uint64_t tess_param_size = get_tess_param_bo_size(pipeline, draw_count);
3290
uint64_t tess_bo_size = tess_factor_size + tess_param_size;
3291
if ((hs_uses_bo || ds_uses_bo) && tess_bo_size > 0) {
3292
struct tu_bo *tess_bo;
3293
result = tu_get_scratch_bo(cmd->device, tess_bo_size, &tess_bo);
3294
if (result != VK_SUCCESS)
3295
return result;
3296
3297
uint64_t tess_factor_iova = tess_bo->iova;
3298
uint64_t tess_param_iova = tess_factor_iova + tess_factor_size;
3299
3300
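/* Each stage that addresses the BO gets a single vec4 of constants: the
 * 64-bit tess param BO address followed by the 64-bit tess factor BO address.
 */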
if (hs_uses_bo) {
3301
tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
3302
tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.hs_bo_regid) |
3303
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3304
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3305
CP_LOAD_STATE6_0_STATE_BLOCK(SB6_HS_SHADER) |
3306
CP_LOAD_STATE6_0_NUM_UNIT(1));
3307
tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
3308
tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
3309
tu_cs_emit_qw(&cs, tess_param_iova);
3310
tu_cs_emit_qw(&cs, tess_factor_iova);
3311
}
3312
3313
if (ds_uses_bo) {
3314
tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
3315
tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.ds_bo_regid) |
3316
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3317
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3318
CP_LOAD_STATE6_0_STATE_BLOCK(SB6_DS_SHADER) |
3319
CP_LOAD_STATE6_0_NUM_UNIT(1));
3320
tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
3321
tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
3322
tu_cs_emit_qw(&cs, tess_param_iova);
3323
tu_cs_emit_qw(&cs, tess_factor_iova);
3324
}
3325
3326
*factor_iova = tess_factor_iova;
3327
}
3328
*state = tu_cs_end_draw_state(&cmd->sub_cs, &cs);
3329
return VK_SUCCESS;
3330
}
3331
3332
static enum tu_lrz_direction
3333
tu6_lrz_depth_mode(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
3334
VkCompareOp depthCompareOp,
3335
bool *invalidate_lrz)
3336
{
3337
enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
3338
3339
/* LRZ does not support some depth modes. */
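/* Roughly: LRZ caches a conservative per-block depth bound that is only
 * meaningful while the compare op culls in one consistent direction. With
 * ALWAYS or NOT_EQUAL a passing fragment may move depth either way, so the
 * cached bound can go stale and has to be invalidated.
 */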
3340
switch (depthCompareOp) {
3341
case VK_COMPARE_OP_ALWAYS:
3342
case VK_COMPARE_OP_NOT_EQUAL:
3343
*invalidate_lrz = true;
3344
gras_lrz_cntl->lrz_write = false;
3345
break;
3346
case VK_COMPARE_OP_EQUAL:
3347
case VK_COMPARE_OP_NEVER:
3348
gras_lrz_cntl->lrz_write = false;
3349
break;
3350
case VK_COMPARE_OP_GREATER:
3351
case VK_COMPARE_OP_GREATER_OR_EQUAL:
3352
lrz_direction = TU_LRZ_GREATER;
3353
gras_lrz_cntl->greater = true;
3354
break;
3355
case VK_COMPARE_OP_LESS:
3356
case VK_COMPARE_OP_LESS_OR_EQUAL:
3357
lrz_direction = TU_LRZ_LESS;
3358
break;
3359
default:
3360
unreachable("bad VK_COMPARE_OP value or uninitialized");
3361
break;
3362
}
3363
3364
return lrz_direction;
3365
}
3366
3367
/* update lrz state based on stencil-test func:
3368
*
3369
* Conceptually the order of the pipeline is:
3370
*
3371
*
3372
* FS -> Alpha-Test -> Stencil-Test -> Depth-Test
3373
*                         |                |
3374
*                 if wrmask != 0     if wrmask != 0
3375
*                         |                |
3376
*                         v                v
3377
*                   Stencil-Write      Depth-Write
3378
*
3379
* Because Stencil-Test can have side effects (Stencil-Write) prior to the
3380
* depth test, we potentially need to disable the early lrz-test in this
3381
* case. See:
3382
*
3383
* https://www.khronos.org/opengl/wiki/Per-Sample_Processing
3384
*/
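/* Example (illustrative): a fragment that fails the depth test must still
 * have its stencil depth-fail op applied when the write mask is nonzero, so
 * it cannot be culled by the early LRZ test; this is why tu6_lrz_stencil_op()
 * disables LRZ and invalidates it whenever stencil writes are possible.
 */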
3385
static void
3386
tu6_lrz_stencil_op(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
3387
VkCompareOp func,
3388
bool stencil_write,
3389
bool *invalidate_lrz)
3390
{
3391
switch (func) {
3392
case VK_COMPARE_OP_ALWAYS:
3393
/* nothing to do for LRZ, but for stencil test when stencil-
3394
* write is enabled, we need to disable lrz-test, since
3395
* conceptually stencil test and write happen before depth-test.
3396
*/
3397
if (stencil_write) {
3398
gras_lrz_cntl->enable = false;
3399
gras_lrz_cntl->z_test_enable = false;
3400
*invalidate_lrz = true;
3401
}
3402
break;
3403
case VK_COMPARE_OP_NEVER:
3404
/* fragment never passes, disable lrz_write for this draw. */
3405
gras_lrz_cntl->lrz_write = false;
3406
break;
3407
default:
3408
/* whether the fragment passes or not depends on result
3409
* of stencil test, which we cannot know when doing binning
3410
* pass.
3411
*/
3412
gras_lrz_cntl->lrz_write = false;
3413
/* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side-
3414
* effects from stencil test we need to disable lrz-test.
3415
*/
3416
if (stencil_write) {
3417
gras_lrz_cntl->enable = false;
3418
gras_lrz_cntl->z_test_enable = false;
3419
*invalidate_lrz = true;
3420
}
3421
break;
3422
}
3423
}
3424
3425
static struct A6XX_GRAS_LRZ_CNTL
3426
tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
3427
const uint32_t a)
3428
{
3429
struct tu_pipeline *pipeline = cmd->state.pipeline;
3430
struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };
3431
bool invalidate_lrz = pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_LRZ;
3432
bool force_disable_write = pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_WRITE;
3433
enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
3434
3435
gras_lrz_cntl.enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_ENABLE;
3436
gras_lrz_cntl.lrz_write = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
3437
gras_lrz_cntl.z_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
3438
gras_lrz_cntl.z_bounds_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;
3439
3440
VkCompareOp depth_compare_op = (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT;
3441
lrz_direction = tu6_lrz_depth_mode(&gras_lrz_cntl, depth_compare_op, &invalidate_lrz);
3442
3443
/* LRZ doesn't transition properly between GREATER* and LESS* depth compare ops */
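/* e.g. if the app switches from VK_COMPARE_OP_GREATER to VK_COMPARE_OP_LESS,
 * the cached LRZ values bound depth from the wrong side for the new
 * direction, so the buffer is invalidated below.
 */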
3444
if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
3445
lrz_direction != TU_LRZ_UNKNOWN &&
3446
cmd->state.lrz.prev_direction != lrz_direction) {
3447
invalidate_lrz = true;
3448
}
3449
3450
cmd->state.lrz.prev_direction = lrz_direction;
3451
3452
/* Invalidate LRZ and disable write if stencil test is enabled */
3453
bool stencil_test_enable = cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE;
3454
if (stencil_test_enable) {
3455
bool stencil_front_writemask =
3456
(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
3457
(cmd->state.dynamic_stencil_wrmask & 0xff) :
3458
(pipeline->stencil_wrmask & 0xff);
3459
3460
bool stencil_back_writemask =
3461
(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
3462
((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) :
3463
(pipeline->stencil_wrmask & 0xff00) >> 8;
3464
3465
VkCompareOp stencil_front_compare_op =
3466
(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC__SHIFT;
3467
3468
VkCompareOp stencil_back_compare_op =
3469
(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC_BF__SHIFT;
3470
3471
tu6_lrz_stencil_op(&gras_lrz_cntl, stencil_front_compare_op,
3472
stencil_front_writemask, &invalidate_lrz);
3473
3474
tu6_lrz_stencil_op(&gras_lrz_cntl, stencil_back_compare_op,
3475
stencil_back_writemask, &invalidate_lrz);
3476
}
3477
3478
if (force_disable_write)
3479
gras_lrz_cntl.lrz_write = false;
3480
3481
if (invalidate_lrz) {
3482
cmd->state.lrz.valid = false;
3483
}
3484
3485
/* If there is no depth attachment, or the LRZ state is invalid, clear the gras_lrz_cntl register */
3486
if (a == VK_ATTACHMENT_UNUSED || !cmd->state.lrz.valid)
3487
memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));
3488
3489
return gras_lrz_cntl;
3490
}
3491
3492
static struct tu_draw_state
3493
tu6_build_lrz(struct tu_cmd_buffer *cmd)
3494
{
3495
const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
3496
struct tu_cs lrz_cs;
3497
struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &lrz_cs, 4);
3498
3499
struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state(cmd, a);
3500
3501
tu_cs_emit_regs(&lrz_cs, A6XX_GRAS_LRZ_CNTL(
3502
.enable = gras_lrz_cntl.enable,
3503
.greater = gras_lrz_cntl.greater,
3504
.lrz_write = gras_lrz_cntl.lrz_write,
3505
.z_test_enable = gras_lrz_cntl.z_test_enable,
3506
.z_bounds_enable = gras_lrz_cntl.z_bounds_enable));
3507
tu_cs_emit_regs(&lrz_cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
3508
3509
return ds;
3510
}
3511
3512
static bool
3513
tu6_writes_depth(struct tu_cmd_buffer *cmd, bool depth_test_enable)
3514
{
3515
bool depth_write_enable =
3516
cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
3517
3518
VkCompareOp depth_compare_op =
3519
(cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT;
3520
3521
bool depth_compare_op_writes = depth_compare_op != VK_COMPARE_OP_NEVER;
3522
3523
return depth_test_enable && depth_write_enable && depth_compare_op_writes;
3524
}
3525
3526
static bool
3527
tu6_writes_stencil(struct tu_cmd_buffer *cmd)
3528
{
3529
bool stencil_test_enable =
3530
cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE;
3531
3532
bool stencil_front_writemask =
3533
(cmd->state.pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
3534
(cmd->state.dynamic_stencil_wrmask & 0xff) :
3535
(cmd->state.pipeline->stencil_wrmask & 0xff);
3536
3537
bool stencil_back_writemask =
3538
(cmd->state.pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
3539
((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) :
3540
(cmd->state.pipeline->stencil_wrmask & 0xff00) >> 8;
3541
3542
VkStencilOp front_fail_op =
3543
(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FAIL__MASK) >> A6XX_RB_STENCIL_CONTROL_FAIL__SHIFT;
3544
VkStencilOp front_pass_op =
3545
(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZPASS__MASK) >> A6XX_RB_STENCIL_CONTROL_ZPASS__SHIFT;
3546
VkStencilOp front_depth_fail_op =
3547
(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK) >> A6XX_RB_STENCIL_CONTROL_ZFAIL__SHIFT;
3548
VkStencilOp back_fail_op =
3549
(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FAIL_BF__SHIFT;
3550
VkStencilOp back_pass_op =
3551
(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_ZPASS_BF__SHIFT;
3552
VkStencilOp back_depth_fail_op =
3553
(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__SHIFT;
3554
3555
bool stencil_front_op_writes =
3556
front_pass_op != VK_STENCIL_OP_KEEP &&
3557
front_fail_op != VK_STENCIL_OP_KEEP &&
3558
front_depth_fail_op != VK_STENCIL_OP_KEEP;
3559
3560
bool stencil_back_op_writes =
3561
back_pass_op != VK_STENCIL_OP_KEEP &&
3562
back_fail_op != VK_STENCIL_OP_KEEP &&
3563
back_depth_fail_op != VK_STENCIL_OP_KEEP;
3564
3565
return stencil_test_enable &&
3566
((stencil_front_writemask && stencil_front_op_writes) ||
3567
(stencil_back_writemask && stencil_back_op_writes));
3568
}
3569
3570
static struct tu_draw_state
3571
tu6_build_depth_plane_z_mode(struct tu_cmd_buffer *cmd)
3572
{
3573
struct tu_cs cs;
3574
struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &cs, 4);
3575
3576
enum a6xx_ztest_mode zmode = A6XX_EARLY_Z;
3577
bool depth_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_ENABLE;
3578
bool depth_write = tu6_writes_depth(cmd, depth_test_enable);
3579
bool stencil_write = tu6_writes_stencil(cmd);
3580
3581
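/* If the fragment shader can kill fragments and depth or stencil writes are
 * enabled, a plain early-Z pass would commit those writes before the discard
 * is known; EARLY_LRZ_LATE_Z keeps the coarse LRZ test early while deferring
 * the per-sample Z test, and LATE_Z is used once the LRZ data is invalid.
 */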
if (cmd->state.pipeline->lrz.fs_has_kill &&
3582
(depth_write || stencil_write)) {
3583
zmode = cmd->state.lrz.valid ? A6XX_EARLY_LRZ_LATE_Z : A6XX_LATE_Z;
3584
}
3585
3586
if (cmd->state.pipeline->lrz.force_late_z || !depth_test_enable)
3587
zmode = A6XX_LATE_Z;
3588
3589
/* User-defined early fragment tests take precedence over everything else */
3590
if (cmd->state.pipeline->lrz.early_fragment_tests)
3591
zmode = A6XX_EARLY_Z;
3592
3593
tu_cs_emit_pkt4(&cs, REG_A6XX_GRAS_SU_DEPTH_PLANE_CNTL, 1);
3594
tu_cs_emit(&cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL_Z_MODE(zmode));
3595
3596
tu_cs_emit_pkt4(&cs, REG_A6XX_RB_DEPTH_PLANE_CNTL, 1);
3597
tu_cs_emit(&cs, A6XX_RB_DEPTH_PLANE_CNTL_Z_MODE(zmode));
3598
return ds;
3599
}
3600
3601
static VkResult
3602
tu6_draw_common(struct tu_cmd_buffer *cmd,
3603
struct tu_cs *cs,
3604
bool indexed,
3605
/* note: draw_count is 0 for indirect */
3606
uint32_t draw_count)
3607
{
3608
const struct tu_pipeline *pipeline = cmd->state.pipeline;
3609
VkResult result;
3610
3611
tu_emit_cache_flush_renderpass(cmd, cs);
3612
3613
tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0(
3614
.primitive_restart =
3615
pipeline->ia.primitive_restart && indexed,
3616
.provoking_vtx_last = pipeline->provoking_vertex_last,
3617
.tess_upper_left_domain_origin =
3618
pipeline->tess.upper_left_domain_origin));
3619
3620
bool has_tess =
3621
pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
3622
3623
/* Early exit if there is nothing to emit, saves CPU cycles */
3624
if (!(cmd->state.dirty & ~TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD) &&
3625
!has_tess)
3626
return VK_SUCCESS;
3627
3628
bool dirty_lrz = cmd->state.dirty & (TU_CMD_DIRTY_LRZ | TU_CMD_DIRTY_RB_DEPTH_CNTL | TU_CMD_DIRTY_RB_STENCIL_CNTL);
3629
3630
struct tu_descriptor_state *descriptors_state =
3631
&cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];
3632
3633
if (dirty_lrz) {
3634
cmd->state.lrz.state = tu6_build_lrz(cmd);
3635
cmd->state.depth_plane_state = tu6_build_depth_plane_z_mode(cmd);
3636
}
3637
3638
if (cmd->state.dirty & TU_CMD_DIRTY_GRAS_SU_CNTL) {
3639
struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2);
3640
tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = cmd->state.gras_su_cntl));
3641
}
3642
3643
if (cmd->state.dirty & TU_CMD_DIRTY_RB_DEPTH_CNTL) {
3644
struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2);
3645
uint32_t rb_depth_cntl = cmd->state.rb_depth_cntl;
3646
3647
if ((rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_ENABLE) ||
3648
(rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE))
3649
rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
3650
3651
if (pipeline->rb_depth_cntl_disable)
3652
rb_depth_cntl = 0;
3653
3654
tu_cs_emit_regs(&cs, A6XX_RB_DEPTH_CNTL(.dword = rb_depth_cntl));
3655
}
3656
3657
if (cmd->state.dirty & TU_CMD_DIRTY_RB_STENCIL_CNTL) {
3658
struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2);
3659
tu_cs_emit_regs(&cs, A6XX_RB_STENCIL_CONTROL(.dword = cmd->state.rb_stencil_cntl));
3660
}
3661
3662
if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
3663
cmd->state.shader_const[0] =
3664
tu6_emit_consts_geom(cmd, pipeline, descriptors_state);
3665
cmd->state.shader_const[1] =
3666
tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_FRAGMENT);
3667
}
3668
3669
struct tu_draw_state tess_consts = {};
3670
if (has_tess) {
3671
uint64_t tess_factor_iova = 0;
3672
3673
cmd->state.has_tess = true;
3674
result = tu6_emit_tess_consts(cmd, draw_count, pipeline, &tess_consts, &tess_factor_iova);
3675
if (result != VK_SUCCESS)
3676
return result;
3677
3678
/* this sequence matches what the blob does before every tess draw.
3679
* PC_TESSFACTOR_ADDR_LO is a non-context register and needs a wfi
3680
* before writing to it
3681
*/
3682
tu_cs_emit_wfi(cs);
3683
3684
tu_cs_emit_regs(cs, A6XX_PC_TESSFACTOR_ADDR(.qword = tess_factor_iova));
3685
3686
tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
3687
tu_cs_emit(cs, draw_count);
3688
}
3689
3690
/* for the first draw in a renderpass, re-emit all the draw states
3691
*
3692
* and if a draw-state disabling path (CmdClearAttachments 3D fallback) was
3693
* used, then draw states must be re-emitted. note however this only happens
3694
* in the sysmem path, so this can be skipped for the gmem path (TODO)
3695
*
3696
* the two input attachment states are excluded because secondary command
3697
* buffers don't have a state ib to restore them, and not re-emitting them
3698
* is OK since CmdClearAttachments won't disable/overwrite them
3699
*/
3700
if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE) {
3701
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
3702
3703
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
3704
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
3705
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
3706
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_TESS, tess_consts);
3707
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI, pipeline->vi.state);
3708
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state);
3709
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state);
3710
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_BLEND, pipeline->blend_state);
3711
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_SHADER_GEOM_CONST, cmd->state.shader_const[0]);
3712
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const[1]);
3713
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets);
3714
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, pipeline->load_state);
3715
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers);
3716
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
3717
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_LRZ, cmd->state.lrz.state);
3718
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DEPTH_PLANE, cmd->state.depth_plane_state);
3719
3720
for (uint32_t i = 0; i < ARRAY_SIZE(cmd->state.dynamic_state); i++) {
3721
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i,
3722
((pipeline->dynamic_state_mask & BIT(i)) ?
3723
cmd->state.dynamic_state[i] :
3724
pipeline->dynamic_state[i]));
3725
}
3726
} else {
3727
/* emit draw states that were just updated
3728
* note we eventually don't want to have to emit anything here
3729
*/
3730
bool emit_binding_stride = false;
3731
uint32_t draw_state_count =
3732
has_tess +
3733
((cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 2 : 0) +
3734
((cmd->state.dirty & TU_CMD_DIRTY_DESC_SETS_LOAD) ? 1 : 0) +
3735
((cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) +
3736
((cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS) ? 1 : 0) +
3737
(dirty_lrz ? 2 : 0);
3738
3739
if ((cmd->state.dirty & TU_CMD_DIRTY_VB_STRIDE) &&
3740
(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE))) {
3741
emit_binding_stride = true;
3742
draw_state_count += 1;
3743
}
3744
3745
if (draw_state_count > 0)
3746
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_count);
3747
3748
/* We may need to re-emit tess consts if the current draw call is
3749
* sufficiently larger than the last draw call. */
3750
if (has_tess)
3751
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_TESS, tess_consts);
3752
if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
3753
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_SHADER_GEOM_CONST, cmd->state.shader_const[0]);
3754
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const[1]);
3755
}
3756
if (cmd->state.dirty & TU_CMD_DIRTY_DESC_SETS_LOAD)
3757
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, pipeline->load_state);
3758
if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
3759
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers);
3760
if (emit_binding_stride) {
3761
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_VB_STRIDE,
3762
cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE]);
3763
}
3764
if (cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS)
3765
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
3766
3767
if (dirty_lrz) {
3768
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_LRZ, cmd->state.lrz.state);
3769
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DEPTH_PLANE, cmd->state.depth_plane_state);
3770
}
3771
}
3772
3773
tu_cs_sanity_check(cs);
3774
3775
/* There are too many graphics dirty bits to list here, so just list the
3776
* bits to preserve instead. The only thing not emitted here is
3777
* compute-related state.
3778
*/
3779
cmd->state.dirty &= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;
3780
return VK_SUCCESS;
3781
}
3782
3783
static uint32_t
3784
tu_draw_initiator(struct tu_cmd_buffer *cmd, enum pc_di_src_sel src_sel)
3785
{
3786
const struct tu_pipeline *pipeline = cmd->state.pipeline;
3787
enum pc_di_primtype primtype = pipeline->ia.primtype;
3788
3789
if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY))
3790
primtype = cmd->state.primtype;
3791
3792
uint32_t initiator =
3793
CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
3794
CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(src_sel) |
3795
CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(cmd->state.index_size) |
3796
CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY);
3797
3798
if (pipeline->active_stages & VK_SHADER_STAGE_GEOMETRY_BIT)
3799
initiator |= CP_DRAW_INDX_OFFSET_0_GS_ENABLE;
3800
3801
switch (pipeline->tess.patch_type) {
3802
case IR3_TESS_TRIANGLES:
3803
initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_TRIANGLES) |
3804
CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
3805
break;
3806
case IR3_TESS_ISOLINES:
3807
initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_ISOLINES) |
3808
CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
3809
break;
3810
case IR3_TESS_NONE:
3811
initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS);
3812
break;
3813
case IR3_TESS_QUADS:
3814
initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS) |
3815
CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
3816
break;
3817
}
3818
return initiator;
3819
}
3820
3821
3822
static uint32_t
3823
vs_params_offset(struct tu_cmd_buffer *cmd)
3824
{
3825
const struct tu_program_descriptor_linkage *link =
3826
&cmd->state.pipeline->program.link[MESA_SHADER_VERTEX];
3827
const struct ir3_const_state *const_state = &link->const_state;
3828
3829
if (const_state->offsets.driver_param >= link->constlen)
3830
return 0;
3831
3832
/* this layout is required by CP_DRAW_INDIRECT_MULTI */
3833
STATIC_ASSERT(IR3_DP_DRAWID == 0);
3834
STATIC_ASSERT(IR3_DP_VTXID_BASE == 1);
3835
STATIC_ASSERT(IR3_DP_INSTID_BASE == 2);
3836
3837
/* 0 means disabled for CP_DRAW_INDIRECT_MULTI */
3838
assert(const_state->offsets.driver_param != 0);
3839
3840
return const_state->offsets.driver_param;
3841
}
3842
3843
static void
3844
tu6_emit_empty_vs_params(struct tu_cmd_buffer *cmd)
3845
{
3846
if (cmd->state.vs_params.iova) {
3847
cmd->state.vs_params = (struct tu_draw_state) {};
3848
cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS;
3849
}
3850
}
3851
3852
static void
3853
tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
3854
uint32_t vertex_offset,
3855
uint32_t first_instance)
3856
{
3857
uint32_t offset = vs_params_offset(cmd);
3858
3859
if (offset == cmd->state.last_vs_params.params_offset &&
3860
vertex_offset == cmd->state.last_vs_params.vertex_offset &&
3861
first_instance == cmd->state.last_vs_params.first_instance) {
3862
return;
3863
}
3864
3865
struct tu_cs cs;
3866
VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 3 + (offset ? 8 : 0), &cs);
3867
if (result != VK_SUCCESS) {
3868
cmd->record_result = result;
3869
return;
3870
}
3871
3872
tu_cs_emit_regs(&cs,
3873
A6XX_VFD_INDEX_OFFSET(vertex_offset),
3874
A6XX_VFD_INSTANCE_START_OFFSET(first_instance));
3875
3876
if (offset) {
3877
tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
3878
tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
3879
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3880
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3881
CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
3882
CP_LOAD_STATE6_0_NUM_UNIT(1));
3883
tu_cs_emit(&cs, 0);
3884
tu_cs_emit(&cs, 0);
3885
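/* Payload follows the driver-param layout asserted in vs_params_offset():
 * IR3_DP_DRAWID (0 for direct draws), IR3_DP_VTXID_BASE, IR3_DP_INSTID_BASE,
 * plus one dword of padding to fill out the vec4.
 */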
3886
tu_cs_emit(&cs, 0);
3887
tu_cs_emit(&cs, vertex_offset);
3888
tu_cs_emit(&cs, first_instance);
3889
tu_cs_emit(&cs, 0);
3890
}
3891
3892
cmd->state.last_vs_params.params_offset = offset;
3893
cmd->state.last_vs_params.vertex_offset = vertex_offset;
3894
cmd->state.last_vs_params.first_instance = first_instance;
3895
3896
struct tu_cs_entry entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
3897
cmd->state.vs_params = (struct tu_draw_state) {entry.bo->iova + entry.offset, entry.size / 4};
3898
3899
cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS;
3900
}
3901
3902
VKAPI_ATTR void VKAPI_CALL
3903
tu_CmdDraw(VkCommandBuffer commandBuffer,
3904
uint32_t vertexCount,
3905
uint32_t instanceCount,
3906
uint32_t firstVertex,
3907
uint32_t firstInstance)
3908
{
3909
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3910
struct tu_cs *cs = &cmd->draw_cs;
3911
3912
tu6_emit_vs_params(cmd, firstVertex, firstInstance);
3913
3914
tu6_draw_common(cmd, cs, false, vertexCount);
3915
3916
tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
3917
tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
3918
tu_cs_emit(cs, instanceCount);
3919
tu_cs_emit(cs, vertexCount);
3920
}
3921
3922
VKAPI_ATTR void VKAPI_CALL
3923
tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
3924
uint32_t indexCount,
3925
uint32_t instanceCount,
3926
uint32_t firstIndex,
3927
int32_t vertexOffset,
3928
uint32_t firstInstance)
3929
{
3930
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3931
struct tu_cs *cs = &cmd->draw_cs;
3932
3933
tu6_emit_vs_params(cmd, vertexOffset, firstInstance);
3934
3935
tu6_draw_common(cmd, cs, true, indexCount);
3936
3937
tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);
3938
tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
3939
tu_cs_emit(cs, instanceCount);
3940
tu_cs_emit(cs, indexCount);
3941
tu_cs_emit(cs, firstIndex);
3942
tu_cs_emit_qw(cs, cmd->state.index_va);
3943
tu_cs_emit(cs, cmd->state.max_index_count);
3944
}
3945
3946
/* Various firmware bugs/inconsistencies mean that some indirect draw opcodes
3947
* do not wait for WFI's to complete before executing. Add a WAIT_FOR_ME if
3948
* one is pending for these opcodes. This may result in a few extra WAIT_FOR_ME's
3949
* with these opcodes, but the alternative would add unnecessary WAIT_FOR_ME's
3950
* before draw opcodes that don't need it.
3951
*/
3952
static void
3953
draw_wfm(struct tu_cmd_buffer *cmd)
3954
{
3955
cmd->state.renderpass_cache.flush_bits |=
3956
cmd->state.renderpass_cache.pending_flush_bits & TU_CMD_FLAG_WAIT_FOR_ME;
3957
cmd->state.renderpass_cache.pending_flush_bits &= ~TU_CMD_FLAG_WAIT_FOR_ME;
3958
}
3959
3960
VKAPI_ATTR void VKAPI_CALL
3961
tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
3962
VkBuffer _buffer,
3963
VkDeviceSize offset,
3964
uint32_t drawCount,
3965
uint32_t stride)
3966
{
3967
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3968
TU_FROM_HANDLE(tu_buffer, buf, _buffer);
3969
struct tu_cs *cs = &cmd->draw_cs;
3970
3971
tu6_emit_empty_vs_params(cmd);
3972
3973
if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk)
3974
draw_wfm(cmd);
3975
3976
tu6_draw_common(cmd, cs, false, 0);
3977
3978
tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 6);
3979
tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
3980
tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_NORMAL) |
3981
A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
3982
tu_cs_emit(cs, drawCount);
3983
tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
3984
tu_cs_emit(cs, stride);
3985
}
3986
3987
VKAPI_ATTR void VKAPI_CALL
3988
tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
3989
VkBuffer _buffer,
3990
VkDeviceSize offset,
3991
uint32_t drawCount,
3992
uint32_t stride)
3993
{
3994
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3995
TU_FROM_HANDLE(tu_buffer, buf, _buffer);
3996
struct tu_cs *cs = &cmd->draw_cs;
3997
3998
tu6_emit_empty_vs_params(cmd);
3999
4000
if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk)
4001
draw_wfm(cmd);
4002
4003
tu6_draw_common(cmd, cs, true, 0);
4004
4005
tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 9);
4006
tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
4007
tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDEXED) |
4008
A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
4009
tu_cs_emit(cs, drawCount);
4010
tu_cs_emit_qw(cs, cmd->state.index_va);
4011
tu_cs_emit(cs, cmd->state.max_index_count);
4012
tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
4013
tu_cs_emit(cs, stride);
4014
}
4015
4016
VKAPI_ATTR void VKAPI_CALL
4017
tu_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,
4018
VkBuffer _buffer,
4019
VkDeviceSize offset,
4020
VkBuffer countBuffer,
4021
VkDeviceSize countBufferOffset,
4022
uint32_t drawCount,
4023
uint32_t stride)
4024
{
4025
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4026
TU_FROM_HANDLE(tu_buffer, buf, _buffer);
4027
TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer);
4028
struct tu_cs *cs = &cmd->draw_cs;
4029
4030
tu6_emit_empty_vs_params(cmd);
4031
4032
/* It turns out that the firmware we have for a650 only partially fixed the
4033
* problem with CP_DRAW_INDIRECT_MULTI not waiting for WFI's to complete
4034
* before reading indirect parameters. It waits for WFI's before reading
4035
* the draw parameters, but after reading the indirect count :(.
4036
*/
4037
draw_wfm(cmd);
4038
4039
tu6_draw_common(cmd, cs, false, 0);
4040
4041
tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 8);
4042
tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
4043
tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT) |
4044
A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
4045
tu_cs_emit(cs, drawCount);
4046
tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
4047
tu_cs_emit_qw(cs, count_buf->bo->iova + count_buf->bo_offset + countBufferOffset);
4048
tu_cs_emit(cs, stride);
4049
}
4050
4051
VKAPI_ATTR void VKAPI_CALL
4052
tu_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,
4053
VkBuffer _buffer,
4054
VkDeviceSize offset,
4055
VkBuffer countBuffer,
4056
VkDeviceSize countBufferOffset,
4057
uint32_t drawCount,
4058
uint32_t stride)
4059
{
4060
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4061
TU_FROM_HANDLE(tu_buffer, buf, _buffer);
4062
TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer);
4063
struct tu_cs *cs = &cmd->draw_cs;
4064
4065
tu6_emit_empty_vs_params(cmd);
4066
4067
draw_wfm(cmd);
4068
4069
tu6_draw_common(cmd, cs, true, 0);
4070
4071
tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 11);
4072
tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
4073
tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT_INDEXED) |
4074
A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
4075
tu_cs_emit(cs, drawCount);
4076
tu_cs_emit_qw(cs, cmd->state.index_va);
4077
tu_cs_emit(cs, cmd->state.max_index_count);
4078
tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
4079
tu_cs_emit_qw(cs, count_buf->bo->iova + count_buf->bo_offset + countBufferOffset);
4080
tu_cs_emit(cs, stride);
4081
}
4082
4083
VKAPI_ATTR void VKAPI_CALL
4084
tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
4085
uint32_t instanceCount,
4086
uint32_t firstInstance,
4087
VkBuffer _counterBuffer,
4088
VkDeviceSize counterBufferOffset,
4089
uint32_t counterOffset,
4090
uint32_t vertexStride)
4091
{
4092
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4093
TU_FROM_HANDLE(tu_buffer, buf, _counterBuffer);
4094
struct tu_cs *cs = &cmd->draw_cs;
4095
4096
/* All known firmware versions do not wait for WFI's with CP_DRAW_AUTO.
4097
* Plus, for the common case where the counter buffer is written by
4098
* vkCmdEndTransformFeedback, we need to wait for the CP_WAIT_MEM_WRITES to
4099
* complete, which means we need a WAIT_FOR_ME anyway.
4100
*/
4101
draw_wfm(cmd);
4102
4103
tu6_emit_vs_params(cmd, 0, firstInstance);
4104
4105
tu6_draw_common(cmd, cs, false, 0);
4106
4107
tu_cs_emit_pkt7(cs, CP_DRAW_AUTO, 6);
4108
tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_XFB));
4109
tu_cs_emit(cs, instanceCount);
4110
tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + counterBufferOffset);
4111
tu_cs_emit(cs, counterOffset);
4112
tu_cs_emit(cs, vertexStride);
4113
}
4114
4115
struct tu_dispatch_info
4116
{
4117
/**
4118
* Determine the layout of the grid (in block units) to be used.
4119
*/
4120
uint32_t blocks[3];
4121
4122
/**
4123
* A starting offset for the grid. If unaligned is set, the offset
4124
* must still be aligned.
4125
*/
4126
uint32_t offsets[3];
4127
/**
4128
* Whether it's an unaligned compute dispatch.
4129
*/
4130
bool unaligned;
4131
4132
/**
4133
* Indirect compute parameters resource.
4134
*/
4135
struct tu_buffer *indirect;
4136
uint64_t indirect_offset;
4137
};
4138
4139
static void
4140
tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
4141
struct tu_cs *cs, struct tu_pipeline *pipeline,
4142
const struct tu_dispatch_info *info)
4143
{
4144
gl_shader_stage type = MESA_SHADER_COMPUTE;
4145
const struct tu_program_descriptor_linkage *link =
4146
&pipeline->program.link[type];
4147
const struct ir3_const_state *const_state = &link->const_state;
4148
uint32_t offset = const_state->offsets.driver_param;
4149
unsigned subgroup_size = pipeline->compute.subgroup_size;
4150
unsigned subgroup_shift = util_logbase2(subgroup_size);
4151
4152
if (link->constlen <= offset)
4153
return;
4154
4155
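/* constlen and the driver_param offset are in vec4 units, hence the * 4 to
 * convert the remaining space to dwords; num_driver_params is already a
 * dword count.
 */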
uint32_t num_consts = MIN2(const_state->num_driver_params,
4156
(link->constlen - offset) * 4);
4157
4158
if (!info->indirect) {
4159
uint32_t driver_params[12] = {
4160
[IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
4161
[IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
4162
[IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
4163
[IR3_DP_BASE_GROUP_X] = info->offsets[0],
4164
[IR3_DP_BASE_GROUP_Y] = info->offsets[1],
4165
[IR3_DP_BASE_GROUP_Z] = info->offsets[2],
4166
[IR3_DP_SUBGROUP_SIZE] = subgroup_size,
4167
[IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift,
4168
};
4169
4170
assert(num_consts <= ARRAY_SIZE(driver_params));
4171
4172
/* push constants */
4173
tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);
4174
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
4175
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
4176
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
4177
CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
4178
CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));
4179
tu_cs_emit(cs, 0);
4180
tu_cs_emit(cs, 0);
4181
uint32_t i;
4182
for (i = 0; i < num_consts; i++)
4183
tu_cs_emit(cs, driver_params[i]);
4184
} else if (!(info->indirect_offset & 0xf)) {
4185
tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
4186
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
4187
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
4188
CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
4189
CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
4190
CP_LOAD_STATE6_0_NUM_UNIT(1));
4191
tu_cs_emit_qw(cs, tu_buffer_iova(info->indirect) + info->indirect_offset);
4192
} else {
4193
/* Vulkan guarantees only 4 byte alignment for indirect_offset.
4194
* However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment.
4195
*/
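/* Workaround: copy the three dispatch-size dwords into the 16-byte-aligned
 * cs_indirect_xyz scratch in the global BO, wait for those CP writes to land
 * and invalidate the cache, then do the indirect CP_LOAD_STATE from the
 * aligned copy.
 */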
4196
4197
uint64_t indirect_iova = tu_buffer_iova(info->indirect) + info->indirect_offset;
4198
4199
for (uint32_t i = 0; i < 3; i++) {
4200
tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
4201
tu_cs_emit(cs, 0);
4202
tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[i]));
4203
tu_cs_emit_qw(cs, indirect_iova + i * 4);
4204
}
4205
4206
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
4207
tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
4208
4209
tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
4210
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
4211
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
4212
CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
4213
CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
4214
CP_LOAD_STATE6_0_NUM_UNIT(1));
4215
tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[0]));
4216
}
4217
4218
/* Fill out IR3_DP_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for indirect
4219
* dispatch.
4220
*/
4221
if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) {
4222
tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 7);
4223
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset + (IR3_DP_BASE_GROUP_X / 4)) |
4224
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
4225
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
4226
CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
4227
CP_LOAD_STATE6_0_NUM_UNIT((num_consts - IR3_DP_BASE_GROUP_X) / 4));
4228
tu_cs_emit_qw(cs, 0);
4229
tu_cs_emit(cs, 0); /* BASE_GROUP_X */
4230
tu_cs_emit(cs, 0); /* BASE_GROUP_Y */
4231
tu_cs_emit(cs, 0); /* BASE_GROUP_Z */
4232
tu_cs_emit(cs, subgroup_size);
4233
if (num_consts > IR3_DP_LOCAL_GROUP_SIZE_X) {
4234
assert(num_consts == align(IR3_DP_SUBGROUP_ID_SHIFT, 4));
4235
tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_X */
4236
tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Y */
4237
tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Z */
4238
tu_cs_emit(cs, subgroup_shift);
4239
}
4240
}
4241
}
4242
4243
static void
4244
tu_dispatch(struct tu_cmd_buffer *cmd,
4245
const struct tu_dispatch_info *info)
4246
{
4247
struct tu_cs *cs = &cmd->cs;
4248
struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
4249
struct tu_descriptor_state *descriptors_state =
4250
&cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];
4251
4252
/* TODO: We could probably flush less if we add a compute_flush_bits
4253
* bitfield.
4254
*/
4255
tu_emit_cache_flush(cmd, cs);
4256
4257
/* note: no reason to have this in a separate IB */
4258
tu_cs_emit_state_ib(cs,
4259
tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE));
4260
4261
tu_emit_compute_driver_params(cmd, cs, pipeline, info);
4262
4263
if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD)
4264
tu_cs_emit_state_ib(cs, pipeline->load_state);
4265
4266
cmd->state.dirty &= ~TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;
4267
4268
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
4269
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
4270
4271
const uint32_t *local_size = pipeline->compute.local_size;
4272
const uint32_t *num_groups = info->blocks;
4273
tu_cs_emit_regs(cs,
4274
A6XX_HLSQ_CS_NDRANGE_0(.kerneldim = 3,
4275
.localsizex = local_size[0] - 1,
4276
.localsizey = local_size[1] - 1,
4277
.localsizez = local_size[2] - 1),
4278
A6XX_HLSQ_CS_NDRANGE_1(.globalsize_x = local_size[0] * num_groups[0]),
4279
A6XX_HLSQ_CS_NDRANGE_2(.globaloff_x = 0),
4280
A6XX_HLSQ_CS_NDRANGE_3(.globalsize_y = local_size[1] * num_groups[1]),
4281
A6XX_HLSQ_CS_NDRANGE_4(.globaloff_y = 0),
4282
A6XX_HLSQ_CS_NDRANGE_5(.globalsize_z = local_size[2] * num_groups[2]),
4283
A6XX_HLSQ_CS_NDRANGE_6(.globaloff_z = 0));
4284
4285
tu_cs_emit_regs(cs,
4286
A6XX_HLSQ_CS_KERNEL_GROUP_X(1),
4287
A6XX_HLSQ_CS_KERNEL_GROUP_Y(1),
4288
A6XX_HLSQ_CS_KERNEL_GROUP_Z(1));
4289
4290
if (info->indirect) {
4291
uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset;
4292
4293
tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
4294
tu_cs_emit(cs, 0x00000000);
4295
tu_cs_emit_qw(cs, iova);
4296
tu_cs_emit(cs,
4297
A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
4298
A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
4299
A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
4300
} else {
4301
tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
4302
tu_cs_emit(cs, 0x00000000);
4303
tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
4304
tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
4305
tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
4306
}
4307
4308
tu_cs_emit_wfi(cs);
4309
}
4310
4311
VKAPI_ATTR void VKAPI_CALL
4312
tu_CmdDispatchBase(VkCommandBuffer commandBuffer,
4313
uint32_t base_x,
4314
uint32_t base_y,
4315
uint32_t base_z,
4316
uint32_t x,
4317
uint32_t y,
4318
uint32_t z)
4319
{
4320
TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4321
struct tu_dispatch_info info = {};
4322
4323
info.blocks[0] = x;
4324
info.blocks[1] = y;
4325
info.blocks[2] = z;
4326
4327
info.offsets[0] = base_x;
4328
info.offsets[1] = base_y;
4329
info.offsets[2] = base_z;
4330
tu_dispatch(cmd_buffer, &info);
4331
}
4332
4333
VKAPI_ATTR void VKAPI_CALL
4334
tu_CmdDispatch(VkCommandBuffer commandBuffer,
4335
uint32_t x,
4336
uint32_t y,
4337
uint32_t z)
4338
{
4339
tu_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
4340
}
4341
4342
VKAPI_ATTR void VKAPI_CALL
4343
tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
4344
VkBuffer _buffer,
4345
VkDeviceSize offset)
4346
{
4347
TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4348
TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
4349
struct tu_dispatch_info info = {};
4350
4351
info.indirect = buffer;
4352
info.indirect_offset = offset;
4353
4354
tu_dispatch(cmd_buffer, &info);
4355
}
4356
4357
VKAPI_ATTR void VKAPI_CALL
4358
tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
4359
const VkSubpassEndInfoKHR *pSubpassEndInfo)
4360
{
4361
TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4362
4363
tu_cs_end(&cmd_buffer->draw_cs);
4364
tu_cs_end(&cmd_buffer->draw_epilogue_cs);
4365
4366
if (use_sysmem_rendering(cmd_buffer))
4367
tu_cmd_render_sysmem(cmd_buffer);
4368
else
4369
tu_cmd_render_tiles(cmd_buffer);
4370
4371
/* outside of renderpasses we assume all draw states are disabled;
4372
* we can do this in the main cs because no resolve/store commands
4373
* should use a draw command (TODO: this will change if unaligned
4374
* GMEM stores are supported)
4375
*/
4376
tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs);
4377
4378
/* discard draw_cs and draw_epilogue_cs entries now that the tiles are
4379
rendered */
4380
tu_cs_discard_entries(&cmd_buffer->draw_cs);
4381
tu_cs_begin(&cmd_buffer->draw_cs);
4382
tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);
4383
tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
4384
4385
cmd_buffer->state.cache.pending_flush_bits |=
4386
cmd_buffer->state.renderpass_cache.pending_flush_bits;
4387
tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);
4388
4389
cmd_buffer->state.pass = NULL;
4390
cmd_buffer->state.subpass = NULL;
4391
cmd_buffer->state.framebuffer = NULL;
4392
cmd_buffer->state.has_tess = false;
4393
cmd_buffer->state.has_subpass_predication = false;
4394
4395
/* LRZ is not valid next time we use it */
4396
cmd_buffer->state.lrz.valid = false;
4397
cmd_buffer->state.dirty |= TU_CMD_DIRTY_LRZ;
4398
}
4399
4400
struct tu_barrier_info
4401
{
4402
uint32_t eventCount;
4403
const VkEvent *pEvents;
4404
VkPipelineStageFlags srcStageMask;
4405
};
4406
4407
static void
4408
tu_barrier(struct tu_cmd_buffer *cmd,
4409
uint32_t memoryBarrierCount,
4410
const VkMemoryBarrier *pMemoryBarriers,
4411
uint32_t bufferMemoryBarrierCount,
4412
const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4413
uint32_t imageMemoryBarrierCount,
4414
const VkImageMemoryBarrier *pImageMemoryBarriers,
4415
const struct tu_barrier_info *info)
4416
{
4417
struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
4418
VkAccessFlags srcAccessMask = 0;
4419
VkAccessFlags dstAccessMask = 0;
4420
4421
for (uint32_t i = 0; i < memoryBarrierCount; i++) {
4422
srcAccessMask |= pMemoryBarriers[i].srcAccessMask;
4423
dstAccessMask |= pMemoryBarriers[i].dstAccessMask;
4424
}
4425
4426
for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
4427
srcAccessMask |= pBufferMemoryBarriers[i].srcAccessMask;
4428
dstAccessMask |= pBufferMemoryBarriers[i].dstAccessMask;
4429
}
4430
4431
enum tu_cmd_access_mask src_flags = 0;
4432
enum tu_cmd_access_mask dst_flags = 0;
4433
4434
for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
4435
VkImageLayout old_layout = pImageMemoryBarriers[i].oldLayout;
4436
if (old_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
4437
/* The underlying memory for this image may have been used earlier
4438
* within the same queue submission for a different image, which
4439
* means that there may be old, stale cache entries which are in the
4440
* "wrong" location, which could cause problems later after writing
4441
* to the image. We don't want these entries being flushed later and
4442
* overwriting the actual image, so we need to flush the CCU.
4443
*/
4444
src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
4445
}
4446
srcAccessMask |= pImageMemoryBarriers[i].srcAccessMask;
4447
dstAccessMask |= pImageMemoryBarriers[i].dstAccessMask;
4448
}
4449
4450
/* Inside a renderpass, we don't know yet whether we'll be using sysmem,
4451
* so we have to use the sysmem flushes.
4452
*/
4453
bool gmem = cmd->state.ccu_state == TU_CMD_CCU_GMEM &&
4454
!cmd->state.pass;
4455
src_flags |= vk2tu_access(srcAccessMask, gmem);
4456
dst_flags |= vk2tu_access(dstAccessMask, gmem);
4457
4458
struct tu_cache_state *cache =
4459
cmd->state.pass ? &cmd->state.renderpass_cache : &cmd->state.cache;
4460
tu_flush_for_access(cache, src_flags, dst_flags);
4461
4462
for (uint32_t i = 0; i < info->eventCount; i++) {
4463
TU_FROM_HANDLE(tu_event, event, info->pEvents[i]);
4464
4465
tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
4466
tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
4467
CP_WAIT_REG_MEM_0_POLL_MEMORY);
4468
tu_cs_emit_qw(cs, event->bo.iova); /* POLL_ADDR_LO/HI */
4469
tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1));
4470
tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u));
4471
tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20));
4472
}
4473
}
4474
4475
VKAPI_ATTR void VKAPI_CALL
4476
tu_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
4477
VkPipelineStageFlags srcStageMask,
4478
VkPipelineStageFlags dstStageMask,
4479
VkDependencyFlags dependencyFlags,
4480
uint32_t memoryBarrierCount,
4481
const VkMemoryBarrier *pMemoryBarriers,
4482
uint32_t bufferMemoryBarrierCount,
4483
const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4484
uint32_t imageMemoryBarrierCount,
4485
const VkImageMemoryBarrier *pImageMemoryBarriers)
4486
{
4487
TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4488
struct tu_barrier_info info;
4489
4490
info.eventCount = 0;
4491
info.pEvents = NULL;
4492
info.srcStageMask = srcStageMask;
4493
4494
tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
4495
bufferMemoryBarrierCount, pBufferMemoryBarriers,
4496
imageMemoryBarrierCount, pImageMemoryBarriers, &info);
4497
}
4498
4499
static void
4500
write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,
4501
VkPipelineStageFlags stageMask, unsigned value)
4502
{
4503
struct tu_cs *cs = &cmd->cs;
4504
4505
/* vkCmdSetEvent/vkCmdResetEvent cannot be called inside a render pass */
4506
assert(!cmd->state.pass);
4507
4508
tu_emit_cache_flush(cmd, cs);
4509
4510
/* Flags that only require a top-of-pipe event. DrawIndirect parameters are
4511
* read by the CP, so the draw indirect stage counts as top-of-pipe too.
4512
*/
4513
VkPipelineStageFlags top_of_pipe_flags =
4514
VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
4515
VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
4516
4517
if (!(stageMask & ~top_of_pipe_flags)) {
4518
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
4519
tu_cs_emit_qw(cs, event->bo.iova); /* ADDR_LO/HI */
4520
tu_cs_emit(cs, value);
4521
} else {
4522
/* Use a RB_DONE_TS event to wait for everything to complete. */
4523
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);
4524
tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS));
4525
tu_cs_emit_qw(cs, event->bo.iova);
4526
tu_cs_emit(cs, value);
4527
}
4528
}
4529
4530
VKAPI_ATTR void VKAPI_CALL
4531
tu_CmdSetEvent(VkCommandBuffer commandBuffer,
4532
VkEvent _event,
4533
VkPipelineStageFlags stageMask)
4534
{
4535
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4536
TU_FROM_HANDLE(tu_event, event, _event);
4537
4538
write_event(cmd, event, stageMask, 1);
4539
}
4540
4541
VKAPI_ATTR void VKAPI_CALL
4542
tu_CmdResetEvent(VkCommandBuffer commandBuffer,
4543
VkEvent _event,
4544
VkPipelineStageFlags stageMask)
4545
{
4546
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4547
TU_FROM_HANDLE(tu_event, event, _event);
4548
4549
write_event(cmd, event, stageMask, 0);
4550
}
4551
4552
VKAPI_ATTR void VKAPI_CALL
4553
tu_CmdWaitEvents(VkCommandBuffer commandBuffer,
4554
uint32_t eventCount,
4555
const VkEvent *pEvents,
4556
VkPipelineStageFlags srcStageMask,
4557
VkPipelineStageFlags dstStageMask,
4558
uint32_t memoryBarrierCount,
4559
const VkMemoryBarrier *pMemoryBarriers,
4560
uint32_t bufferMemoryBarrierCount,
4561
const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4562
uint32_t imageMemoryBarrierCount,
4563
const VkImageMemoryBarrier *pImageMemoryBarriers)
4564
{
4565
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4566
struct tu_barrier_info info;
4567
4568
info.eventCount = eventCount;
4569
info.pEvents = pEvents;
4570
info.srcStageMask = 0;
4571
4572
tu_barrier(cmd, memoryBarrierCount, pMemoryBarriers,
4573
bufferMemoryBarrierCount, pBufferMemoryBarriers,
4574
imageMemoryBarrierCount, pImageMemoryBarriers, &info);
4575
}
4576
4577
VKAPI_ATTR void VKAPI_CALL
4578
tu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
4579
{
4580
/* No-op */
4581
}
4582
4583
4584
VKAPI_ATTR void VKAPI_CALL
4585
tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
4586
const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
4587
{
4588
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4589
4590
cmd->state.predication_active = true;
4591
if (cmd->state.pass)
4592
cmd->state.has_subpass_predication = true;
4593
4594
struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
4595
4596
tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
4597
tu_cs_emit(cs, 1);
4598
4599
/* Wait for any writes to the predicate to land */
4600
if (cmd->state.pass)
4601
tu_emit_cache_flush_renderpass(cmd, cs);
4602
else
4603
tu_emit_cache_flush(cmd, cs);
4604
4605
TU_FROM_HANDLE(tu_buffer, buf, pConditionalRenderingBegin->buffer);
4606
uint64_t iova = tu_buffer_iova(buf) + pConditionalRenderingBegin->offset;
4607
4608
/* qcom doesn't support 32-bit reference values, only 64-bit, but Vulkan
4609
* mandates 32-bit comparisons. Our workaround is to copy the reference
4610
* value to the low 32-bits of a location where the high 32 bits are known
4611
* to be 0 and then compare that.
4612
*/
4613
tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
4614
tu_cs_emit(cs, 0);
4615
tu_cs_emit_qw(cs, global_iova(cmd, predicate));
4616
tu_cs_emit_qw(cs, iova);
4617
4618
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
4619
tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
4620
4621
bool inv = pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
4622
tu_cs_emit_pkt7(cs, CP_DRAW_PRED_SET, 3);
4623
tu_cs_emit(cs, CP_DRAW_PRED_SET_0_SRC(PRED_SRC_MEM) |
4624
CP_DRAW_PRED_SET_0_TEST(inv ? EQ_0_PASS : NE_0_PASS));
4625
tu_cs_emit_qw(cs, global_iova(cmd, predicate));
4626
}
4627
4628
VKAPI_ATTR void VKAPI_CALL
4629
tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
4630
{
4631
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4632
4633
cmd->state.predication_active = false;
4634
4635
struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
4636
4637
tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
4638
tu_cs_emit(cs, 0);
4639
}
4640
4641
4642