Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
4561 views
1
/*
2
* Copyright © 2008 Jérôme Glisse
3
* Copyright © 2010 Marek Olšák <[email protected]>
4
* Copyright © 2015 Advanced Micro Devices, Inc.
5
* All Rights Reserved.
6
*
7
* Permission is hereby granted, free of charge, to any person obtaining
8
* a copy of this software and associated documentation files (the
9
* "Software"), to deal in the Software without restriction, including
10
* without limitation the rights to use, copy, modify, merge, publish,
11
* distribute, sub license, and/or sell copies of the Software, and to
12
* permit persons to whom the Software is furnished to do so, subject to
13
* the following conditions:
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
* NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
19
* AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22
* USE OR OTHER DEALINGS IN THE SOFTWARE.
23
*
24
* The above copyright notice and this permission notice (including the
25
* next paragraph) shall be included in all copies or substantial portions
26
* of the Software.
27
*/
28
29
#include "amdgpu_cs.h"
30
#include "util/os_time.h"
31
#include <inttypes.h>
32
#include <stdio.h>
33
34
#include "amd/common/sid.h"
35
36
/* FENCES */
37
38
static struct pipe_fence_handle *
39
amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned ip_type,
40
unsigned ip_instance, unsigned ring)
41
{
42
struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
43
44
fence->reference.count = 1;
45
fence->ws = ctx->ws;
46
fence->ctx = ctx;
47
fence->fence.context = ctx->ctx;
48
fence->fence.ip_type = ip_type;
49
fence->fence.ip_instance = ip_instance;
50
fence->fence.ring = ring;
51
util_queue_fence_init(&fence->submitted);
52
util_queue_fence_reset(&fence->submitted);
53
p_atomic_inc(&ctx->refcount);
54
return (struct pipe_fence_handle *)fence;
55
}
56
57
static struct pipe_fence_handle *
58
amdgpu_fence_import_syncobj(struct radeon_winsys *rws, int fd)
59
{
60
struct amdgpu_winsys *ws = amdgpu_winsys(rws);
61
struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
62
int r;
63
64
if (!fence)
65
return NULL;
66
67
pipe_reference_init(&fence->reference, 1);
68
fence->ws = ws;
69
70
r = amdgpu_cs_import_syncobj(ws->dev, fd, &fence->syncobj);
71
if (r) {
72
FREE(fence);
73
return NULL;
74
}
75
76
util_queue_fence_init(&fence->submitted);
77
78
assert(amdgpu_fence_is_syncobj(fence));
79
return (struct pipe_fence_handle*)fence;
80
}
81
82
static struct pipe_fence_handle *
83
amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
84
{
85
struct amdgpu_winsys *ws = amdgpu_winsys(rws);
86
struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
87
88
if (!fence)
89
return NULL;
90
91
pipe_reference_init(&fence->reference, 1);
92
fence->ws = ws;
93
/* fence->ctx == NULL means that the fence is syncobj-based. */
94
95
/* Convert sync_file into syncobj. */
96
int r = amdgpu_cs_create_syncobj(ws->dev, &fence->syncobj);
97
if (r) {
98
FREE(fence);
99
return NULL;
100
}
101
102
r = amdgpu_cs_syncobj_import_sync_file(ws->dev, fence->syncobj, fd);
103
if (r) {
104
amdgpu_cs_destroy_syncobj(ws->dev, fence->syncobj);
105
FREE(fence);
106
return NULL;
107
}
108
109
util_queue_fence_init(&fence->submitted);
110
111
return (struct pipe_fence_handle*)fence;
112
}
113
114
static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
115
struct pipe_fence_handle *pfence)
116
{
117
struct amdgpu_winsys *ws = amdgpu_winsys(rws);
118
struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
119
120
if (amdgpu_fence_is_syncobj(fence)) {
121
int fd, r;
122
123
/* Convert syncobj into sync_file. */
124
r = amdgpu_cs_syncobj_export_sync_file(ws->dev, fence->syncobj, &fd);
125
return r ? -1 : fd;
126
}
127
128
util_queue_fence_wait(&fence->submitted);
129
130
/* Convert the amdgpu fence into a fence FD. */
131
int fd;
132
if (amdgpu_cs_fence_to_handle(ws->dev, &fence->fence,
133
AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD,
134
(uint32_t*)&fd))
135
return -1;
136
137
return fd;
138
}
139
140
static int amdgpu_export_signalled_sync_file(struct radeon_winsys *rws)
141
{
142
struct amdgpu_winsys *ws = amdgpu_winsys(rws);
143
uint32_t syncobj;
144
int fd = -1;
145
146
int r = amdgpu_cs_create_syncobj2(ws->dev, DRM_SYNCOBJ_CREATE_SIGNALED,
147
&syncobj);
148
if (r) {
149
return -1;
150
}
151
152
r = amdgpu_cs_syncobj_export_sync_file(ws->dev, syncobj, &fd);
153
if (r) {
154
fd = -1;
155
}
156
157
amdgpu_cs_destroy_syncobj(ws->dev, syncobj);
158
return fd;
159
}
160
161
static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
162
uint64_t seq_no,
163
uint64_t *user_fence_cpu_address)
164
{
165
struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
166
167
afence->fence.fence = seq_no;
168
afence->user_fence_cpu_address = user_fence_cpu_address;
169
util_queue_fence_signal(&afence->submitted);
170
}
171
172
static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
173
{
174
struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
175
176
afence->signalled = true;
177
util_queue_fence_signal(&afence->submitted);
178
}
179
180
bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
181
bool absolute)
182
{
183
struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
184
uint32_t expired;
185
int64_t abs_timeout;
186
uint64_t *user_fence_cpu;
187
int r;
188
189
if (afence->signalled)
190
return true;
191
192
if (absolute)
193
abs_timeout = timeout;
194
else
195
abs_timeout = os_time_get_absolute_timeout(timeout);
196
197
/* Handle syncobjs. */
198
if (amdgpu_fence_is_syncobj(afence)) {
199
if (abs_timeout == OS_TIMEOUT_INFINITE)
200
abs_timeout = INT64_MAX;
201
202
if (amdgpu_cs_syncobj_wait(afence->ws->dev, &afence->syncobj, 1,
203
abs_timeout, 0, NULL))
204
return false;
205
206
afence->signalled = true;
207
return true;
208
}
209
210
/* The fence might not have a number assigned if its IB is being
211
* submitted in the other thread right now. Wait until the submission
212
* is done. */
213
if (!util_queue_fence_wait_timeout(&afence->submitted, abs_timeout))
214
return false;
215
216
user_fence_cpu = afence->user_fence_cpu_address;
217
if (user_fence_cpu) {
218
if (*user_fence_cpu >= afence->fence.fence) {
219
afence->signalled = true;
220
return true;
221
}
222
223
/* No timeout, just query: no need for the ioctl. */
224
if (!absolute && !timeout)
225
return false;
226
}
227
228
/* Now use the libdrm query. */
229
r = amdgpu_cs_query_fence_status(&afence->fence,
230
abs_timeout,
231
AMDGPU_QUERY_FENCE_TIMEOUT_IS_ABSOLUTE,
232
&expired);
233
if (r) {
234
fprintf(stderr, "amdgpu: amdgpu_cs_query_fence_status failed.\n");
235
return false;
236
}
237
238
if (expired) {
239
/* This variable can only transition from false to true, so it doesn't
240
* matter if threads race for it. */
241
afence->signalled = true;
242
return true;
243
}
244
return false;
245
}
246
247
static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws,
248
struct pipe_fence_handle *fence,
249
uint64_t timeout)
250
{
251
return amdgpu_fence_wait(fence, timeout, false);
252
}
253
254
static struct pipe_fence_handle *
255
amdgpu_cs_get_next_fence(struct radeon_cmdbuf *rcs)
256
{
257
struct amdgpu_cs *cs = amdgpu_cs(rcs);
258
struct pipe_fence_handle *fence = NULL;
259
260
if (cs->noop)
261
return NULL;
262
263
if (cs->next_fence) {
264
amdgpu_fence_reference(&fence, cs->next_fence);
265
return fence;
266
}
267
268
fence = amdgpu_fence_create(cs->ctx,
269
cs->csc->ib[IB_MAIN].ip_type,
270
cs->csc->ib[IB_MAIN].ip_instance,
271
cs->csc->ib[IB_MAIN].ring);
272
if (!fence)
273
return NULL;
274
275
amdgpu_fence_reference(&cs->next_fence, fence);
276
return fence;
277
}
278
279
/* CONTEXTS */
280
281
static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *ws)
282
{
283
struct amdgpu_ctx *ctx = CALLOC_STRUCT(amdgpu_ctx);
284
int r;
285
struct amdgpu_bo_alloc_request alloc_buffer = {};
286
amdgpu_bo_handle buf_handle;
287
288
if (!ctx)
289
return NULL;
290
291
ctx->ws = amdgpu_winsys(ws);
292
ctx->refcount = 1;
293
ctx->initial_num_total_rejected_cs = ctx->ws->num_total_rejected_cs;
294
295
r = amdgpu_cs_ctx_create(ctx->ws->dev, &ctx->ctx);
296
if (r) {
297
fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create failed. (%i)\n", r);
298
goto error_create;
299
}
300
301
alloc_buffer.alloc_size = ctx->ws->info.gart_page_size;
302
alloc_buffer.phys_alignment = ctx->ws->info.gart_page_size;
303
alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT;
304
305
r = amdgpu_bo_alloc(ctx->ws->dev, &alloc_buffer, &buf_handle);
306
if (r) {
307
fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r);
308
goto error_user_fence_alloc;
309
}
310
311
r = amdgpu_bo_cpu_map(buf_handle, (void**)&ctx->user_fence_cpu_address_base);
312
if (r) {
313
fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r);
314
goto error_user_fence_map;
315
}
316
317
memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size);
318
ctx->user_fence_bo = buf_handle;
319
320
return (struct radeon_winsys_ctx*)ctx;
321
322
error_user_fence_map:
323
amdgpu_bo_free(buf_handle);
324
error_user_fence_alloc:
325
amdgpu_cs_ctx_free(ctx->ctx);
326
error_create:
327
FREE(ctx);
328
return NULL;
329
}
330
331
static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
332
{
333
amdgpu_ctx_unref((struct amdgpu_ctx*)rwctx);
334
}
335
336
static enum pipe_reset_status
337
amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx, bool full_reset_only,
338
bool *needs_reset)
339
{
340
struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
341
int r;
342
343
if (needs_reset)
344
*needs_reset = false;
345
346
/* Return a failure due to a GPU hang. */
347
if (ctx->ws->info.drm_minor >= 24) {
348
uint64_t flags;
349
350
if (full_reset_only &&
351
ctx->initial_num_total_rejected_cs == ctx->ws->num_total_rejected_cs) {
352
/* If the caller is only interested in full reset (= wants to ignore soft
353
* recoveries), we can use the rejected cs count as a quick first check.
354
*/
355
return PIPE_NO_RESET;
356
}
357
358
r = amdgpu_cs_query_reset_state2(ctx->ctx, &flags);
359
if (r) {
360
fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state failed. (%i)\n", r);
361
return PIPE_NO_RESET;
362
}
363
364
if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) {
365
if (needs_reset)
366
*needs_reset = flags & AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST;
367
if (flags & AMDGPU_CTX_QUERY2_FLAGS_GUILTY)
368
return PIPE_GUILTY_CONTEXT_RESET;
369
else
370
return PIPE_INNOCENT_CONTEXT_RESET;
371
}
372
} else {
373
uint32_t result, hangs;
374
375
r = amdgpu_cs_query_reset_state(ctx->ctx, &result, &hangs);
376
if (r) {
377
fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state failed. (%i)\n", r);
378
return PIPE_NO_RESET;
379
}
380
381
if (needs_reset)
382
*needs_reset = true;
383
switch (result) {
384
case AMDGPU_CTX_GUILTY_RESET:
385
return PIPE_GUILTY_CONTEXT_RESET;
386
case AMDGPU_CTX_INNOCENT_RESET:
387
return PIPE_INNOCENT_CONTEXT_RESET;
388
case AMDGPU_CTX_UNKNOWN_RESET:
389
return PIPE_UNKNOWN_CONTEXT_RESET;
390
}
391
}
392
393
/* Return a failure due to a rejected command submission. */
394
if (ctx->ws->num_total_rejected_cs > ctx->initial_num_total_rejected_cs) {
395
if (needs_reset)
396
*needs_reset = true;
397
return ctx->num_rejected_cs ? PIPE_GUILTY_CONTEXT_RESET :
398
PIPE_INNOCENT_CONTEXT_RESET;
399
}
400
if (needs_reset)
401
*needs_reset = false;
402
return PIPE_NO_RESET;
403
}
404
405
/* COMMAND SUBMISSION */
406
407
static bool amdgpu_cs_has_user_fence(struct amdgpu_cs_context *cs)
408
{
409
return cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_UVD &&
410
cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCE &&
411
cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_UVD_ENC &&
412
cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_DEC &&
413
cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_ENC &&
414
cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_JPEG;
415
}
416
417
static inline unsigned amdgpu_cs_epilog_dws(struct amdgpu_cs *cs)
418
{
419
if (cs->has_chaining)
420
return 4; /* for chaining */
421
422
return 0;
423
}
424
425
static int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
426
struct amdgpu_cs_buffer *buffers, unsigned num_buffers)
427
{
428
unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
429
int i = cs->buffer_indices_hashlist[hash];
430
431
/* not found or found */
432
if (i < 0 || (i < num_buffers && buffers[i].bo == bo))
433
return i;
434
435
/* Hash collision, look for the BO in the list of buffers linearly. */
436
for (int i = num_buffers - 1; i >= 0; i--) {
437
if (buffers[i].bo == bo) {
438
/* Put this buffer in the hash list.
439
* This will prevent additional hash collisions if there are
440
* several consecutive lookup_buffer calls for the same buffer.
441
*
442
* Example: Assuming buffers A,B,C collide in the hash list,
443
* the following sequence of buffers:
444
* AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
445
* will collide here: ^ and here: ^,
446
* meaning that we should get very few collisions in the end. */
447
cs->buffer_indices_hashlist[hash] = i & 0x7fff;
448
return i;
449
}
450
}
451
return -1;
452
}
453
454
int amdgpu_lookup_buffer_any_type(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
455
{
456
struct amdgpu_cs_buffer *buffers;
457
int num_buffers;
458
459
if (bo->bo) {
460
buffers = cs->real_buffers;
461
num_buffers = cs->num_real_buffers;
462
} else if (!(bo->base.usage & RADEON_FLAG_SPARSE)) {
463
buffers = cs->slab_buffers;
464
num_buffers = cs->num_slab_buffers;
465
} else {
466
buffers = cs->sparse_buffers;
467
num_buffers = cs->num_sparse_buffers;
468
}
469
470
return amdgpu_lookup_buffer(cs, bo, buffers, num_buffers);
471
}
472
473
static int
474
amdgpu_do_add_real_buffer(struct amdgpu_winsys *ws, struct amdgpu_cs_context *cs,
475
struct amdgpu_winsys_bo *bo)
476
{
477
struct amdgpu_cs_buffer *buffer;
478
int idx;
479
480
/* New buffer, check if the backing array is large enough. */
481
if (cs->num_real_buffers >= cs->max_real_buffers) {
482
unsigned new_max =
483
MAX2(cs->max_real_buffers + 16, (unsigned)(cs->max_real_buffers * 1.3));
484
struct amdgpu_cs_buffer *new_buffers;
485
486
new_buffers = MALLOC(new_max * sizeof(*new_buffers));
487
488
if (!new_buffers) {
489
fprintf(stderr, "amdgpu_do_add_buffer: allocation failed\n");
490
FREE(new_buffers);
491
return -1;
492
}
493
494
memcpy(new_buffers, cs->real_buffers, cs->num_real_buffers * sizeof(*new_buffers));
495
496
FREE(cs->real_buffers);
497
498
cs->max_real_buffers = new_max;
499
cs->real_buffers = new_buffers;
500
}
501
502
idx = cs->num_real_buffers;
503
buffer = &cs->real_buffers[idx];
504
505
memset(buffer, 0, sizeof(*buffer));
506
amdgpu_winsys_bo_reference(ws, &buffer->bo, bo);
507
cs->num_real_buffers++;
508
509
return idx;
510
}
511
512
static int
513
amdgpu_lookup_or_add_real_buffer(struct radeon_cmdbuf *rcs, struct amdgpu_cs *acs,
514
struct amdgpu_winsys_bo *bo)
515
{
516
struct amdgpu_cs_context *cs = acs->csc;
517
unsigned hash;
518
int idx = amdgpu_lookup_buffer(cs, bo, cs->real_buffers, cs->num_real_buffers);
519
520
if (idx >= 0)
521
return idx;
522
523
idx = amdgpu_do_add_real_buffer(acs->ws, cs, bo);
524
525
hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
526
cs->buffer_indices_hashlist[hash] = idx & 0x7fff;
527
528
if (bo->base.placement & RADEON_DOMAIN_VRAM)
529
rcs->used_vram_kb += bo->base.size / 1024;
530
else if (bo->base.placement & RADEON_DOMAIN_GTT)
531
rcs->used_gart_kb += bo->base.size / 1024;
532
533
return idx;
534
}
535
536
static int amdgpu_lookup_or_add_slab_buffer(struct amdgpu_winsys *ws,
537
struct radeon_cmdbuf *rcs,
538
struct amdgpu_cs *acs,
539
struct amdgpu_winsys_bo *bo)
540
{
541
struct amdgpu_cs_context *cs = acs->csc;
542
struct amdgpu_cs_buffer *buffer;
543
unsigned hash;
544
int idx = amdgpu_lookup_buffer(cs, bo, cs->slab_buffers, cs->num_slab_buffers);
545
int real_idx;
546
547
if (idx >= 0)
548
return idx;
549
550
real_idx = amdgpu_lookup_or_add_real_buffer(rcs, acs, bo->u.slab.real);
551
if (real_idx < 0)
552
return -1;
553
554
/* New buffer, check if the backing array is large enough. */
555
if (cs->num_slab_buffers >= cs->max_slab_buffers) {
556
unsigned new_max =
557
MAX2(cs->max_slab_buffers + 16, (unsigned)(cs->max_slab_buffers * 1.3));
558
struct amdgpu_cs_buffer *new_buffers;
559
560
new_buffers = REALLOC(cs->slab_buffers,
561
cs->max_slab_buffers * sizeof(*new_buffers),
562
new_max * sizeof(*new_buffers));
563
if (!new_buffers) {
564
fprintf(stderr, "amdgpu_lookup_or_add_slab_buffer: allocation failed\n");
565
return -1;
566
}
567
568
cs->max_slab_buffers = new_max;
569
cs->slab_buffers = new_buffers;
570
}
571
572
idx = cs->num_slab_buffers;
573
buffer = &cs->slab_buffers[idx];
574
575
memset(buffer, 0, sizeof(*buffer));
576
amdgpu_winsys_bo_reference(ws, &buffer->bo, bo);
577
buffer->u.slab.real_idx = real_idx;
578
cs->num_slab_buffers++;
579
580
hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
581
cs->buffer_indices_hashlist[hash] = idx & 0x7fff;
582
583
return idx;
584
}
585
586
static int amdgpu_lookup_or_add_sparse_buffer(struct amdgpu_winsys *ws,
587
struct radeon_cmdbuf *rcs,
588
struct amdgpu_cs *acs,
589
struct amdgpu_winsys_bo *bo)
590
{
591
struct amdgpu_cs_context *cs = acs->csc;
592
struct amdgpu_cs_buffer *buffer;
593
unsigned hash;
594
int idx = amdgpu_lookup_buffer(cs, bo, cs->sparse_buffers, cs->num_sparse_buffers);
595
596
if (idx >= 0)
597
return idx;
598
599
/* New buffer, check if the backing array is large enough. */
600
if (cs->num_sparse_buffers >= cs->max_sparse_buffers) {
601
unsigned new_max =
602
MAX2(cs->max_sparse_buffers + 16, (unsigned)(cs->max_sparse_buffers * 1.3));
603
struct amdgpu_cs_buffer *new_buffers;
604
605
new_buffers = REALLOC(cs->sparse_buffers,
606
cs->max_sparse_buffers * sizeof(*new_buffers),
607
new_max * sizeof(*new_buffers));
608
if (!new_buffers) {
609
fprintf(stderr, "amdgpu_lookup_or_add_sparse_buffer: allocation failed\n");
610
return -1;
611
}
612
613
cs->max_sparse_buffers = new_max;
614
cs->sparse_buffers = new_buffers;
615
}
616
617
idx = cs->num_sparse_buffers;
618
buffer = &cs->sparse_buffers[idx];
619
620
memset(buffer, 0, sizeof(*buffer));
621
amdgpu_winsys_bo_reference(ws, &buffer->bo, bo);
622
cs->num_sparse_buffers++;
623
624
hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
625
cs->buffer_indices_hashlist[hash] = idx & 0x7fff;
626
627
/* We delay adding the backing buffers until we really have to. However,
628
* we cannot delay accounting for memory use.
629
*/
630
simple_mtx_lock(&bo->lock);
631
632
list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
633
if (bo->base.placement & RADEON_DOMAIN_VRAM)
634
rcs->used_vram_kb += backing->bo->base.size / 1024;
635
else if (bo->base.placement & RADEON_DOMAIN_GTT)
636
rcs->used_gart_kb += backing->bo->base.size / 1024;
637
}
638
639
simple_mtx_unlock(&bo->lock);
640
641
return idx;
642
}
643
644
static unsigned amdgpu_cs_add_buffer(struct radeon_cmdbuf *rcs,
645
struct pb_buffer *buf,
646
enum radeon_bo_usage usage,
647
enum radeon_bo_domain domains,
648
enum radeon_bo_priority priority)
649
{
650
/* Don't use the "domains" parameter. Amdgpu doesn't support changing
651
* the buffer placement during command submission.
652
*/
653
struct amdgpu_cs *acs = amdgpu_cs(rcs);
654
struct amdgpu_cs_context *cs = acs->csc;
655
struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
656
struct amdgpu_cs_buffer *buffer;
657
int index;
658
659
/* Fast exit for no-op calls.
660
* This is very effective with suballocators and linear uploaders that
661
* are outside of the winsys.
662
*/
663
if (bo == cs->last_added_bo &&
664
(usage & cs->last_added_bo_usage) == usage &&
665
(1u << priority) & cs->last_added_bo_priority_usage)
666
return cs->last_added_bo_index;
667
668
if (!(bo->base.usage & RADEON_FLAG_SPARSE)) {
669
if (!bo->bo) {
670
index = amdgpu_lookup_or_add_slab_buffer(acs->ws, rcs, acs, bo);
671
if (index < 0)
672
return 0;
673
674
buffer = &cs->slab_buffers[index];
675
buffer->usage |= usage;
676
677
usage &= ~RADEON_USAGE_SYNCHRONIZED;
678
index = buffer->u.slab.real_idx;
679
} else {
680
index = amdgpu_lookup_or_add_real_buffer(rcs, acs, bo);
681
if (index < 0)
682
return 0;
683
}
684
685
buffer = &cs->real_buffers[index];
686
} else {
687
index = amdgpu_lookup_or_add_sparse_buffer(acs->ws, rcs, acs, bo);
688
if (index < 0)
689
return 0;
690
691
buffer = &cs->sparse_buffers[index];
692
}
693
694
buffer->u.real.priority_usage |= 1u << priority;
695
buffer->usage |= usage;
696
697
cs->last_added_bo = bo;
698
cs->last_added_bo_index = index;
699
cs->last_added_bo_usage = buffer->usage;
700
cs->last_added_bo_priority_usage = buffer->u.real.priority_usage;
701
return index;
702
}
703
704
static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws,
705
struct amdgpu_ib *ib,
706
struct amdgpu_cs *cs)
707
{
708
struct pb_buffer *pb;
709
uint8_t *mapped;
710
unsigned buffer_size;
711
712
/* Always create a buffer that is at least as large as the maximum seen IB
713
* size, aligned to a power of two (and multiplied by 4 to reduce internal
714
* fragmentation if chaining is not available). Limit to 512k dwords, which
715
* is the largest power of two that fits into the size field of the
716
* INDIRECT_BUFFER packet.
717
*/
718
if (cs->has_chaining)
719
buffer_size = 4 * util_next_power_of_two(ib->max_ib_size);
720
else
721
buffer_size = 4 * util_next_power_of_two(4 * ib->max_ib_size);
722
723
const unsigned min_size = MAX2(ib->max_check_space_size, 8 * 1024 * 4);
724
const unsigned max_size = 512 * 1024 * 4;
725
726
buffer_size = MIN2(buffer_size, max_size);
727
buffer_size = MAX2(buffer_size, min_size); /* min_size is more important */
728
729
enum radeon_bo_domain domain;
730
unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING;
731
732
if (cs->ring_type == RING_GFX ||
733
cs->ring_type == RING_COMPUTE ||
734
cs->ring_type == RING_DMA) {
735
domain = ws->info.smart_access_memory ? RADEON_DOMAIN_VRAM : RADEON_DOMAIN_GTT;
736
flags |= RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC;
737
} else {
738
/* UVD/VCE */
739
/* TODO: validate that UVD/VCE don't read from IBs and enable WC or even VRAM. */
740
domain = RADEON_DOMAIN_GTT;
741
}
742
743
pb = amdgpu_bo_create(ws, buffer_size,
744
ws->info.gart_page_size,
745
domain, flags);
746
if (!pb)
747
return false;
748
749
mapped = amdgpu_bo_map(&ws->dummy_ws.base, pb, NULL, PIPE_MAP_WRITE);
750
if (!mapped) {
751
radeon_bo_reference(&ws->dummy_ws.base, &pb, NULL);
752
return false;
753
}
754
755
radeon_bo_reference(&ws->dummy_ws.base, &ib->big_ib_buffer, pb);
756
radeon_bo_reference(&ws->dummy_ws.base, &pb, NULL);
757
758
ib->ib_mapped = mapped;
759
ib->used_ib_space = 0;
760
761
return true;
762
}
763
764
static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type)
765
{
766
/* The maximum IB size including all chained IBs. */
767
switch (ib_type) {
768
case IB_MAIN:
769
/* Smaller submits means the GPU gets busy sooner and there is less
770
* waiting for buffers and fences. Proof:
771
* http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
772
*/
773
return 20 * 1024;
774
case IB_PARALLEL_COMPUTE:
775
/* Always chain this IB. */
776
return UINT_MAX;
777
default:
778
unreachable("bad ib_type");
779
}
780
}
781
782
static bool amdgpu_get_new_ib(struct amdgpu_winsys *ws,
783
struct radeon_cmdbuf *rcs,
784
struct amdgpu_ib *ib,
785
struct amdgpu_cs *cs)
786
{
787
/* Small IBs are better than big IBs, because the GPU goes idle quicker
788
* and there is less waiting for buffers and fences. Proof:
789
* http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
790
*/
791
struct drm_amdgpu_cs_chunk_ib *info = &cs->csc->ib[ib->ib_type];
792
/* This is the minimum size of a contiguous IB. */
793
unsigned ib_size = 4 * 1024 * 4;
794
795
/* Always allocate at least the size of the biggest cs_check_space call,
796
* because precisely the last call might have requested this size.
797
*/
798
ib_size = MAX2(ib_size, ib->max_check_space_size);
799
800
if (!cs->has_chaining) {
801
ib_size = MAX2(ib_size,
802
4 * MIN2(util_next_power_of_two(ib->max_ib_size),
803
amdgpu_ib_max_submit_dwords(ib->ib_type)));
804
}
805
806
ib->max_ib_size = ib->max_ib_size - ib->max_ib_size / 32;
807
808
rcs->prev_dw = 0;
809
rcs->num_prev = 0;
810
rcs->current.cdw = 0;
811
rcs->current.buf = NULL;
812
813
/* Allocate a new buffer for IBs if the current buffer is all used. */
814
if (!ib->big_ib_buffer ||
815
ib->used_ib_space + ib_size > ib->big_ib_buffer->size) {
816
if (!amdgpu_ib_new_buffer(ws, ib, cs))
817
return false;
818
}
819
820
info->va_start = amdgpu_winsys_bo(ib->big_ib_buffer)->va + ib->used_ib_space;
821
info->ib_bytes = 0;
822
/* ib_bytes is in dwords and the conversion to bytes will be done before
823
* the CS ioctl. */
824
ib->ptr_ib_size = &info->ib_bytes;
825
ib->ptr_ib_size_inside_ib = false;
826
827
amdgpu_cs_add_buffer(cs->main.rcs, ib->big_ib_buffer,
828
RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
829
830
rcs->current.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);
831
832
ib_size = ib->big_ib_buffer->size - ib->used_ib_space;
833
rcs->current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(cs);
834
rcs->gpu_address = info->va_start;
835
return true;
836
}
837
838
static void amdgpu_set_ib_size(struct radeon_cmdbuf *rcs, struct amdgpu_ib *ib)
839
{
840
if (ib->ptr_ib_size_inside_ib) {
841
*ib->ptr_ib_size = rcs->current.cdw |
842
S_3F2_CHAIN(1) | S_3F2_VALID(1);
843
} else {
844
*ib->ptr_ib_size = rcs->current.cdw;
845
}
846
}
847
848
static void amdgpu_ib_finalize(struct amdgpu_winsys *ws, struct radeon_cmdbuf *rcs,
849
struct amdgpu_ib *ib)
850
{
851
amdgpu_set_ib_size(rcs, ib);
852
ib->used_ib_space += rcs->current.cdw * 4;
853
ib->used_ib_space = align(ib->used_ib_space, ws->info.ib_alignment);
854
ib->max_ib_size = MAX2(ib->max_ib_size, rcs->prev_dw + rcs->current.cdw);
855
}
856
857
static bool amdgpu_init_cs_context(struct amdgpu_winsys *ws,
858
struct amdgpu_cs_context *cs,
859
enum ring_type ring_type)
860
{
861
switch (ring_type) {
862
case RING_DMA:
863
cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_DMA;
864
break;
865
866
case RING_UVD:
867
cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_UVD;
868
break;
869
870
case RING_UVD_ENC:
871
cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_UVD_ENC;
872
break;
873
874
case RING_VCE:
875
cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCE;
876
break;
877
878
case RING_VCN_DEC:
879
cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCN_DEC;
880
break;
881
882
case RING_VCN_ENC:
883
cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCN_ENC;
884
break;
885
886
case RING_VCN_JPEG:
887
cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCN_JPEG;
888
break;
889
890
case RING_COMPUTE:
891
case RING_GFX:
892
cs->ib[IB_MAIN].ip_type = ring_type == RING_GFX ? AMDGPU_HW_IP_GFX :
893
AMDGPU_HW_IP_COMPUTE;
894
895
/* The kernel shouldn't invalidate L2 and vL1. The proper place for cache
896
* invalidation is the beginning of IBs (the previous commit does that),
897
* because completion of an IB doesn't care about the state of GPU caches,
898
* but the beginning of an IB does. Draw calls from multiple IBs can be
899
* executed in parallel, so draw calls from the current IB can finish after
900
* the next IB starts drawing, and so the cache flush at the end of IB
901
* is always late.
902
*/
903
if (ws->info.drm_minor >= 26)
904
cs->ib[IB_MAIN].flags = AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
905
break;
906
907
default:
908
assert(0);
909
}
910
911
cs->ib[IB_PARALLEL_COMPUTE].ip_type = AMDGPU_HW_IP_COMPUTE;
912
cs->ib[IB_PARALLEL_COMPUTE].flags = AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
913
914
cs->last_added_bo = NULL;
915
return true;
916
}
917
918
static void cleanup_fence_list(struct amdgpu_fence_list *fences)
919
{
920
for (unsigned i = 0; i < fences->num; i++)
921
amdgpu_fence_reference(&fences->list[i], NULL);
922
fences->num = 0;
923
}
924
925
static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *ws, struct amdgpu_cs_context *cs)
926
{
927
unsigned i;
928
929
for (i = 0; i < cs->num_real_buffers; i++) {
930
amdgpu_winsys_bo_reference(ws, &cs->real_buffers[i].bo, NULL);
931
}
932
for (i = 0; i < cs->num_slab_buffers; i++) {
933
amdgpu_winsys_bo_reference(ws, &cs->slab_buffers[i].bo, NULL);
934
}
935
for (i = 0; i < cs->num_sparse_buffers; i++) {
936
amdgpu_winsys_bo_reference(ws, &cs->sparse_buffers[i].bo, NULL);
937
}
938
cleanup_fence_list(&cs->fence_dependencies);
939
cleanup_fence_list(&cs->syncobj_dependencies);
940
cleanup_fence_list(&cs->syncobj_to_signal);
941
cleanup_fence_list(&cs->compute_fence_dependencies);
942
cleanup_fence_list(&cs->compute_start_fence_dependencies);
943
944
cs->num_real_buffers = 0;
945
cs->num_slab_buffers = 0;
946
cs->num_sparse_buffers = 0;
947
amdgpu_fence_reference(&cs->fence, NULL);
948
cs->last_added_bo = NULL;
949
}
950
951
static void amdgpu_destroy_cs_context(struct amdgpu_winsys *ws, struct amdgpu_cs_context *cs)
952
{
953
amdgpu_cs_context_cleanup(ws, cs);
954
FREE(cs->real_buffers);
955
FREE(cs->slab_buffers);
956
FREE(cs->sparse_buffers);
957
FREE(cs->fence_dependencies.list);
958
FREE(cs->syncobj_dependencies.list);
959
FREE(cs->syncobj_to_signal.list);
960
FREE(cs->compute_fence_dependencies.list);
961
FREE(cs->compute_start_fence_dependencies.list);
962
}
963
964
965
static bool
966
amdgpu_cs_create(struct radeon_cmdbuf *rcs,
967
struct radeon_winsys_ctx *rwctx,
968
enum ring_type ring_type,
969
void (*flush)(void *ctx, unsigned flags,
970
struct pipe_fence_handle **fence),
971
void *flush_ctx,
972
bool stop_exec_on_failure)
973
{
974
struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
975
struct amdgpu_cs *cs;
976
977
cs = CALLOC_STRUCT(amdgpu_cs);
978
if (!cs) {
979
return false;
980
}
981
982
util_queue_fence_init(&cs->flush_completed);
983
984
cs->ws = ctx->ws;
985
cs->ctx = ctx;
986
cs->flush_cs = flush;
987
cs->flush_data = flush_ctx;
988
cs->ring_type = ring_type;
989
cs->stop_exec_on_failure = stop_exec_on_failure;
990
cs->noop = ctx->ws->noop_cs;
991
cs->has_chaining = ctx->ws->info.chip_class >= GFX7 &&
992
(ring_type == RING_GFX || ring_type == RING_COMPUTE);
993
994
struct amdgpu_cs_fence_info fence_info;
995
fence_info.handle = cs->ctx->user_fence_bo;
996
fence_info.offset = cs->ring_type * 4;
997
amdgpu_cs_chunk_fence_info_to_data(&fence_info, (void*)&cs->fence_chunk);
998
999
cs->main.ib_type = IB_MAIN;
1000
cs->compute_ib.ib_type = IB_PARALLEL_COMPUTE;
1001
1002
if (!amdgpu_init_cs_context(ctx->ws, &cs->csc1, ring_type)) {
1003
FREE(cs);
1004
return false;
1005
}
1006
1007
if (!amdgpu_init_cs_context(ctx->ws, &cs->csc2, ring_type)) {
1008
amdgpu_destroy_cs_context(ctx->ws, &cs->csc1);
1009
FREE(cs);
1010
return false;
1011
}
1012
1013
memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
1014
1015
/* Set the first submission context as current. */
1016
cs->csc = &cs->csc1;
1017
cs->cst = &cs->csc2;
1018
1019
/* Assign to both amdgpu_cs_context; only csc will use it. */
1020
cs->csc1.buffer_indices_hashlist = cs->buffer_indices_hashlist;
1021
cs->csc2.buffer_indices_hashlist = cs->buffer_indices_hashlist;
1022
1023
cs->main.rcs = rcs;
1024
rcs->priv = cs;
1025
1026
if (!amdgpu_get_new_ib(ctx->ws, rcs, &cs->main, cs)) {
1027
amdgpu_destroy_cs_context(ctx->ws, &cs->csc2);
1028
amdgpu_destroy_cs_context(ctx->ws, &cs->csc1);
1029
FREE(cs);
1030
rcs->priv = NULL;
1031
return false;
1032
}
1033
1034
p_atomic_inc(&ctx->ws->num_cs);
1035
return true;
1036
}
1037
1038
static bool
1039
amdgpu_cs_add_parallel_compute_ib(struct radeon_cmdbuf *compute_cs,
1040
struct radeon_cmdbuf *gfx_cs,
1041
bool uses_gds_ordered_append)
1042
{
1043
struct amdgpu_cs *cs = amdgpu_cs(gfx_cs);
1044
struct amdgpu_winsys *ws = cs->ws;
1045
1046
if (cs->ring_type != RING_GFX)
1047
return false;
1048
1049
/* only one secondary IB can be added */
1050
if (cs->compute_ib.ib_mapped)
1051
return false;
1052
1053
/* Allocate the compute IB. */
1054
if (!amdgpu_get_new_ib(ws, compute_cs, &cs->compute_ib, cs))
1055
return false;
1056
1057
if (uses_gds_ordered_append) {
1058
cs->csc1.ib[IB_PARALLEL_COMPUTE].flags |=
1059
AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID;
1060
cs->csc2.ib[IB_PARALLEL_COMPUTE].flags |=
1061
AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID;
1062
}
1063
1064
cs->compute_ib.rcs = compute_cs;
1065
compute_cs->priv = cs;
1066
return true;
1067
}
1068
1069
static bool
1070
amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib,
1071
unsigned preamble_num_dw)
1072
{
1073
struct amdgpu_cs *cs = amdgpu_cs(rcs);
1074
struct amdgpu_winsys *ws = cs->ws;
1075
struct amdgpu_cs_context *csc[2] = {&cs->csc1, &cs->csc2};
1076
unsigned size = align(preamble_num_dw * 4, ws->info.ib_alignment);
1077
struct pb_buffer *preamble_bo;
1078
uint32_t *map;
1079
1080
/* Create the preamble IB buffer. */
1081
preamble_bo = amdgpu_bo_create(ws, size, ws->info.ib_alignment,
1082
RADEON_DOMAIN_VRAM,
1083
RADEON_FLAG_NO_INTERPROCESS_SHARING |
1084
RADEON_FLAG_GTT_WC |
1085
RADEON_FLAG_READ_ONLY);
1086
if (!preamble_bo)
1087
return false;
1088
1089
map = (uint32_t*)amdgpu_bo_map(&ws->dummy_ws.base, preamble_bo, NULL,
1090
PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
1091
if (!map) {
1092
radeon_bo_reference(&ws->dummy_ws.base, &preamble_bo, NULL);
1093
return false;
1094
}
1095
1096
/* Upload the preamble IB. */
1097
memcpy(map, preamble_ib, preamble_num_dw * 4);
1098
1099
/* Pad the IB. */
1100
uint32_t ib_pad_dw_mask = ws->info.ib_pad_dw_mask[cs->ring_type];
1101
while (preamble_num_dw & ib_pad_dw_mask)
1102
map[preamble_num_dw++] = PKT3_NOP_PAD;
1103
amdgpu_bo_unmap(&ws->dummy_ws.base, preamble_bo);
1104
1105
for (unsigned i = 0; i < 2; i++) {
1106
csc[i]->ib[IB_PREAMBLE] = csc[i]->ib[IB_MAIN];
1107
csc[i]->ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAG_PREAMBLE;
1108
csc[i]->ib[IB_PREAMBLE].va_start = amdgpu_winsys_bo(preamble_bo)->va;
1109
csc[i]->ib[IB_PREAMBLE].ib_bytes = preamble_num_dw * 4;
1110
1111
csc[i]->ib[IB_MAIN].flags |= AMDGPU_IB_FLAG_PREEMPT;
1112
}
1113
1114
assert(!cs->preamble_ib_bo);
1115
cs->preamble_ib_bo = preamble_bo;
1116
1117
amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo, RADEON_USAGE_READ, 0,
1118
RADEON_PRIO_IB1);
1119
return true;
1120
}
1121
1122
static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs)
1123
{
1124
return true;
1125
}
1126
1127
static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
1128
bool force_chaining)
1129
{
1130
struct amdgpu_cs *cs = amdgpu_cs(rcs);
1131
struct amdgpu_ib *ib = rcs == cs->main.rcs ? &cs->main : &cs->compute_ib;
1132
unsigned requested_size = rcs->prev_dw + rcs->current.cdw + dw;
1133
unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs);
1134
unsigned need_byte_size = (dw + cs_epilog_dw) * 4;
1135
uint64_t va;
1136
uint32_t *new_ptr_ib_size;
1137
1138
assert(rcs->current.cdw <= rcs->current.max_dw);
1139
1140
/* 125% of the size for IB epilog. */
1141
unsigned safe_byte_size = need_byte_size + need_byte_size / 4;
1142
ib->max_check_space_size = MAX2(ib->max_check_space_size,
1143
safe_byte_size);
1144
1145
/* If force_chaining is true, we can't return. We have to chain. */
1146
if (!force_chaining) {
1147
if (requested_size > amdgpu_ib_max_submit_dwords(ib->ib_type))
1148
return false;
1149
1150
ib->max_ib_size = MAX2(ib->max_ib_size, requested_size);
1151
1152
if (rcs->current.max_dw - rcs->current.cdw >= dw)
1153
return true;
1154
}
1155
1156
if (!cs->has_chaining) {
1157
assert(!force_chaining);
1158
return false;
1159
}
1160
1161
/* Allocate a new chunk */
1162
if (rcs->num_prev >= rcs->max_prev) {
1163
unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev);
1164
struct radeon_cmdbuf_chunk *new_prev;
1165
1166
new_prev = REALLOC(rcs->prev,
1167
sizeof(*new_prev) * rcs->max_prev,
1168
sizeof(*new_prev) * new_max_prev);
1169
if (!new_prev)
1170
return false;
1171
1172
rcs->prev = new_prev;
1173
rcs->max_prev = new_max_prev;
1174
}
1175
1176
if (!amdgpu_ib_new_buffer(cs->ws, ib, cs))
1177
return false;
1178
1179
assert(ib->used_ib_space == 0);
1180
va = amdgpu_winsys_bo(ib->big_ib_buffer)->va;
1181
1182
/* This space was originally reserved. */
1183
rcs->current.max_dw += cs_epilog_dw;
1184
1185
/* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
1186
uint32_t ib_pad_dw_mask = cs->ws->info.ib_pad_dw_mask[cs->ring_type];
1187
while ((rcs->current.cdw & ib_pad_dw_mask) != ib_pad_dw_mask - 3)
1188
radeon_emit(rcs, PKT3_NOP_PAD);
1189
1190
radeon_emit(rcs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
1191
radeon_emit(rcs, va);
1192
radeon_emit(rcs, va >> 32);
1193
new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++];
1194
assert((rcs->current.cdw & ib_pad_dw_mask) == 0);
1195
1196
assert((rcs->current.cdw & 7) == 0);
1197
assert(rcs->current.cdw <= rcs->current.max_dw);
1198
1199
amdgpu_set_ib_size(rcs, ib);
1200
ib->ptr_ib_size = new_ptr_ib_size;
1201
ib->ptr_ib_size_inside_ib = true;
1202
1203
/* Hook up the new chunk */
1204
rcs->prev[rcs->num_prev].buf = rcs->current.buf;
1205
rcs->prev[rcs->num_prev].cdw = rcs->current.cdw;
1206
rcs->prev[rcs->num_prev].max_dw = rcs->current.cdw; /* no modifications */
1207
rcs->num_prev++;
1208
1209
rcs->prev_dw += rcs->current.cdw;
1210
rcs->current.cdw = 0;
1211
1212
rcs->current.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);
1213
rcs->current.max_dw = ib->big_ib_buffer->size / 4 - cs_epilog_dw;
1214
rcs->gpu_address = va;
1215
1216
amdgpu_cs_add_buffer(cs->main.rcs, ib->big_ib_buffer,
1217
RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
1218
1219
return true;
1220
}
1221
1222
static unsigned amdgpu_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
1223
struct radeon_bo_list_item *list)
1224
{
1225
struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc;
1226
int i;
1227
1228
if (list) {
1229
for (i = 0; i < cs->num_real_buffers; i++) {
1230
list[i].bo_size = cs->real_buffers[i].bo->base.size;
1231
list[i].vm_address = cs->real_buffers[i].bo->va;
1232
list[i].priority_usage = cs->real_buffers[i].u.real.priority_usage;
1233
}
1234
}
1235
return cs->num_real_buffers;
1236
}
1237
1238
static void add_fence_to_list(struct amdgpu_fence_list *fences,
1239
struct amdgpu_fence *fence)
1240
{
1241
unsigned idx = fences->num++;
1242
1243
if (idx >= fences->max) {
1244
unsigned size;
1245
const unsigned increment = 8;
1246
1247
fences->max = idx + increment;
1248
size = fences->max * sizeof(fences->list[0]);
1249
fences->list = realloc(fences->list, size);
1250
/* Clear the newly-allocated elements. */
1251
memset(fences->list + idx, 0,
1252
increment * sizeof(fences->list[0]));
1253
}
1254
amdgpu_fence_reference(&fences->list[idx], (struct pipe_fence_handle*)fence);
1255
}
1256
1257
static bool is_noop_fence_dependency(struct amdgpu_cs *acs,
1258
struct amdgpu_fence *fence)
1259
{
1260
struct amdgpu_cs_context *cs = acs->csc;
1261
1262
/* Detect no-op dependencies only when there is only 1 ring,
1263
* because IBs on one ring are always executed one at a time.
1264
*
1265
* We always want no dependency between back-to-back gfx IBs, because
1266
* we need the parallelism between IBs for good performance.
1267
*/
1268
if ((acs->ring_type == RING_GFX ||
1269
acs->ws->info.num_rings[acs->ring_type] == 1) &&
1270
!amdgpu_fence_is_syncobj(fence) &&
1271
fence->ctx == acs->ctx &&
1272
fence->fence.ip_type == cs->ib[IB_MAIN].ip_type &&
1273
fence->fence.ip_instance == cs->ib[IB_MAIN].ip_instance &&
1274
fence->fence.ring == cs->ib[IB_MAIN].ring)
1275
return true;
1276
1277
return amdgpu_fence_wait((void *)fence, 0, false);
1278
}
1279
1280
static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws,
1281
struct pipe_fence_handle *pfence,
1282
unsigned dependency_flags)
1283
{
1284
struct amdgpu_cs *acs = amdgpu_cs(rws);
1285
struct amdgpu_cs_context *cs = acs->csc;
1286
struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
1287
1288
util_queue_fence_wait(&fence->submitted);
1289
1290
if (dependency_flags & RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY) {
1291
/* Syncobjs are not needed here. */
1292
assert(!amdgpu_fence_is_syncobj(fence));
1293
1294
if (acs->ws->info.has_scheduled_fence_dependency &&
1295
dependency_flags & RADEON_DEPENDENCY_START_FENCE)
1296
add_fence_to_list(&cs->compute_start_fence_dependencies, fence);
1297
else
1298
add_fence_to_list(&cs->compute_fence_dependencies, fence);
1299
return;
1300
}
1301
1302
/* Start fences are not needed here. */
1303
assert(!(dependency_flags & RADEON_DEPENDENCY_START_FENCE));
1304
1305
if (is_noop_fence_dependency(acs, fence))
1306
return;
1307
1308
if (amdgpu_fence_is_syncobj(fence))
1309
add_fence_to_list(&cs->syncobj_dependencies, fence);
1310
else
1311
add_fence_to_list(&cs->fence_dependencies, fence);
1312
}
1313
1314
static void amdgpu_add_bo_fence_dependencies(struct amdgpu_cs *acs,
1315
struct amdgpu_cs_buffer *buffer)
1316
{
1317
struct amdgpu_cs_context *cs = acs->csc;
1318
struct amdgpu_winsys_bo *bo = buffer->bo;
1319
unsigned new_num_fences = 0;
1320
1321
for (unsigned j = 0; j < bo->num_fences; ++j) {
1322
struct amdgpu_fence *bo_fence = (void *)bo->fences[j];
1323
1324
if (is_noop_fence_dependency(acs, bo_fence))
1325
continue;
1326
1327
amdgpu_fence_reference(&bo->fences[new_num_fences], bo->fences[j]);
1328
new_num_fences++;
1329
1330
if (!(buffer->usage & RADEON_USAGE_SYNCHRONIZED))
1331
continue;
1332
1333
add_fence_to_list(&cs->fence_dependencies, bo_fence);
1334
}
1335
1336
for (unsigned j = new_num_fences; j < bo->num_fences; ++j)
1337
amdgpu_fence_reference(&bo->fences[j], NULL);
1338
1339
bo->num_fences = new_num_fences;
1340
}
1341
1342
/* Add the given list of fences to the buffer's fence list.
1343
*
1344
* Must be called with the winsys bo_fence_lock held.
1345
*/
1346
void amdgpu_add_fences(struct amdgpu_winsys_bo *bo,
1347
unsigned num_fences,
1348
struct pipe_fence_handle **fences)
1349
{
1350
if (bo->num_fences + num_fences > bo->max_fences) {
1351
unsigned new_max_fences = MAX2(bo->num_fences + num_fences, bo->max_fences * 2);
1352
struct pipe_fence_handle **new_fences =
1353
REALLOC(bo->fences,
1354
bo->num_fences * sizeof(*new_fences),
1355
new_max_fences * sizeof(*new_fences));
1356
if (likely(new_fences && new_max_fences < UINT16_MAX)) {
1357
bo->fences = new_fences;
1358
bo->max_fences = new_max_fences;
1359
} else {
1360
unsigned drop;
1361
1362
fprintf(stderr, new_fences ? "amdgpu_add_fences: too many fences, dropping some\n"
1363
: "amdgpu_add_fences: allocation failure, dropping fence(s)\n");
1364
free(new_fences);
1365
1366
if (!bo->num_fences)
1367
return;
1368
1369
bo->num_fences--; /* prefer to keep the most recent fence if possible */
1370
amdgpu_fence_reference(&bo->fences[bo->num_fences], NULL);
1371
1372
drop = bo->num_fences + num_fences - bo->max_fences;
1373
num_fences -= drop;
1374
fences += drop;
1375
}
1376
}
1377
1378
for (unsigned i = 0; i < num_fences; ++i) {
1379
bo->fences[bo->num_fences] = NULL;
1380
amdgpu_fence_reference(&bo->fences[bo->num_fences], fences[i]);
1381
bo->num_fences++;
1382
}
1383
}
1384
1385
static void amdgpu_add_fence_dependencies_bo_list(struct amdgpu_cs *acs,
1386
struct pipe_fence_handle *fence,
1387
unsigned num_buffers,
1388
struct amdgpu_cs_buffer *buffers)
1389
{
1390
for (unsigned i = 0; i < num_buffers; i++) {
1391
struct amdgpu_cs_buffer *buffer = &buffers[i];
1392
struct amdgpu_winsys_bo *bo = buffer->bo;
1393
1394
amdgpu_add_bo_fence_dependencies(acs, buffer);
1395
p_atomic_inc(&bo->num_active_ioctls);
1396
amdgpu_add_fences(bo, 1, &fence);
1397
}
1398
}
1399
1400
/* Since the kernel driver doesn't synchronize execution between different
1401
* rings automatically, we have to add fence dependencies manually.
1402
*/
1403
static void amdgpu_add_fence_dependencies_bo_lists(struct amdgpu_cs *acs)
1404
{
1405
struct amdgpu_cs_context *cs = acs->csc;
1406
1407
amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_real_buffers, cs->real_buffers);
1408
amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_slab_buffers, cs->slab_buffers);
1409
amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_sparse_buffers, cs->sparse_buffers);
1410
}
1411
1412
static void amdgpu_cs_add_syncobj_signal(struct radeon_cmdbuf *rws,
1413
struct pipe_fence_handle *fence)
1414
{
1415
struct amdgpu_cs *acs = amdgpu_cs(rws);
1416
struct amdgpu_cs_context *cs = acs->csc;
1417
1418
assert(amdgpu_fence_is_syncobj((struct amdgpu_fence *)fence));
1419
1420
add_fence_to_list(&cs->syncobj_to_signal, (struct amdgpu_fence*)fence);
1421
}
1422
1423
/* Add backing of sparse buffers to the buffer list.
1424
*
1425
* This is done late, during submission, to keep the buffer list short before
1426
* submit, and to avoid managing fences for the backing buffers.
1427
*/
1428
static bool amdgpu_add_sparse_backing_buffers(struct amdgpu_winsys *ws,
1429
struct amdgpu_cs_context *cs)
1430
{
1431
for (unsigned i = 0; i < cs->num_sparse_buffers; ++i) {
1432
struct amdgpu_cs_buffer *buffer = &cs->sparse_buffers[i];
1433
struct amdgpu_winsys_bo *bo = buffer->bo;
1434
1435
simple_mtx_lock(&bo->lock);
1436
1437
list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
1438
/* We can directly add the buffer here, because we know that each
1439
* backing buffer occurs only once.
1440
*/
1441
int idx = amdgpu_do_add_real_buffer(ws, cs, backing->bo);
1442
if (idx < 0) {
1443
fprintf(stderr, "%s: failed to add buffer\n", __FUNCTION__);
1444
simple_mtx_unlock(&bo->lock);
1445
return false;
1446
}
1447
1448
cs->real_buffers[idx].u.real.priority_usage = buffer->u.real.priority_usage;
1449
}
1450
1451
simple_mtx_unlock(&bo->lock);
1452
}
1453
1454
return true;
1455
}
1456
1457
static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
1458
{
1459
struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
1460
struct amdgpu_winsys *ws = acs->ws;
1461
struct amdgpu_cs_context *cs = acs->cst;
1462
int i, r;
1463
uint32_t bo_list = 0;
1464
uint64_t seq_no = 0;
1465
bool has_user_fence = amdgpu_cs_has_user_fence(cs);
1466
bool use_bo_list_create = ws->info.drm_minor < 27;
1467
struct drm_amdgpu_bo_list_in bo_list_in;
1468
unsigned initial_num_real_buffers = cs->num_real_buffers;
1469
1470
#if DEBUG
1471
/* Prepare the buffer list. */
1472
if (ws->debug_all_bos) {
1473
/* The buffer list contains all buffers. This is a slow path that
1474
* ensures that no buffer is missing in the BO list.
1475
*/
1476
unsigned num_handles = 0;
1477
struct drm_amdgpu_bo_list_entry *list =
1478
alloca(ws->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
1479
struct amdgpu_winsys_bo *bo;
1480
1481
simple_mtx_lock(&ws->global_bo_list_lock);
1482
LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, u.real.global_list_item) {
1483
list[num_handles].bo_handle = bo->u.real.kms_handle;
1484
list[num_handles].bo_priority = 0;
1485
++num_handles;
1486
}
1487
1488
r = amdgpu_bo_list_create_raw(ws->dev, ws->num_buffers, list, &bo_list);
1489
simple_mtx_unlock(&ws->global_bo_list_lock);
1490
if (r) {
1491
fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r);
1492
goto cleanup;
1493
}
1494
} else
1495
#endif
1496
{
1497
if (!amdgpu_add_sparse_backing_buffers(ws, cs)) {
1498
fprintf(stderr, "amdgpu: amdgpu_add_sparse_backing_buffers failed\n");
1499
r = -ENOMEM;
1500
goto cleanup;
1501
}
1502
1503
struct drm_amdgpu_bo_list_entry *list =
1504
alloca((cs->num_real_buffers + 2) * sizeof(struct drm_amdgpu_bo_list_entry));
1505
1506
unsigned num_handles = 0;
1507
for (i = 0; i < cs->num_real_buffers; ++i) {
1508
struct amdgpu_cs_buffer *buffer = &cs->real_buffers[i];
1509
assert(buffer->u.real.priority_usage != 0);
1510
1511
list[num_handles].bo_handle = buffer->bo->u.real.kms_handle;
1512
list[num_handles].bo_priority = (util_last_bit(buffer->u.real.priority_usage) - 1) / 2;
1513
++num_handles;
1514
}
1515
1516
if (use_bo_list_create) {
1517
/* Legacy path creating the buffer list handle and passing it to the CS ioctl. */
1518
r = amdgpu_bo_list_create_raw(ws->dev, num_handles, list, &bo_list);
1519
if (r) {
1520
fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r);
1521
goto cleanup;
1522
}
1523
} else {
1524
/* Standard path passing the buffer list via the CS ioctl. */
1525
bo_list_in.operation = ~0;
1526
bo_list_in.list_handle = ~0;
1527
bo_list_in.bo_number = num_handles;
1528
bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
1529
bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)list;
1530
}
1531
}
1532
1533
if (acs->ring_type == RING_GFX)
1534
ws->gfx_bo_list_counter += cs->num_real_buffers;
1535
1536
if (acs->stop_exec_on_failure && acs->ctx->num_rejected_cs) {
1537
r = -ECANCELED;
1538
} else {
1539
struct drm_amdgpu_cs_chunk chunks[7];
1540
unsigned num_chunks = 0;
1541
1542
/* BO list */
1543
if (!use_bo_list_create) {
1544
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
1545
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
1546
chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
1547
num_chunks++;
1548
}
1549
1550
/* Fence dependencies. */
1551
unsigned num_dependencies = cs->fence_dependencies.num;
1552
if (num_dependencies) {
1553
struct drm_amdgpu_cs_chunk_dep *dep_chunk =
1554
alloca(num_dependencies * sizeof(*dep_chunk));
1555
1556
for (unsigned i = 0; i < num_dependencies; i++) {
1557
struct amdgpu_fence *fence =
1558
(struct amdgpu_fence*)cs->fence_dependencies.list[i];
1559
1560
assert(util_queue_fence_is_signalled(&fence->submitted));
1561
amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]);
1562
}
1563
1564
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES;
1565
chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_dependencies;
1566
chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
1567
num_chunks++;
1568
}
1569
1570
/* Syncobj dependencies. */
1571
unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num;
1572
if (num_syncobj_dependencies) {
1573
struct drm_amdgpu_cs_chunk_sem *sem_chunk =
1574
alloca(num_syncobj_dependencies * sizeof(sem_chunk[0]));
1575
1576
for (unsigned i = 0; i < num_syncobj_dependencies; i++) {
1577
struct amdgpu_fence *fence =
1578
(struct amdgpu_fence*)cs->syncobj_dependencies.list[i];
1579
1580
if (!amdgpu_fence_is_syncobj(fence))
1581
continue;
1582
1583
assert(util_queue_fence_is_signalled(&fence->submitted));
1584
sem_chunk[i].handle = fence->syncobj;
1585
}
1586
1587
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
1588
chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_dependencies;
1589
chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
1590
num_chunks++;
1591
}
1592
1593
/* Submit the parallel compute IB first. */
1594
if (cs->ib[IB_PARALLEL_COMPUTE].ib_bytes > 0) {
1595
unsigned old_num_chunks = num_chunks;
1596
1597
/* Add compute fence dependencies. */
1598
unsigned num_dependencies = cs->compute_fence_dependencies.num;
1599
if (num_dependencies) {
1600
struct drm_amdgpu_cs_chunk_dep *dep_chunk =
1601
alloca(num_dependencies * sizeof(*dep_chunk));
1602
1603
for (unsigned i = 0; i < num_dependencies; i++) {
1604
struct amdgpu_fence *fence =
1605
(struct amdgpu_fence*)cs->compute_fence_dependencies.list[i];
1606
1607
assert(util_queue_fence_is_signalled(&fence->submitted));
1608
amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]);
1609
}
1610
1611
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES;
1612
chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_dependencies;
1613
chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
1614
num_chunks++;
1615
}
1616
1617
/* Add compute start fence dependencies. */
1618
unsigned num_start_dependencies = cs->compute_start_fence_dependencies.num;
1619
if (num_start_dependencies) {
1620
struct drm_amdgpu_cs_chunk_dep *dep_chunk =
1621
alloca(num_start_dependencies * sizeof(*dep_chunk));
1622
1623
for (unsigned i = 0; i < num_start_dependencies; i++) {
1624
struct amdgpu_fence *fence =
1625
(struct amdgpu_fence*)cs->compute_start_fence_dependencies.list[i];
1626
1627
assert(util_queue_fence_is_signalled(&fence->submitted));
1628
amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]);
1629
}
1630
1631
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES;
1632
chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_start_dependencies;
1633
chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
1634
num_chunks++;
1635
}
1636
1637
/* Convert from dwords to bytes. */
1638
cs->ib[IB_PARALLEL_COMPUTE].ib_bytes *= 4;
1639
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
1640
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1641
chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_PARALLEL_COMPUTE];
1642
num_chunks++;
1643
1644
r = acs->noop ? 0 : amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
1645
num_chunks, chunks, NULL);
1646
if (r)
1647
goto finalize;
1648
1649
/* Back off the compute chunks. */
1650
num_chunks = old_num_chunks;
1651
}
1652
1653
/* Syncobj signals. */
1654
unsigned num_syncobj_to_signal = cs->syncobj_to_signal.num;
1655
if (num_syncobj_to_signal) {
1656
struct drm_amdgpu_cs_chunk_sem *sem_chunk =
1657
alloca(num_syncobj_to_signal * sizeof(sem_chunk[0]));
1658
1659
for (unsigned i = 0; i < num_syncobj_to_signal; i++) {
1660
struct amdgpu_fence *fence =
1661
(struct amdgpu_fence*)cs->syncobj_to_signal.list[i];
1662
1663
assert(amdgpu_fence_is_syncobj(fence));
1664
sem_chunk[i].handle = fence->syncobj;
1665
}
1666
1667
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_OUT;
1668
chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4
1669
* num_syncobj_to_signal;
1670
chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
1671
num_chunks++;
1672
}
1673
1674
/* Fence */
1675
if (has_user_fence) {
1676
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE;
1677
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
1678
chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk;
1679
num_chunks++;
1680
}
1681
1682
/* IB */
1683
if (cs->ib[IB_PREAMBLE].ib_bytes) {
1684
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
1685
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1686
chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_PREAMBLE];
1687
num_chunks++;
1688
}
1689
1690
/* IB */
1691
cs->ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */
1692
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
1693
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1694
chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_MAIN];
1695
num_chunks++;
1696
1697
if (cs->secure) {
1698
cs->ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE;
1699
cs->ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE;
1700
} else {
1701
cs->ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE;
1702
cs->ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE;
1703
}
1704
1705
assert(num_chunks <= ARRAY_SIZE(chunks));
1706
1707
r = acs->noop ? 0 : amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
1708
num_chunks, chunks, &seq_no);
1709
}
1710
finalize:
1711
if (r) {
1712
if (r == -ENOMEM)
1713
fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
1714
else if (r == -ECANCELED)
1715
fprintf(stderr, "amdgpu: The CS has been cancelled because the context is lost.\n");
1716
else
1717
fprintf(stderr, "amdgpu: The CS has been rejected, "
1718
"see dmesg for more information (%i).\n", r);
1719
1720
acs->ctx->num_rejected_cs++;
1721
ws->num_total_rejected_cs++;
1722
} else if (!acs->noop) {
1723
/* Success. */
1724
uint64_t *user_fence = NULL;
1725
1726
/* Need to reserve 4 QWORD for user fence:
1727
* QWORD[0]: completed fence
1728
* QWORD[1]: preempted fence
1729
* QWORD[2]: reset fence
1730
* QWORD[3]: preempted then reset
1731
**/
1732
if (has_user_fence)
1733
user_fence = acs->ctx->user_fence_cpu_address_base + acs->ring_type * 4;
1734
      amdgpu_fence_submitted(cs->fence, seq_no, user_fence);
   }

   /* Cleanup. */
   if (bo_list)
      amdgpu_bo_list_destroy_raw(ws->dev, bo_list);

cleanup:
   /* If there was an error, signal the fence, because it won't be signalled
    * by the hardware. */
   if (r || acs->noop)
      amdgpu_fence_signalled(cs->fence);

   cs->error_code = r;

   /* Only decrement num_active_ioctls for those buffers where we incremented it. */
   for (i = 0; i < initial_num_real_buffers; i++)
      p_atomic_dec(&cs->real_buffers[i].bo->num_active_ioctls);
   for (i = 0; i < cs->num_slab_buffers; i++)
      p_atomic_dec(&cs->slab_buffers[i].bo->num_active_ioctls);
   for (i = 0; i < cs->num_sparse_buffers; i++)
      p_atomic_dec(&cs->sparse_buffers[i].bo->num_active_ioctls);

   amdgpu_cs_context_cleanup(ws, cs);
}

/* Make sure the previous submission is completed. */
void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);

   /* Wait for any pending ioctl of this CS to complete. */
   util_queue_fence_wait(&cs->flush_completed);
}

static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
                           unsigned flags,
                           struct pipe_fence_handle **fence)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);
   struct amdgpu_winsys *ws = cs->ws;
   int error_code = 0;
   uint32_t ib_pad_dw_mask = ws->info.ib_pad_dw_mask[cs->ring_type];

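   /* Raise the size limit to cover the dwords the flush path appends after the
    * last user packet, so that the epilog itself cannot trigger the overflow
    * check below. */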
   rcs->current.max_dw += amdgpu_cs_epilog_dws(cs);

   /* Pad the IB according to the mask. */
   switch (cs->ring_type) {
   case RING_DMA:
      if (ws->info.chip_class <= GFX6) {
         while (rcs->current.cdw & ib_pad_dw_mask)
            radeon_emit(rcs, 0xf0000000); /* NOP packet */
      } else {
         while (rcs->current.cdw & ib_pad_dw_mask)
            radeon_emit(rcs, 0x00000000); /* NOP packet */
      }
      break;
   case RING_GFX:
   case RING_COMPUTE:
      if (ws->info.gfx_ib_pad_with_type2) {
         while (rcs->current.cdw & ib_pad_dw_mask)
            radeon_emit(rcs, PKT2_NOP_PAD);
      } else {
         while (rcs->current.cdw & ib_pad_dw_mask)
            radeon_emit(rcs, PKT3_NOP_PAD);
      }
      if (cs->ring_type == RING_GFX)
         ws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;

      /* Also pad secondary IBs. */
      if (cs->compute_ib.ib_mapped) {
         while (cs->compute_ib.rcs->current.cdw & ib_pad_dw_mask)
            radeon_emit(cs->compute_ib.rcs, PKT3_NOP_PAD);
      }
      break;
   case RING_UVD:
   case RING_UVD_ENC:
      while (rcs->current.cdw & ib_pad_dw_mask)
         radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      break;
   case RING_VCN_JPEG:
      if (rcs->current.cdw % 2)
         assert(0);
      while (rcs->current.cdw & ib_pad_dw_mask) {
         radeon_emit(rcs, 0x60000000); /* nop packet */
         radeon_emit(rcs, 0x00000000);
      }
      break;
   case RING_VCN_DEC:
      while (rcs->current.cdw & ib_pad_dw_mask)
         radeon_emit(rcs, 0x81ff); /* nop packet */
      break;
   default:
      break;
   }

   if (rcs->current.cdw > rcs->current.max_dw) {
      fprintf(stderr, "amdgpu: command stream overflowed\n");
   }

   /* Submit only if the CS is not empty, hasn't overflowed, and this isn't a
    * no-op flush. */
   if (likely(radeon_emitted(rcs, 0) &&
       rcs->current.cdw <= rcs->current.max_dw &&
       !(flags & RADEON_FLUSH_NOOP))) {
      struct amdgpu_cs_context *cur = cs->csc;

      /* Set IB sizes. */
      amdgpu_ib_finalize(ws, rcs, &cs->main);

      if (cs->compute_ib.ib_mapped)
         amdgpu_ib_finalize(ws, cs->compute_ib.rcs, &cs->compute_ib);

      /* Create a fence. */
      amdgpu_fence_reference(&cur->fence, NULL);
      if (cs->next_fence) {
         /* just move the reference */
         cur->fence = cs->next_fence;
         cs->next_fence = NULL;
      } else {
         cur->fence = amdgpu_fence_create(cs->ctx,
                                          cur->ib[IB_MAIN].ip_type,
                                          cur->ib[IB_MAIN].ip_instance,
                                          cur->ib[IB_MAIN].ring);
      }
      if (fence)
         amdgpu_fence_reference(fence, cur->fence);

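      /* Wait for the previous flush of this CS to finish, so its context (cst)
       * can be reused for this submission. */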
      amdgpu_cs_sync_flush(rcs);

      /* Prepare buffers.
       *
       * This lock must be held until the submission is queued to ensure
       * that the order of fence dependency updates matches the order of
       * submissions.
       */
      simple_mtx_lock(&ws->bo_fence_lock);
      amdgpu_add_fence_dependencies_bo_lists(cs);

      /* Swap command streams. "cst" is going to be submitted. */
      cs->csc = cs->cst;
      cs->cst = cur;

      /* Submit. */
      util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed,
                         amdgpu_cs_submit_ib, NULL, 0);

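      /* RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION flips the secure (TMZ) state for
       * the next CS; otherwise the new CS inherits the secure state of the one
       * just queued. */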
      if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
         cs->csc->secure = !cs->cst->secure;
      else
         cs->csc->secure = cs->cst->secure;

      /* The submission has been queued, so bo_fence_lock can be released now. */
      simple_mtx_unlock(&ws->bo_fence_lock);

      if (!(flags & PIPE_FLUSH_ASYNC)) {
         amdgpu_cs_sync_flush(rcs);
         error_code = cur->error_code;
      }
   } else {
      if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
         cs->csc->secure = !cs->csc->secure;
      amdgpu_cs_context_cleanup(ws, cs->csc);
   }

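   /* Reset the buffer lookup hashlist for the now-empty buffer list; -1 means
    * "no entry". */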
   memset(cs->csc->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));

   amdgpu_get_new_ib(ws, rcs, &cs->main, cs);
   if (cs->compute_ib.ib_mapped)
      amdgpu_get_new_ib(ws, cs->compute_ib.rcs, &cs->compute_ib, cs);

   if (cs->preamble_ib_bo) {
      amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo, RADEON_USAGE_READ, 0,
                           RADEON_PRIO_IB1);
   }

   rcs->used_gart_kb = 0;
   rcs->used_vram_kb = 0;

   if (cs->ring_type == RING_GFX)
      ws->num_gfx_IBs++;
   else if (cs->ring_type == RING_DMA)
      ws->num_sdma_IBs++;

   return error_code;
}

static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);

   if (!cs)
      return;

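   /* Make sure the submission thread is no longer using this CS before
    * tearing it down. */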
   amdgpu_cs_sync_flush(rcs);
   util_queue_fence_destroy(&cs->flush_completed);
   p_atomic_dec(&cs->ws->num_cs);
   radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->preamble_ib_bo, NULL);
   radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->main.big_ib_buffer, NULL);
   FREE(rcs->prev);
   radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->compute_ib.big_ib_buffer, NULL);
   if (cs->compute_ib.rcs)
      FREE(cs->compute_ib.rcs->prev);
   amdgpu_destroy_cs_context(cs->ws, &cs->csc1);
   amdgpu_destroy_cs_context(cs->ws, &cs->csc2);
   amdgpu_fence_reference(&cs->next_fence, NULL);
   FREE(cs);
}

static bool amdgpu_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);
   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)_buf;

   return amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, usage);
}

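/* Install the amdgpu implementations of the command-stream and fence entry
 * points into the winsys vtable. */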
void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *ws)
{
   ws->base.ctx_create = amdgpu_ctx_create;
   ws->base.ctx_destroy = amdgpu_ctx_destroy;
   ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
   ws->base.cs_create = amdgpu_cs_create;
   ws->base.cs_add_parallel_compute_ib = amdgpu_cs_add_parallel_compute_ib;
   ws->base.cs_setup_preemption = amdgpu_cs_setup_preemption;
   ws->base.cs_destroy = amdgpu_cs_destroy;
   ws->base.cs_add_buffer = amdgpu_cs_add_buffer;
   ws->base.cs_validate = amdgpu_cs_validate;
   ws->base.cs_check_space = amdgpu_cs_check_space;
   ws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list;
   ws->base.cs_flush = amdgpu_cs_flush;
   ws->base.cs_get_next_fence = amdgpu_cs_get_next_fence;
   ws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
   ws->base.cs_sync_flush = amdgpu_cs_sync_flush;
   ws->base.cs_add_fence_dependency = amdgpu_cs_add_fence_dependency;
   ws->base.cs_add_syncobj_signal = amdgpu_cs_add_syncobj_signal;
   ws->base.fence_wait = amdgpu_fence_wait_rel_timeout;
   ws->base.fence_reference = amdgpu_fence_reference;
   ws->base.fence_import_syncobj = amdgpu_fence_import_syncobj;
   ws->base.fence_import_sync_file = amdgpu_fence_import_sync_file;
   ws->base.fence_export_sync_file = amdgpu_fence_export_sync_file;
   ws->base.export_signalled_sync_file = amdgpu_export_signalled_sync_file;
}