GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/winsys/radeon/drm/radeon_drm_cs.c

/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <[email protected]>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply OR'd for accounting purposes.
    The add is skipped if the reloc is already present in the list, but it
    still accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries the validation again,
    i.e. it validates only that one operation. If that fails too, it drops
    the operation on the floor and prints a nasty message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_buffer.
 */
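
/* A minimal usage sketch, added here for illustration only and not part of the
 * upstream file. It shows how a pipe driver is assumed to drive the flow
 * described above: add every buffer referenced by an operation, validate the
 * accumulated memory usage, and flush-and-retry when validation fails.
 * 'buf', 'usage', 'domains' and 'priority' are placeholders; the entry points
 * are the winsys hooks installed by radeon_drm_cs_init_functions() at the end
 * of this file.
 *
 *    ws->cs_add_buffer(cs, buf, usage, domains, priority);
 *    if (!ws->cs_validate(cs)) {
 *       ws->cs_flush(cs, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 *       if (!ws->cs_validate(cs))
 *          fprintf(stderr, "radeon: dropping operation, validation failed twice\n");
 *    }
 */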

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "util/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>


#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
   struct radeon_ctx *ctx = CALLOC_STRUCT(radeon_ctx);
   if (!ctx)
      return NULL;

   ctx->ws = (struct radeon_drm_winsys*)ws;
   ctx->gpu_reset_counter = radeon_drm_get_gpu_reset_counter(ctx->ws);
   return (struct radeon_winsys_ctx*)ctx;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
   FREE(ctx);
}

static enum pipe_reset_status
radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx, bool full_reset_only,
                                  bool *needs_reset)
{
   struct radeon_ctx *ctx = (struct radeon_ctx*)rctx;

   unsigned latest = radeon_drm_get_gpu_reset_counter(ctx->ws);

   if (ctx->gpu_reset_counter == latest) {
      if (needs_reset)
         *needs_reset = false;
      return PIPE_NO_RESET;
   }

   if (needs_reset)
      *needs_reset = true;

   ctx->gpu_reset_counter = latest;
   return PIPE_UNKNOWN_CONTEXT_RESET;
}

static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
   int i;

   csc->fd = ws->fd;

   csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
   csc->chunks[0].length_dw = 0;
   csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
   csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
   csc->chunks[1].length_dw = 0;
   csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
   csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
   csc->chunks[2].length_dw = 2;
   csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

   csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
   csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
   csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

   csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

   for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
      csc->reloc_indices_hashlist[i] = -1;
   }
   return true;
}

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
   unsigned i;

   for (i = 0; i < csc->num_relocs; i++) {
      p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
      radeon_ws_bo_reference(&csc->relocs_bo[i].bo, NULL);
   }
   for (i = 0; i < csc->num_slab_buffers; ++i) {
      p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
      radeon_ws_bo_reference(&csc->slab_buffers[i].bo, NULL);
   }

   csc->num_relocs = 0;
   csc->num_validated_relocs = 0;
   csc->num_slab_buffers = 0;
   csc->chunks[0].length_dw = 0;
   csc->chunks[1].length_dw = 0;

   for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
      csc->reloc_indices_hashlist[i] = -1;
   }
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
   radeon_cs_context_cleanup(csc);
   FREE(csc->slab_buffers);
   FREE(csc->relocs_bo);
   FREE(csc->relocs);
}


static bool
radeon_drm_cs_create(struct radeon_cmdbuf *rcs,
                     struct radeon_winsys_ctx *ctx,
                     enum ring_type ring_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx,
                     bool stop_exec_on_failure)
{
   struct radeon_drm_winsys *ws = ((struct radeon_ctx*)ctx)->ws;
   struct radeon_drm_cs *cs;

   cs = CALLOC_STRUCT(radeon_drm_cs);
   if (!cs) {
      return false;
   }
   util_queue_fence_init(&cs->flush_completed);

   cs->ws = ws;
   cs->flush_cs = flush;
   cs->flush_data = flush_ctx;

   if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
      FREE(cs);
      return false;
   }
   if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
      radeon_destroy_cs_context(&cs->csc1);
      FREE(cs);
      return false;
   }

   /* Set the first command buffer as current. */
   cs->csc = &cs->csc1;
   cs->cst = &cs->csc2;
   cs->ring_type = ring_type;

   memset(rcs, 0, sizeof(*rcs));
   rcs->current.buf = cs->csc->buf;
   rcs->current.max_dw = ARRAY_SIZE(cs->csc->buf);
   rcs->priv = cs;

   p_atomic_inc(&ws->num_cs);
   return true;
}

int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
   unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   struct radeon_bo_item *buffers;
   unsigned num_buffers;
   int i = csc->reloc_indices_hashlist[hash];

   if (bo->handle) {
      buffers = csc->relocs_bo;
      num_buffers = csc->num_relocs;
   } else {
      buffers = csc->slab_buffers;
      num_buffers = csc->num_slab_buffers;
   }

   /* not found or found */
   if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
      return i;

   /* Hash collision, look for the BO in the list of relocs linearly. */
   for (i = num_buffers - 1; i >= 0; i--) {
      if (buffers[i].bo == bo) {
         /* Put this reloc in the hash list.
          * This will prevent additional hash collisions if there are
          * several consecutive lookup_buffer calls for the same buffer.
          *
          * Example: Assuming buffers A,B,C collide in the hash list,
          * the following sequence of relocs:
          *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
          * will collide here: ^ and here:   ^,
          * meaning that we should get very few collisions in the end. */
         csc->reloc_indices_hashlist[hash] = i;
         return i;
      }
   }
   return -1;
}

static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
   struct radeon_cs_context *csc = cs->csc;
   struct drm_radeon_cs_reloc *reloc;
   unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   int i = -1;

   i = radeon_lookup_buffer(csc, bo);

   if (i >= 0) {
      /* For async DMA, every add_buffer call must add a buffer to the list
       * no matter how many duplicates there are. This is due to the fact that
       * the DMA CS checker doesn't use NOP packets for offset patching,
       * but always uses the i-th buffer from the list to patch the i-th
       * offset. If there are N offsets in a DMA CS, there must also be N
       * buffers in the relocation list.
       *
       * This doesn't have to be done if virtual memory is enabled,
       * because there is no offset patching with virtual memory.
       */
      if (cs->ring_type != RING_DMA || cs->ws->info.r600_has_virtual_memory) {
         return i;
      }
   }

   /* New relocation, check if the backing array is large enough. */
   if (csc->num_relocs >= csc->max_relocs) {
      uint32_t size;
      csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

      size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
      csc->relocs_bo = realloc(csc->relocs_bo, size);

      size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
      csc->relocs = realloc(csc->relocs, size);

      csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
   }

   /* Initialize the new relocation. */
   csc->relocs_bo[csc->num_relocs].bo = NULL;
   csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
   radeon_ws_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
   p_atomic_inc(&bo->num_cs_references);
   reloc = &csc->relocs[csc->num_relocs];
   reloc->handle = bo->handle;
   reloc->read_domains = 0;
   reloc->write_domain = 0;
   reloc->flags = 0;

   csc->reloc_indices_hashlist[hash] = csc->num_relocs;

   csc->chunks[1].length_dw += RELOC_DWORDS;

   return csc->num_relocs++;
}

static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
   struct radeon_cs_context *csc = cs->csc;
   unsigned hash;
   struct radeon_bo_item *item;
   int idx;
   int real_idx;

   idx = radeon_lookup_buffer(csc, bo);
   if (idx >= 0)
      return idx;

   real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

   /* Check if the backing array is large enough. */
   if (csc->num_slab_buffers >= csc->max_slab_buffers) {
      unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                              (unsigned)(csc->max_slab_buffers * 1.3));
      struct radeon_bo_item *new_buffers =
         REALLOC(csc->slab_buffers,
                 csc->max_slab_buffers * sizeof(*new_buffers),
                 new_max * sizeof(*new_buffers));
      if (!new_buffers) {
         fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
         return -1;
      }

      csc->max_slab_buffers = new_max;
      csc->slab_buffers = new_buffers;
   }

   /* Initialize the new relocation. */
   idx = csc->num_slab_buffers++;
   item = &csc->slab_buffers[idx];

   item->bo = NULL;
   item->u.slab.real_idx = real_idx;
   radeon_ws_bo_reference(&item->bo, bo);
   p_atomic_inc(&bo->num_cs_references);

   hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   csc->reloc_indices_hashlist[hash] = idx;

   return idx;
}

static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
                                         enum radeon_bo_priority priority)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_bo *bo = (struct radeon_bo*)buf;
   enum radeon_bo_domain added_domains;

   /* If VRAM is just stolen system memory, allow both VRAM and
    * GTT, whichever has free space. If a buffer is evicted from
    * VRAM to GTT, it will stay there.
    */
   if (!cs->ws->info.has_dedicated_vram)
      domains |= RADEON_DOMAIN_GTT;

   enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
   enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
   struct drm_radeon_cs_reloc *reloc;
   int index;

   if (!bo->handle) {
      index = radeon_lookup_or_add_slab_buffer(cs, bo);
      if (index < 0)
         return 0;

      index = cs->csc->slab_buffers[index].u.slab.real_idx;
   } else {
      index = radeon_lookup_or_add_real_buffer(cs, bo);
   }

   reloc = &cs->csc->relocs[index];
   added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
   reloc->read_domains |= rd;
   reloc->write_domain |= wd;
   reloc->flags = MAX2(reloc->flags, priority);
   cs->csc->relocs_bo[index].u.real.priority_usage |= 1u << priority;

   if (added_domains & RADEON_DOMAIN_VRAM)
      rcs->used_vram_kb += bo->base.size / 1024;
   else if (added_domains & RADEON_DOMAIN_GTT)
      rcs->used_gart_kb += bo->base.size / 1024;

   return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
                                       struct pb_buffer *buf)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}

static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   bool status =
      rcs->used_gart_kb < cs->ws->info.gart_size_kb * 0.8 &&
      rcs->used_vram_kb < cs->ws->info.vram_size_kb * 0.8;

   if (status) {
      cs->csc->num_validated_relocs = cs->csc->num_relocs;
   } else {
      /* Remove lately-added buffers. The validation failed with them
       * and the CS is about to be flushed because of that. Keep only
       * the already-validated buffers. */
      unsigned i;

      for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
         p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
         radeon_ws_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
      }
      cs->csc->num_relocs = cs->csc->num_validated_relocs;

      /* Flush if there are any relocs. Clean up otherwise. */
      if (cs->csc->num_relocs) {
         cs->flush_cs(cs->flush_data,
                      RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
      } else {
         radeon_cs_context_cleanup(cs->csc);
         rcs->used_vram_kb = 0;
         rcs->used_gart_kb = 0;

         assert(rcs->current.cdw == 0);
         if (rcs->current.cdw != 0) {
            fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
         }
      }
   }
   return status;
}

static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
                                      bool force_chaining)
{
   assert(rcs->current.cdw <= rcs->current.max_dw);
   return rcs->current.max_dw - rcs->current.cdw >= dw;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                              struct radeon_bo_list_item *list)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   int i;

   if (list) {
      for (i = 0; i < cs->csc->num_relocs; i++) {
         list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
         list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
         list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
      }
   }
   return cs->csc->num_relocs;
}

void radeon_drm_cs_emit_ioctl_oneshot(void *job, void *gdata, int thread_index)
{
   struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
   unsigned i;
   int r;

   r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                           &csc->cs, sizeof(struct drm_radeon_cs));
   if (r) {
      if (r == -ENOMEM)
         fprintf(stderr, "radeon: Not enough memory for command submission.\n");
      else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
         unsigned i;

         fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
         for (i = 0; i < csc->chunks[0].length_dw; i++) {
            fprintf(stderr, "0x%08X\n", csc->buf[i]);
         }
      } else {
         fprintf(stderr, "radeon: The kernel rejected CS, "
                         "see dmesg for more information (%i).\n", r);
      }
   }

   for (i = 0; i < csc->num_relocs; i++)
      p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
   for (i = 0; i < csc->num_slab_buffers; i++)
      p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

   radeon_cs_context_cleanup(csc);
}

/*
 * Make sure previous submissions of this CS have completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   /* Wait for any pending ioctl of this CS to complete. */
   if (util_queue_is_initialized(&cs->ws->cs_queue))
      util_queue_fence_wait(&cs->flush_completed);
}

/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when bo participates in submissions on
 * two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
{
   unsigned dst;

   assert(fence->num_cs_references);

   /* Cleanup older fences */
   dst = 0;
   for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
      if (bo->u.slab.fences[src]->num_cs_references) {
         bo->u.slab.fences[dst] = bo->u.slab.fences[src];
         dst++;
      } else {
         radeon_ws_bo_reference(&bo->u.slab.fences[src], NULL);
      }
   }
   bo->u.slab.num_fences = dst;

   /* Check available space for the new fence */
   if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
      unsigned new_max_fences = bo->u.slab.max_fences + 1;
      struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
                                              bo->u.slab.max_fences * sizeof(*new_fences),
                                              new_max_fences * sizeof(*new_fences));
      if (!new_fences) {
         fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
         return;
      }

      bo->u.slab.fences = new_fences;
      bo->u.slab.max_fences = new_max_fences;
   }

   /* Add the new fence */
   bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
   radeon_ws_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
   bo->u.slab.num_fences++;
}

static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_cs_context *tmp;

   switch (cs->ring_type) {
   case RING_DMA:
      /* pad DMA ring to 8 DWs */
      if (cs->ws->info.chip_class <= GFX6) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xf0000000); /* NOP packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x00000000); /* NOP packet */
      }
      break;
   case RING_GFX:
      /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
       * r6xx requires at least 4 dw alignment to avoid a hw bug.
       */
      if (cs->ws->info.gfx_ib_pad_with_type2) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xffff1000); /* type3 nop packet */
      }
      break;
   case RING_UVD:
      while (rcs->current.cdw & 15)
         radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      break;
   default:
      break;
   }

   if (rcs->current.cdw > rcs->current.max_dw) {
      fprintf(stderr, "radeon: command stream overflowed\n");
   }

   if (pfence || cs->csc->num_slab_buffers) {
      struct pipe_fence_handle *fence;

      if (cs->next_fence) {
         fence = cs->next_fence;
         cs->next_fence = NULL;
      } else {
         fence = radeon_cs_create_fence(rcs);
      }

      if (fence) {
         if (pfence)
            radeon_fence_reference(pfence, fence);

         mtx_lock(&cs->ws->bo_fence_lock);
         for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
            struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
            p_atomic_inc(&bo->num_active_ioctls);
            radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
         }
         mtx_unlock(&cs->ws->bo_fence_lock);

         radeon_fence_reference(&fence, NULL);
      }
   } else {
      radeon_fence_reference(&cs->next_fence, NULL);
   }

   radeon_drm_cs_sync_flush(rcs);

   /* Swap command streams. */
   tmp = cs->csc;
   cs->csc = cs->cst;
   cs->cst = tmp;

   /* If the CS is not empty and hasn't overflowed, emit it in a separate thread. */
   if (rcs->current.cdw && rcs->current.cdw <= rcs->current.max_dw &&
       !cs->ws->noop_cs && !(flags & RADEON_FLUSH_NOOP)) {
      unsigned i, num_relocs;

      num_relocs = cs->cst->num_relocs;

      cs->cst->chunks[0].length_dw = rcs->current.cdw;

      for (i = 0; i < num_relocs; i++) {
         /* Update the number of active asynchronous CS ioctls for the buffer. */
         p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
      }

      switch (cs->ring_type) {
      case RING_DMA:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_DMA;
         cs->cst->cs.num_chunks = 3;
         if (cs->ws->info.r600_has_virtual_memory) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
         }
         break;

      case RING_UVD:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_UVD;
         cs->cst->cs.num_chunks = 3;
         break;

      case RING_VCE:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_VCE;
         cs->cst->cs.num_chunks = 3;
         break;

      default:
      case RING_GFX:
      case RING_COMPUTE:
         cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
         cs->cst->flags[1] = RADEON_CS_RING_GFX;
         cs->cst->cs.num_chunks = 3;

         if (cs->ws->info.r600_has_virtual_memory) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
            cs->cst->cs.num_chunks = 3;
         }
         if (flags & PIPE_FLUSH_END_OF_FRAME) {
            cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
            cs->cst->cs.num_chunks = 3;
         }
         if (cs->ring_type == RING_COMPUTE) {
            cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
            cs->cst->cs.num_chunks = 3;
         }
         break;
      }

      if (util_queue_is_initialized(&cs->ws->cs_queue)) {
         util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                            radeon_drm_cs_emit_ioctl_oneshot, NULL, 0);
         if (!(flags & PIPE_FLUSH_ASYNC))
            radeon_drm_cs_sync_flush(rcs);
      } else {
         radeon_drm_cs_emit_ioctl_oneshot(cs, NULL, 0);
      }
   } else {
      radeon_cs_context_cleanup(cs->cst);
   }

   /* Prepare a new CS. */
   rcs->current.buf = cs->csc->buf;
   rcs->current.cdw = 0;
   rcs->used_vram_kb = 0;
   rcs->used_gart_kb = 0;

   if (cs->ring_type == RING_GFX)
      cs->ws->num_gfx_IBs++;
   else if (cs->ring_type == RING_DMA)
      cs->ws->num_sdma_IBs++;
   return 0;
}

static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   if (!cs)
      return;

   radeon_drm_cs_sync_flush(rcs);
   util_queue_fence_destroy(&cs->flush_completed);
   radeon_cs_context_cleanup(&cs->csc1);
   radeon_cs_context_cleanup(&cs->csc2);
   p_atomic_dec(&cs->ws->num_cs);
   radeon_destroy_cs_context(&cs->csc1);
   radeon_destroy_cs_context(&cs->csc2);
   radeon_fence_reference(&cs->next_fence, NULL);
   FREE(cs);
}

static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_bo *bo = (struct radeon_bo*)_buf;
   int index;

   if (!bo->num_cs_references)
      return false;

   index = radeon_lookup_buffer(cs->csc, bo);
   if (index == -1)
      return false;

   if (!bo->handle)
      index = cs->csc->slab_buffers[index].u.slab.real_idx;

   if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
      return true;
   if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
      return true;

   return false;
}

/* FENCES */

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pb_buffer *fence;

   /* Create a fence, which is a dummy BO. */
   fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                      RADEON_DOMAIN_GTT,
                                      RADEON_FLAG_NO_SUBALLOC
                                      | RADEON_FLAG_NO_INTERPROCESS_SHARING);
   if (!fence)
      return NULL;

   /* Add the fence as a dummy relocation. */
   cs->ws->base.cs_add_buffer(rcs, fence,
                              RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                              RADEON_PRIO_FENCE);
   return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
   return ws->buffer_wait(ws, (struct pb_buffer*)fence, timeout,
                          RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
   pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}

static struct pipe_fence_handle *radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pipe_fence_handle *fence = NULL;

   if (cs->next_fence) {
      radeon_fence_reference(&fence, cs->next_fence);
      return fence;
   }

   fence = radeon_cs_create_fence(rcs);
   if (!fence)
      return NULL;

   radeon_fence_reference(&cs->next_fence, fence);
   return fence;
}

static void
radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
                                   struct pipe_fence_handle *fence,
                                   unsigned dependency_flags)
{
   /* TODO: Handle the following unlikely multi-threaded scenario:
    *
    *  Thread 1 / Context 1            Thread 2 / Context 2
    *  --------------------            --------------------
    *  f = cs_get_next_fence()
    *                                  cs_add_fence_dependency(f)
    *                                  cs_flush()
    *  cs_flush()
    *
    * We currently assume that this does not happen because we don't support
    * asynchronous flushes on Radeon.
    */
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
   ws->base.ctx_create = radeon_drm_ctx_create;
   ws->base.ctx_destroy = radeon_drm_ctx_destroy;
   ws->base.ctx_query_reset_status = radeon_drm_ctx_query_reset_status;
   ws->base.cs_create = radeon_drm_cs_create;
   ws->base.cs_destroy = radeon_drm_cs_destroy;
   ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
   ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
   ws->base.cs_validate = radeon_drm_cs_validate;
   ws->base.cs_check_space = radeon_drm_cs_check_space;
   ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
   ws->base.cs_flush = radeon_drm_cs_flush;
   ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
   ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
   ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
   ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
   ws->base.fence_wait = radeon_fence_wait;
   ws->base.fence_reference = radeon_fence_reference;
}