GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/winsys/radeon/drm/radeon_drm_cs.c

/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <[email protected]>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply OR'd for accounting purposes.
    The add is skipped if the reloc is already present in the list, but it
    still accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries the validation again,
    i.e. it validates only that one operation. If that fails too, it drops
    the operation on the floor and prints a nasty message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_buffer.
 */
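
/* A minimal usage sketch, added here for illustration only and not part of the
 * upstream file. It shows how a pipe driver is assumed to drive the flow
 * described above: add every buffer referenced by an operation, validate the
 * accumulated memory usage, and flush-and-retry when validation fails.
 * 'buf', 'usage', 'domains' and 'priority' are placeholders; the entry points
 * are the winsys hooks installed by radeon_drm_cs_init_functions() at the end
 * of this file.
 *
 *    ws->cs_add_buffer(cs, buf, usage, domains, priority);
 *    if (!ws->cs_validate(cs)) {
 *       ws->cs_flush(cs, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 *       if (!ws->cs_validate(cs))
 *          fprintf(stderr, "radeon: dropping operation, validation failed twice\n");
 *    }
 */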

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "util/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>


#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
   struct radeon_ctx *ctx = CALLOC_STRUCT(radeon_ctx);
   if (!ctx)
      return NULL;

   ctx->ws = (struct radeon_drm_winsys*)ws;
   ctx->gpu_reset_counter = radeon_drm_get_gpu_reset_counter(ctx->ws);
   return (struct radeon_winsys_ctx*)ctx;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
   FREE(ctx);
}

static enum pipe_reset_status
radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx, bool full_reset_only,
                                  bool *needs_reset)
{
   struct radeon_ctx *ctx = (struct radeon_ctx*)rctx;

   unsigned latest = radeon_drm_get_gpu_reset_counter(ctx->ws);

   if (ctx->gpu_reset_counter == latest) {
      if (needs_reset)
         *needs_reset = false;
      return PIPE_NO_RESET;
   }

   if (needs_reset)
      *needs_reset = true;

   ctx->gpu_reset_counter = latest;
   return PIPE_UNKNOWN_CONTEXT_RESET;
}

static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
   int i;

   csc->fd = ws->fd;

   csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
   csc->chunks[0].length_dw = 0;
   csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
   csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
   csc->chunks[1].length_dw = 0;
   csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
   csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
   csc->chunks[2].length_dw = 2;
   csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

   csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
   csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
   csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

   csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

   for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
      csc->reloc_indices_hashlist[i] = -1;
   }
   return true;
}

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
   unsigned i;

   for (i = 0; i < csc->num_relocs; i++) {
      p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
      radeon_ws_bo_reference(&csc->relocs_bo[i].bo, NULL);
   }
   for (i = 0; i < csc->num_slab_buffers; ++i) {
      p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
      radeon_ws_bo_reference(&csc->slab_buffers[i].bo, NULL);
   }

   csc->num_relocs = 0;
   csc->num_validated_relocs = 0;
   csc->num_slab_buffers = 0;
   csc->chunks[0].length_dw = 0;
   csc->chunks[1].length_dw = 0;

   for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
      csc->reloc_indices_hashlist[i] = -1;
   }
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
   radeon_cs_context_cleanup(csc);
   FREE(csc->slab_buffers);
   FREE(csc->relocs_bo);
   FREE(csc->relocs);
}


static bool
radeon_drm_cs_create(struct radeon_cmdbuf *rcs,
                     struct radeon_winsys_ctx *ctx,
                     enum ring_type ring_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx,
                     bool stop_exec_on_failure)
{
   struct radeon_drm_winsys *ws = ((struct radeon_ctx*)ctx)->ws;
   struct radeon_drm_cs *cs;

   cs = CALLOC_STRUCT(radeon_drm_cs);
   if (!cs) {
      return false;
   }
   util_queue_fence_init(&cs->flush_completed);

   cs->ws = ws;
   cs->flush_cs = flush;
   cs->flush_data = flush_ctx;

   if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
      FREE(cs);
      return false;
   }
   if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
      radeon_destroy_cs_context(&cs->csc1);
      FREE(cs);
      return false;
   }

   /* Set the first command buffer as current. */
   cs->csc = &cs->csc1;
   cs->cst = &cs->csc2;
   cs->ring_type = ring_type;

   memset(rcs, 0, sizeof(*rcs));
   rcs->current.buf = cs->csc->buf;
   rcs->current.max_dw = ARRAY_SIZE(cs->csc->buf);
   rcs->priv = cs;

   p_atomic_inc(&ws->num_cs);
   return true;
}

int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
   unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   struct radeon_bo_item *buffers;
   unsigned num_buffers;
   int i = csc->reloc_indices_hashlist[hash];

   if (bo->handle) {
      buffers = csc->relocs_bo;
      num_buffers = csc->num_relocs;
   } else {
      buffers = csc->slab_buffers;
      num_buffers = csc->num_slab_buffers;
   }

   /* not found or found */
   if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
      return i;

   /* Hash collision, look for the BO in the list of relocs linearly. */
   for (i = num_buffers - 1; i >= 0; i--) {
      if (buffers[i].bo == bo) {
         /* Put this reloc in the hash list.
          * This will prevent additional hash collisions if there are
          * several consecutive lookup_buffer calls for the same buffer.
          *
          * Example: Assuming buffers A,B,C collide in the hash list,
          * the following sequence of relocs:
          *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
          * will collide here: ^ and here:   ^,
          * meaning that we should get very few collisions in the end. */
         csc->reloc_indices_hashlist[hash] = i;
         return i;
      }
   }
   return -1;
}

static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
   struct radeon_cs_context *csc = cs->csc;
   struct drm_radeon_cs_reloc *reloc;
   unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   int i = -1;

   i = radeon_lookup_buffer(csc, bo);

   if (i >= 0) {
      /* For async DMA, every add_buffer call must add a buffer to the list
       * no matter how many duplicates there are. This is due to the fact that
       * the DMA CS checker doesn't use NOP packets for offset patching,
       * but always uses the i-th buffer from the list to patch the i-th
       * offset. If there are N offsets in a DMA CS, there must also be N
       * buffers in the relocation list.
       *
       * This doesn't have to be done if virtual memory is enabled,
       * because there is no offset patching with virtual memory.
       */
      if (cs->ring_type != RING_DMA || cs->ws->info.r600_has_virtual_memory) {
         return i;
      }
   }

   /* New relocation, check if the backing array is large enough. */
   if (csc->num_relocs >= csc->max_relocs) {
      uint32_t size;
      csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

      size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
      csc->relocs_bo = realloc(csc->relocs_bo, size);

      size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
      csc->relocs = realloc(csc->relocs, size);

      csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
   }

   /* Initialize the new relocation. */
   csc->relocs_bo[csc->num_relocs].bo = NULL;
   csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
   radeon_ws_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
   p_atomic_inc(&bo->num_cs_references);
   reloc = &csc->relocs[csc->num_relocs];
   reloc->handle = bo->handle;
   reloc->read_domains = 0;
   reloc->write_domain = 0;
   reloc->flags = 0;

   csc->reloc_indices_hashlist[hash] = csc->num_relocs;

   csc->chunks[1].length_dw += RELOC_DWORDS;

   return csc->num_relocs++;
}

static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
   struct radeon_cs_context *csc = cs->csc;
   unsigned hash;
   struct radeon_bo_item *item;
   int idx;
   int real_idx;

   idx = radeon_lookup_buffer(csc, bo);
   if (idx >= 0)
      return idx;

   real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

   /* Check if the backing array is large enough. */
   if (csc->num_slab_buffers >= csc->max_slab_buffers) {
      unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                              (unsigned)(csc->max_slab_buffers * 1.3));
      struct radeon_bo_item *new_buffers =
         REALLOC(csc->slab_buffers,
                 csc->max_slab_buffers * sizeof(*new_buffers),
                 new_max * sizeof(*new_buffers));
      if (!new_buffers) {
         fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
         return -1;
      }

      csc->max_slab_buffers = new_max;
      csc->slab_buffers = new_buffers;
   }

   /* Initialize the new relocation. */
   idx = csc->num_slab_buffers++;
   item = &csc->slab_buffers[idx];

   item->bo = NULL;
   item->u.slab.real_idx = real_idx;
   radeon_ws_bo_reference(&item->bo, bo);
   p_atomic_inc(&bo->num_cs_references);

   hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   csc->reloc_indices_hashlist[hash] = idx;

   return idx;
}

static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
                                         enum radeon_bo_priority priority)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_bo *bo = (struct radeon_bo*)buf;
   enum radeon_bo_domain added_domains;

   /* If VRAM is just stolen system memory, allow both VRAM and
    * GTT, whichever has free space. If a buffer is evicted from
    * VRAM to GTT, it will stay there.
    */
   if (!cs->ws->info.has_dedicated_vram)
      domains |= RADEON_DOMAIN_GTT;

   enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
   enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
   struct drm_radeon_cs_reloc *reloc;
   int index;

   if (!bo->handle) {
      index = radeon_lookup_or_add_slab_buffer(cs, bo);
      if (index < 0)
         return 0;

      index = cs->csc->slab_buffers[index].u.slab.real_idx;
   } else {
      index = radeon_lookup_or_add_real_buffer(cs, bo);
   }

   reloc = &cs->csc->relocs[index];
   added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
   reloc->read_domains |= rd;
   reloc->write_domain |= wd;
   reloc->flags = MAX2(reloc->flags, priority);
   cs->csc->relocs_bo[index].u.real.priority_usage |= 1u << priority;

   if (added_domains & RADEON_DOMAIN_VRAM)
      rcs->used_vram_kb += bo->base.size / 1024;
   else if (added_domains & RADEON_DOMAIN_GTT)
      rcs->used_gart_kb += bo->base.size / 1024;

   return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
                                       struct pb_buffer *buf)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}

static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   bool status =
      rcs->used_gart_kb < cs->ws->info.gart_size_kb * 0.8 &&
      rcs->used_vram_kb < cs->ws->info.vram_size_kb * 0.8;

   if (status) {
      cs->csc->num_validated_relocs = cs->csc->num_relocs;
   } else {
      /* Remove lately-added buffers. The validation failed with them
       * and the CS is about to be flushed because of that. Keep only
       * the already-validated buffers. */
      unsigned i;

      for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
         p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
         radeon_ws_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
      }
      cs->csc->num_relocs = cs->csc->num_validated_relocs;

      /* Flush if there are any relocs. Clean up otherwise. */
      if (cs->csc->num_relocs) {
         cs->flush_cs(cs->flush_data,
                      RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
      } else {
         radeon_cs_context_cleanup(cs->csc);
         rcs->used_vram_kb = 0;
         rcs->used_gart_kb = 0;

         assert(rcs->current.cdw == 0);
         if (rcs->current.cdw != 0) {
            fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
         }
      }
   }
   return status;
}

static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
                                      bool force_chaining)
{
   assert(rcs->current.cdw <= rcs->current.max_dw);
   return rcs->current.max_dw - rcs->current.cdw >= dw;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                              struct radeon_bo_list_item *list)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   int i;

   if (list) {
      for (i = 0; i < cs->csc->num_relocs; i++) {
         list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
         list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
         list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
      }
   }
   return cs->csc->num_relocs;
}

void radeon_drm_cs_emit_ioctl_oneshot(void *job, void *gdata, int thread_index)
{
   struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
   unsigned i;
   int r;

   r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                           &csc->cs, sizeof(struct drm_radeon_cs));
   if (r) {
      if (r == -ENOMEM)
         fprintf(stderr, "radeon: Not enough memory for command submission.\n");
      else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
         unsigned i;

         fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
         for (i = 0; i < csc->chunks[0].length_dw; i++) {
            fprintf(stderr, "0x%08X\n", csc->buf[i]);
         }
      } else {
         fprintf(stderr, "radeon: The kernel rejected CS, "
                         "see dmesg for more information (%i).\n", r);
      }
   }

   for (i = 0; i < csc->num_relocs; i++)
      p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
   for (i = 0; i < csc->num_slab_buffers; i++)
      p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

   radeon_cs_context_cleanup(csc);
}

/*
 * Make sure previous submissions of this CS have completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   /* Wait for any pending ioctl of this CS to complete. */
   if (util_queue_is_initialized(&cs->ws->cs_queue))
      util_queue_fence_wait(&cs->flush_completed);
}

/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when bo participates in submissions on
 * two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
{
   unsigned dst;

   assert(fence->num_cs_references);

   /* Cleanup older fences */
   dst = 0;
   for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
      if (bo->u.slab.fences[src]->num_cs_references) {
         bo->u.slab.fences[dst] = bo->u.slab.fences[src];
         dst++;
      } else {
         radeon_ws_bo_reference(&bo->u.slab.fences[src], NULL);
      }
   }
   bo->u.slab.num_fences = dst;

   /* Check available space for the new fence */
   if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
      unsigned new_max_fences = bo->u.slab.max_fences + 1;
      struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
                                              bo->u.slab.max_fences * sizeof(*new_fences),
                                              new_max_fences * sizeof(*new_fences));
      if (!new_fences) {
         fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
         return;
      }

      bo->u.slab.fences = new_fences;
      bo->u.slab.max_fences = new_max_fences;
   }

   /* Add the new fence */
   bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
   radeon_ws_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
   bo->u.slab.num_fences++;
}

static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_cs_context *tmp;

   switch (cs->ring_type) {
   case RING_DMA:
      /* pad DMA ring to 8 DWs */
      if (cs->ws->info.chip_class <= GFX6) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xf0000000); /* NOP packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x00000000); /* NOP packet */
      }
      break;
   case RING_GFX:
      /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
       * r6xx requires at least 4 dw alignment to avoid a hw bug.
       */
      if (cs->ws->info.gfx_ib_pad_with_type2) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xffff1000); /* type3 nop packet */
      }
      break;
   case RING_UVD:
      while (rcs->current.cdw & 15)
         radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      break;
   default:
      break;
   }

   if (rcs->current.cdw > rcs->current.max_dw) {
      fprintf(stderr, "radeon: command stream overflowed\n");
   }

   if (pfence || cs->csc->num_slab_buffers) {
      struct pipe_fence_handle *fence;

      if (cs->next_fence) {
         fence = cs->next_fence;
         cs->next_fence = NULL;
      } else {
         fence = radeon_cs_create_fence(rcs);
      }

      if (fence) {
         if (pfence)
            radeon_fence_reference(pfence, fence);

         mtx_lock(&cs->ws->bo_fence_lock);
         for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
            struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
            p_atomic_inc(&bo->num_active_ioctls);
            radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
         }
         mtx_unlock(&cs->ws->bo_fence_lock);

         radeon_fence_reference(&fence, NULL);
      }
   } else {
      radeon_fence_reference(&cs->next_fence, NULL);
   }

   radeon_drm_cs_sync_flush(rcs);

   /* Swap command streams. */
   tmp = cs->csc;
   cs->csc = cs->cst;
   cs->cst = tmp;

   /* If the CS is not empty and hasn't overflowed, emit it in a separate thread. */
   if (rcs->current.cdw && rcs->current.cdw <= rcs->current.max_dw &&
       !cs->ws->noop_cs && !(flags & RADEON_FLUSH_NOOP)) {
      unsigned i, num_relocs;

      num_relocs = cs->cst->num_relocs;

      cs->cst->chunks[0].length_dw = rcs->current.cdw;

      for (i = 0; i < num_relocs; i++) {
         /* Update the number of active asynchronous CS ioctls for the buffer. */
         p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
      }

      switch (cs->ring_type) {
      case RING_DMA:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_DMA;
         cs->cst->cs.num_chunks = 3;
         if (cs->ws->info.r600_has_virtual_memory) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
         }
         break;

      case RING_UVD:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_UVD;
         cs->cst->cs.num_chunks = 3;
         break;

      case RING_VCE:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_VCE;
         cs->cst->cs.num_chunks = 3;
         break;

      default:
      case RING_GFX:
      case RING_COMPUTE:
         cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
         cs->cst->flags[1] = RADEON_CS_RING_GFX;
         cs->cst->cs.num_chunks = 3;

         if (cs->ws->info.r600_has_virtual_memory) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
            cs->cst->cs.num_chunks = 3;
         }
         if (flags & PIPE_FLUSH_END_OF_FRAME) {
            cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
            cs->cst->cs.num_chunks = 3;
         }
         if (cs->ring_type == RING_COMPUTE) {
            cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
            cs->cst->cs.num_chunks = 3;
         }
         break;
      }

      if (util_queue_is_initialized(&cs->ws->cs_queue)) {
         util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                            radeon_drm_cs_emit_ioctl_oneshot, NULL, 0);
         if (!(flags & PIPE_FLUSH_ASYNC))
            radeon_drm_cs_sync_flush(rcs);
      } else {
         radeon_drm_cs_emit_ioctl_oneshot(cs, NULL, 0);
      }
   } else {
      radeon_cs_context_cleanup(cs->cst);
   }

   /* Prepare a new CS. */
   rcs->current.buf = cs->csc->buf;
   rcs->current.cdw = 0;
   rcs->used_vram_kb = 0;
   rcs->used_gart_kb = 0;

   if (cs->ring_type == RING_GFX)
      cs->ws->num_gfx_IBs++;
   else if (cs->ring_type == RING_DMA)
      cs->ws->num_sdma_IBs++;
   return 0;
}

static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   if (!cs)
      return;

   radeon_drm_cs_sync_flush(rcs);
   util_queue_fence_destroy(&cs->flush_completed);
   radeon_cs_context_cleanup(&cs->csc1);
   radeon_cs_context_cleanup(&cs->csc2);
   p_atomic_dec(&cs->ws->num_cs);
   radeon_destroy_cs_context(&cs->csc1);
   radeon_destroy_cs_context(&cs->csc2);
   radeon_fence_reference(&cs->next_fence, NULL);
   FREE(cs);
}

static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_bo *bo = (struct radeon_bo*)_buf;
   int index;

   if (!bo->num_cs_references)
      return false;

   index = radeon_lookup_buffer(cs->csc, bo);
   if (index == -1)
      return false;

   if (!bo->handle)
      index = cs->csc->slab_buffers[index].u.slab.real_idx;

   if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
      return true;
   if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
      return true;

   return false;
}

/* FENCES */

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pb_buffer *fence;

   /* Create a fence, which is a dummy BO. */
   fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                      RADEON_DOMAIN_GTT,
                                      RADEON_FLAG_NO_SUBALLOC
                                      | RADEON_FLAG_NO_INTERPROCESS_SHARING);
   if (!fence)
      return NULL;

   /* Add the fence as a dummy relocation. */
   cs->ws->base.cs_add_buffer(rcs, fence,
                              RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                              RADEON_PRIO_FENCE);
   return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
   return ws->buffer_wait(ws, (struct pb_buffer*)fence, timeout,
                          RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
   pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}

static struct pipe_fence_handle *radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pipe_fence_handle *fence = NULL;

   if (cs->next_fence) {
      radeon_fence_reference(&fence, cs->next_fence);
      return fence;
   }

   fence = radeon_cs_create_fence(rcs);
   if (!fence)
      return NULL;

   radeon_fence_reference(&cs->next_fence, fence);
   return fence;
}

static void
radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
                                   struct pipe_fence_handle *fence,
                                   unsigned dependency_flags)
{
   /* TODO: Handle the following unlikely multi-threaded scenario:
    *
    *  Thread 1 / Context 1            Thread 2 / Context 2
    *  --------------------            --------------------
    *  f = cs_get_next_fence()
    *                                  cs_add_fence_dependency(f)
    *                                  cs_flush()
    *  cs_flush()
    *
    * We currently assume that this does not happen because we don't support
    * asynchronous flushes on Radeon.
    */
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
   ws->base.ctx_create = radeon_drm_ctx_create;
   ws->base.ctx_destroy = radeon_drm_ctx_destroy;
   ws->base.ctx_query_reset_status = radeon_drm_ctx_query_reset_status;
   ws->base.cs_create = radeon_drm_cs_create;
   ws->base.cs_destroy = radeon_drm_cs_destroy;
   ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
   ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
   ws->base.cs_validate = radeon_drm_cs_validate;
   ws->base.cs_check_space = radeon_drm_cs_check_space;
   ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
   ws->base.cs_flush = radeon_drm_cs_flush;
   ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
   ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
   ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
   ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
   ws->base.fence_wait = radeon_fence_wait;
   ws->base.fence_reference = radeon_fence_reference;
}