Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
26517 views
/*
 * Copyright 2008 Jerome Glisse.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Jerome Glisse <[email protected]>
 */
27
28
#include <linux/file.h>
29
#include <linux/pagemap.h>
30
#include <linux/sync_file.h>
31
#include <linux/dma-buf.h>
32
33
#include <drm/amdgpu_drm.h>
34
#include <drm/drm_syncobj.h>
35
#include <drm/ttm/ttm_tt.h>
36
37
#include "amdgpu_cs.h"
38
#include "amdgpu.h"
39
#include "amdgpu_trace.h"
40
#include "amdgpu_gmc.h"
41
#include "amdgpu_gem.h"
42
#include "amdgpu_ras.h"
43
44
static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p,
45
struct amdgpu_device *adev,
46
struct drm_file *filp,
47
union drm_amdgpu_cs *cs)
48
{
49
struct amdgpu_fpriv *fpriv = filp->driver_priv;
50
51
if (cs->in.num_chunks == 0)
52
return -EINVAL;
53
54
memset(p, 0, sizeof(*p));
55
p->adev = adev;
56
p->filp = filp;
57
58
p->ctx = amdgpu_ctx_get(fpriv, cs->in.ctx_id);
59
if (!p->ctx)
60
return -EINVAL;
61
62
if (atomic_read(&p->ctx->guilty)) {
63
amdgpu_ctx_put(p->ctx);
64
return -ECANCELED;
65
}
66
67
amdgpu_sync_create(&p->sync);
68
drm_exec_init(&p->exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
69
DRM_EXEC_IGNORE_DUPLICATES, 0);
70
return 0;
71
}
72
73
static int amdgpu_cs_job_idx(struct amdgpu_cs_parser *p,
74
struct drm_amdgpu_cs_chunk_ib *chunk_ib)
75
{
76
struct drm_sched_entity *entity;
77
unsigned int i;
78
int r;
79
80
r = amdgpu_ctx_get_entity(p->ctx, chunk_ib->ip_type,
81
chunk_ib->ip_instance,
82
chunk_ib->ring, &entity);
83
if (r)
84
return r;
85
86
/*
87
* Abort if there is no run queue associated with this entity.
88
* Possibly because of disabled HW IP.
89
*/
90
if (entity->rq == NULL)
91
return -EINVAL;
92
93
/* Check if we can add this IB to some existing job */
94
for (i = 0; i < p->gang_size; ++i)
95
if (p->entities[i] == entity)
96
return i;
97
98
/* If not increase the gang size if possible */
99
if (i == AMDGPU_CS_GANG_SIZE)
100
return -EINVAL;
101
102
p->entities[i] = entity;
103
p->gang_size = i + 1;
104
return i;
105
}
106
107
static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
108
struct drm_amdgpu_cs_chunk_ib *chunk_ib,
109
unsigned int *num_ibs)
110
{
111
int r;
112
113
r = amdgpu_cs_job_idx(p, chunk_ib);
114
if (r < 0)
115
return r;
116
117
if (num_ibs[r] >= amdgpu_ring_max_ibs(chunk_ib->ip_type))
118
return -EINVAL;
119
120
++(num_ibs[r]);
121
p->gang_leader_idx = r;
122
return 0;
123
}
124
125
static int amdgpu_cs_p1_user_fence(struct amdgpu_cs_parser *p,
126
struct drm_amdgpu_cs_chunk_fence *data,
127
uint32_t *offset)
128
{
129
struct drm_gem_object *gobj;
130
unsigned long size;
131
132
gobj = drm_gem_object_lookup(p->filp, data->handle);
133
if (gobj == NULL)
134
return -EINVAL;
135
136
p->uf_bo = amdgpu_bo_ref(gem_to_amdgpu_bo(gobj));
137
drm_gem_object_put(gobj);
138
139
size = amdgpu_bo_size(p->uf_bo);
140
if (size != PAGE_SIZE || data->offset > (size - 8))
141
return -EINVAL;
142
143
if (amdgpu_ttm_tt_get_usermm(p->uf_bo->tbo.ttm))
144
return -EINVAL;
145
146
*offset = data->offset;
147
return 0;
148
}
149
150
static int amdgpu_cs_p1_bo_handles(struct amdgpu_cs_parser *p,
151
struct drm_amdgpu_bo_list_in *data)
152
{
153
struct drm_amdgpu_bo_list_entry *info;
154
int r;
155
156
r = amdgpu_bo_create_list_entry_array(data, &info);
157
if (r)
158
return r;
159
160
r = amdgpu_bo_list_create(p->adev, p->filp, info, data->bo_number,
161
&p->bo_list);
162
if (r)
163
goto error_free;
164
165
kvfree(info);
166
return 0;
167
168
error_free:
169
kvfree(info);
170
171
return r;
172
}
173
174
/* Copy the data from userspace and go over it the first time */
175
static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
176
union drm_amdgpu_cs *cs)
177
{
178
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
179
unsigned int num_ibs[AMDGPU_CS_GANG_SIZE] = { };
180
struct amdgpu_vm *vm = &fpriv->vm;
181
uint64_t *chunk_array_user;
182
uint64_t *chunk_array;
183
uint32_t uf_offset = 0;
184
size_t size;
185
int ret;
186
int i;
187
188
chunk_array = kvmalloc_array(cs->in.num_chunks, sizeof(uint64_t),
189
GFP_KERNEL);
190
if (!chunk_array)
191
return -ENOMEM;
192
193
/* get chunks */
194
chunk_array_user = u64_to_user_ptr(cs->in.chunks);
195
if (copy_from_user(chunk_array, chunk_array_user,
196
sizeof(uint64_t)*cs->in.num_chunks)) {
197
ret = -EFAULT;
198
goto free_chunk;
199
}
200
201
p->nchunks = cs->in.num_chunks;
202
p->chunks = kvmalloc_array(p->nchunks, sizeof(struct amdgpu_cs_chunk),
203
GFP_KERNEL);
204
if (!p->chunks) {
205
ret = -ENOMEM;
206
goto free_chunk;
207
}
208
209
for (i = 0; i < p->nchunks; i++) {
210
struct drm_amdgpu_cs_chunk __user *chunk_ptr = NULL;
211
struct drm_amdgpu_cs_chunk user_chunk;
212
uint32_t __user *cdata;
213
214
chunk_ptr = u64_to_user_ptr(chunk_array[i]);
215
if (copy_from_user(&user_chunk, chunk_ptr,
216
sizeof(struct drm_amdgpu_cs_chunk))) {
217
ret = -EFAULT;
218
i--;
219
goto free_partial_kdata;
220
}
221
p->chunks[i].chunk_id = user_chunk.chunk_id;
222
p->chunks[i].length_dw = user_chunk.length_dw;
223
224
size = p->chunks[i].length_dw;
225
cdata = u64_to_user_ptr(user_chunk.chunk_data);
226
227
p->chunks[i].kdata = kvmalloc_array(size, sizeof(uint32_t),
228
GFP_KERNEL);
229
if (p->chunks[i].kdata == NULL) {
230
ret = -ENOMEM;
231
i--;
232
goto free_partial_kdata;
233
}
234
size *= sizeof(uint32_t);
235
if (copy_from_user(p->chunks[i].kdata, cdata, size)) {
236
ret = -EFAULT;
237
goto free_partial_kdata;
238
}
239
240
/* Assume the worst on the following checks */
241
ret = -EINVAL;
242
switch (p->chunks[i].chunk_id) {
243
case AMDGPU_CHUNK_ID_IB:
244
if (size < sizeof(struct drm_amdgpu_cs_chunk_ib))
245
goto free_partial_kdata;
246
247
ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, num_ibs);
248
if (ret)
249
goto free_partial_kdata;
250
break;
251
252
case AMDGPU_CHUNK_ID_FENCE:
253
if (size < sizeof(struct drm_amdgpu_cs_chunk_fence))
254
goto free_partial_kdata;
255
256
ret = amdgpu_cs_p1_user_fence(p, p->chunks[i].kdata,
257
&uf_offset);
258
if (ret)
259
goto free_partial_kdata;
260
break;
261
262
case AMDGPU_CHUNK_ID_BO_HANDLES:
263
if (size < sizeof(struct drm_amdgpu_bo_list_in))
264
goto free_partial_kdata;
265
266
/* Only a single BO list is allowed to simplify handling. */
267
if (p->bo_list)
268
goto free_partial_kdata;
269
270
ret = amdgpu_cs_p1_bo_handles(p, p->chunks[i].kdata);
271
if (ret)
272
goto free_partial_kdata;
273
break;
274
275
case AMDGPU_CHUNK_ID_DEPENDENCIES:
276
case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
277
case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
278
case AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES:
279
case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT:
280
case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL:
281
case AMDGPU_CHUNK_ID_CP_GFX_SHADOW:
282
break;
283
284
default:
285
goto free_partial_kdata;
286
}
287
}
288
289
if (!p->gang_size) {
290
ret = -EINVAL;
291
goto free_all_kdata;
292
}
293
294
for (i = 0; i < p->gang_size; ++i) {
295
ret = amdgpu_job_alloc(p->adev, vm, p->entities[i], vm,
296
num_ibs[i], &p->jobs[i],
297
p->filp->client_id);
298
if (ret)
299
goto free_all_kdata;
300
switch (p->adev->enforce_isolation[fpriv->xcp_id]) {
301
case AMDGPU_ENFORCE_ISOLATION_DISABLE:
302
default:
303
p->jobs[i]->enforce_isolation = false;
304
p->jobs[i]->run_cleaner_shader = false;
305
break;
306
case AMDGPU_ENFORCE_ISOLATION_ENABLE:
307
p->jobs[i]->enforce_isolation = true;
308
p->jobs[i]->run_cleaner_shader = true;
309
break;
310
case AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY:
311
p->jobs[i]->enforce_isolation = true;
312
p->jobs[i]->run_cleaner_shader = false;
313
break;
314
case AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER:
315
p->jobs[i]->enforce_isolation = true;
316
p->jobs[i]->run_cleaner_shader = false;
317
break;
318
}
319
}
320
p->gang_leader = p->jobs[p->gang_leader_idx];
321
322
if (p->ctx->generation != p->gang_leader->generation) {
323
ret = -ECANCELED;
324
goto free_all_kdata;
325
}
326
327
if (p->uf_bo)
328
p->gang_leader->uf_addr = uf_offset;
329
kvfree(chunk_array);
330
331
/* Use this opportunity to fill in task info for the vm */
332
amdgpu_vm_set_task_info(vm);
333
334
return 0;
335
336
free_all_kdata:
337
i = p->nchunks - 1;
338
free_partial_kdata:
339
for (; i >= 0; i--)
340
kvfree(p->chunks[i].kdata);
341
kvfree(p->chunks);
342
p->chunks = NULL;
343
p->nchunks = 0;
344
free_chunk:
345
kvfree(chunk_array);
346
347
return ret;
348
}
349
350
static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
351
struct amdgpu_cs_chunk *chunk,
352
unsigned int *ce_preempt,
353
unsigned int *de_preempt)
354
{
355
struct drm_amdgpu_cs_chunk_ib *chunk_ib = chunk->kdata;
356
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
357
struct amdgpu_vm *vm = &fpriv->vm;
358
struct amdgpu_ring *ring;
359
struct amdgpu_job *job;
360
struct amdgpu_ib *ib;
361
int r;
362
363
r = amdgpu_cs_job_idx(p, chunk_ib);
364
if (r < 0)
365
return r;
366
367
job = p->jobs[r];
368
ring = amdgpu_job_ring(job);
369
ib = &job->ibs[job->num_ibs++];
370
371
/* submissions to kernel queues are disabled */
372
if (ring->no_user_submission)
373
return -EINVAL;
374
375
/* MM engine doesn't support user fences */
376
if (p->uf_bo && ring->funcs->no_user_fence)
377
return -EINVAL;
378
379
if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX &&
380
chunk_ib->flags & AMDGPU_IB_FLAG_PREEMPT) {
381
if (chunk_ib->flags & AMDGPU_IB_FLAG_CE)
382
(*ce_preempt)++;
383
else
384
(*de_preempt)++;
385
386
/* Each GFX command submit allows only 1 IB max
387
* preemptible for CE & DE */
388
if (*ce_preempt > 1 || *de_preempt > 1)
389
return -EINVAL;
390
}
391
392
if (chunk_ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
393
job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT;
394
395
r = amdgpu_ib_get(p->adev, vm, ring->funcs->parse_cs ?
396
chunk_ib->ib_bytes : 0,
397
AMDGPU_IB_POOL_DELAYED, ib);
398
if (r) {
399
DRM_ERROR("Failed to get ib !\n");
400
return r;
401
}
402
403
ib->gpu_addr = chunk_ib->va_start;
404
ib->length_dw = chunk_ib->ib_bytes / 4;
405
ib->flags = chunk_ib->flags;
406
return 0;
407
}
408
409
static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p,
410
struct amdgpu_cs_chunk *chunk)
411
{
412
struct drm_amdgpu_cs_chunk_dep *deps = chunk->kdata;
413
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
414
unsigned int num_deps;
415
int i, r;
416
417
num_deps = chunk->length_dw * 4 /
418
sizeof(struct drm_amdgpu_cs_chunk_dep);
419
420
for (i = 0; i < num_deps; ++i) {
421
struct amdgpu_ctx *ctx;
422
struct drm_sched_entity *entity;
423
struct dma_fence *fence;
424
425
ctx = amdgpu_ctx_get(fpriv, deps[i].ctx_id);
426
if (ctx == NULL)
427
return -EINVAL;
428
429
r = amdgpu_ctx_get_entity(ctx, deps[i].ip_type,
430
deps[i].ip_instance,
431
deps[i].ring, &entity);
432
if (r) {
433
amdgpu_ctx_put(ctx);
434
return r;
435
}
436
437
fence = amdgpu_ctx_get_fence(ctx, entity, deps[i].handle);
438
amdgpu_ctx_put(ctx);
439
440
if (IS_ERR(fence))
441
return PTR_ERR(fence);
442
else if (!fence)
443
continue;
444
445
if (chunk->chunk_id == AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES) {
446
struct drm_sched_fence *s_fence;
447
struct dma_fence *old = fence;
448
449
s_fence = to_drm_sched_fence(fence);
450
fence = dma_fence_get(&s_fence->scheduled);
451
dma_fence_put(old);
452
}
453
454
r = amdgpu_sync_fence(&p->sync, fence, GFP_KERNEL);
455
dma_fence_put(fence);
456
if (r)
457
return r;
458
}
459
return 0;
460
}
461
462
static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p,
463
uint32_t handle, u64 point,
464
u64 flags)
465
{
466
struct dma_fence *fence;
467
int r;
468
469
r = drm_syncobj_find_fence(p->filp, handle, point, flags, &fence);
470
if (r) {
471
DRM_ERROR("syncobj %u failed to find fence @ %llu (%d)!\n",
472
handle, point, r);
473
return r;
474
}
475
476
r = amdgpu_sync_fence(&p->sync, fence, GFP_KERNEL);
477
dma_fence_put(fence);
478
return r;
479
}
480
481
static int amdgpu_cs_p2_syncobj_in(struct amdgpu_cs_parser *p,
482
struct amdgpu_cs_chunk *chunk)
483
{
484
struct drm_amdgpu_cs_chunk_sem *deps = chunk->kdata;
485
unsigned int num_deps;
486
int i, r;
487
488
num_deps = chunk->length_dw * 4 /
489
sizeof(struct drm_amdgpu_cs_chunk_sem);
490
for (i = 0; i < num_deps; ++i) {
491
r = amdgpu_syncobj_lookup_and_add(p, deps[i].handle, 0, 0);
492
if (r)
493
return r;
494
}
495
496
return 0;
497
}
498
499
static int amdgpu_cs_p2_syncobj_timeline_wait(struct amdgpu_cs_parser *p,
500
struct amdgpu_cs_chunk *chunk)
501
{
502
struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps = chunk->kdata;
503
unsigned int num_deps;
504
int i, r;
505
506
num_deps = chunk->length_dw * 4 /
507
sizeof(struct drm_amdgpu_cs_chunk_syncobj);
508
for (i = 0; i < num_deps; ++i) {
509
r = amdgpu_syncobj_lookup_and_add(p, syncobj_deps[i].handle,
510
syncobj_deps[i].point,
511
syncobj_deps[i].flags);
512
if (r)
513
return r;
514
}
515
516
return 0;
517
}
518
519
static int amdgpu_cs_p2_syncobj_out(struct amdgpu_cs_parser *p,
520
struct amdgpu_cs_chunk *chunk)
521
{
522
struct drm_amdgpu_cs_chunk_sem *deps = chunk->kdata;
523
unsigned int num_deps;
524
int i;
525
526
num_deps = chunk->length_dw * 4 /
527
sizeof(struct drm_amdgpu_cs_chunk_sem);
528
529
if (p->post_deps)
530
return -EINVAL;
531
532
p->post_deps = kmalloc_array(num_deps, sizeof(*p->post_deps),
533
GFP_KERNEL);
534
p->num_post_deps = 0;
535
536
if (!p->post_deps)
537
return -ENOMEM;
538
539
540
for (i = 0; i < num_deps; ++i) {
541
p->post_deps[i].syncobj =
542
drm_syncobj_find(p->filp, deps[i].handle);
543
if (!p->post_deps[i].syncobj)
544
return -EINVAL;
545
p->post_deps[i].chain = NULL;
546
p->post_deps[i].point = 0;
547
p->num_post_deps++;
548
}
549
550
return 0;
551
}
552
553
static int amdgpu_cs_p2_syncobj_timeline_signal(struct amdgpu_cs_parser *p,
554
struct amdgpu_cs_chunk *chunk)
555
{
556
struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps = chunk->kdata;
557
unsigned int num_deps;
558
int i;
559
560
num_deps = chunk->length_dw * 4 /
561
sizeof(struct drm_amdgpu_cs_chunk_syncobj);
562
563
if (p->post_deps)
564
return -EINVAL;
565
566
p->post_deps = kmalloc_array(num_deps, sizeof(*p->post_deps),
567
GFP_KERNEL);
568
p->num_post_deps = 0;
569
570
if (!p->post_deps)
571
return -ENOMEM;
572
573
for (i = 0; i < num_deps; ++i) {
574
struct amdgpu_cs_post_dep *dep = &p->post_deps[i];
575
576
dep->chain = NULL;
577
if (syncobj_deps[i].point) {
578
dep->chain = dma_fence_chain_alloc();
579
if (!dep->chain)
580
return -ENOMEM;
581
}
582
583
dep->syncobj = drm_syncobj_find(p->filp,
584
syncobj_deps[i].handle);
585
if (!dep->syncobj) {
586
dma_fence_chain_free(dep->chain);
587
return -EINVAL;
588
}
589
dep->point = syncobj_deps[i].point;
590
p->num_post_deps++;
591
}
592
593
return 0;
594
}
595
596
static int amdgpu_cs_p2_shadow(struct amdgpu_cs_parser *p,
597
struct amdgpu_cs_chunk *chunk)
598
{
599
struct drm_amdgpu_cs_chunk_cp_gfx_shadow *shadow = chunk->kdata;
600
int i;
601
602
if (shadow->flags & ~AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW)
603
return -EINVAL;
604
605
for (i = 0; i < p->gang_size; ++i) {
606
p->jobs[i]->shadow_va = shadow->shadow_va;
607
p->jobs[i]->csa_va = shadow->csa_va;
608
p->jobs[i]->gds_va = shadow->gds_va;
609
p->jobs[i]->init_shadow =
610
shadow->flags & AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW;
611
}
612
613
return 0;
614
}
615
616
static int amdgpu_cs_pass2(struct amdgpu_cs_parser *p)
617
{
618
unsigned int ce_preempt = 0, de_preempt = 0;
619
int i, r;
620
621
for (i = 0; i < p->nchunks; ++i) {
622
struct amdgpu_cs_chunk *chunk;
623
624
chunk = &p->chunks[i];
625
626
switch (chunk->chunk_id) {
627
case AMDGPU_CHUNK_ID_IB:
628
r = amdgpu_cs_p2_ib(p, chunk, &ce_preempt, &de_preempt);
629
if (r)
630
return r;
631
break;
632
case AMDGPU_CHUNK_ID_DEPENDENCIES:
633
case AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES:
634
r = amdgpu_cs_p2_dependencies(p, chunk);
635
if (r)
636
return r;
637
break;
638
case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
639
r = amdgpu_cs_p2_syncobj_in(p, chunk);
640
if (r)
641
return r;
642
break;
643
case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
644
r = amdgpu_cs_p2_syncobj_out(p, chunk);
645
if (r)
646
return r;
647
break;
648
case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT:
649
r = amdgpu_cs_p2_syncobj_timeline_wait(p, chunk);
650
if (r)
651
return r;
652
break;
653
case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL:
654
r = amdgpu_cs_p2_syncobj_timeline_signal(p, chunk);
655
if (r)
656
return r;
657
break;
658
case AMDGPU_CHUNK_ID_CP_GFX_SHADOW:
659
r = amdgpu_cs_p2_shadow(p, chunk);
660
if (r)
661
return r;
662
break;
663
}
664
}
665
666
return 0;
667
}
668
669
/* Convert microseconds to bytes. */
670
static u64 us_to_bytes(struct amdgpu_device *adev, s64 us)
671
{
672
if (us <= 0 || !adev->mm_stats.log2_max_MBps)
673
return 0;
674
675
/* Since accum_us is incremented by a million per second, just
676
* multiply it by the number of MB/s to get the number of bytes.
677
*/
678
return us << adev->mm_stats.log2_max_MBps;
679
}
680
681
/*
 * Convert bytes to microseconds; the inverse shift of us_to_bytes().
 *
 * Returns 0 when no log2 rate is configured.
 */
static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
{
	if (adev->mm_stats.log2_max_MBps)
		return bytes >> adev->mm_stats.log2_max_MBps;

	return 0;
}
688
689
/* Returns how many bytes TTM can move right now. If no bytes can be moved,
690
* it returns 0. If it returns non-zero, it's OK to move at least one buffer,
691
* which means it can go over the threshold once. If that happens, the driver
692
* will be in debt and no other buffer migrations can be done until that debt
693
* is repaid.
694
*
695
* This approach allows moving a buffer of any size (it's important to allow
696
* that).
697
*
698
* The currency is simply time in microseconds and it increases as the clock
699
* ticks. The accumulated microseconds (us) are converted to bytes and
700
* returned.
701
*/
702
static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
703
u64 *max_bytes,
704
u64 *max_vis_bytes)
705
{
706
s64 time_us, increment_us;
707
u64 free_vram, total_vram, used_vram;
708
/* Allow a maximum of 200 accumulated ms. This is basically per-IB
709
* throttling.
710
*
711
* It means that in order to get full max MBps, at least 5 IBs per
712
* second must be submitted and not more than 200ms apart from each
713
* other.
714
*/
715
const s64 us_upper_bound = 200000;
716
717
if (!adev->mm_stats.log2_max_MBps) {
718
*max_bytes = 0;
719
*max_vis_bytes = 0;
720
return;
721
}
722
723
total_vram = adev->gmc.real_vram_size - atomic64_read(&adev->vram_pin_size);
724
used_vram = ttm_resource_manager_usage(&adev->mman.vram_mgr.manager);
725
free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
726
727
spin_lock(&adev->mm_stats.lock);
728
729
/* Increase the amount of accumulated us. */
730
time_us = ktime_to_us(ktime_get());
731
increment_us = time_us - adev->mm_stats.last_update_us;
732
adev->mm_stats.last_update_us = time_us;
733
adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
734
us_upper_bound);
735
736
/* This prevents the short period of low performance when the VRAM
737
* usage is low and the driver is in debt or doesn't have enough
738
* accumulated us to fill VRAM quickly.
739
*
740
* The situation can occur in these cases:
741
* - a lot of VRAM is freed by userspace
742
* - the presence of a big buffer causes a lot of evictions
743
* (solution: split buffers into smaller ones)
744
*
745
* If 128 MB or 1/8th of VRAM is free, start filling it now by setting
746
* accum_us to a positive number.
747
*/
748
if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
749
s64 min_us;
750
751
/* Be more aggressive on dGPUs. Try to fill a portion of free
752
* VRAM now.
753
*/
754
if (!(adev->flags & AMD_IS_APU))
755
min_us = bytes_to_us(adev, free_vram / 4);
756
else
757
min_us = 0; /* Reset accum_us on APUs. */
758
759
adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
760
}
761
762
/* This is set to 0 if the driver is in debt to disallow (optional)
763
* buffer moves.
764
*/
765
*max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
766
767
/* Do the same for visible VRAM if half of it is free */
768
if (!amdgpu_gmc_vram_full_visible(&adev->gmc)) {
769
u64 total_vis_vram = adev->gmc.visible_vram_size;
770
u64 used_vis_vram =
771
amdgpu_vram_mgr_vis_usage(&adev->mman.vram_mgr);
772
773
if (used_vis_vram < total_vis_vram) {
774
u64 free_vis_vram = total_vis_vram - used_vis_vram;
775
776
adev->mm_stats.accum_us_vis = min(adev->mm_stats.accum_us_vis +
777
increment_us, us_upper_bound);
778
779
if (free_vis_vram >= total_vis_vram / 2)
780
adev->mm_stats.accum_us_vis =
781
max(bytes_to_us(adev, free_vis_vram / 2),
782
adev->mm_stats.accum_us_vis);
783
}
784
785
*max_vis_bytes = us_to_bytes(adev, adev->mm_stats.accum_us_vis);
786
} else {
787
*max_vis_bytes = 0;
788
}
789
790
spin_unlock(&adev->mm_stats.lock);
791
}
792
793
/* Report how many bytes have really been moved for the last command
794
* submission. This can result in a debt that can stop buffer migrations
795
* temporarily.
796
*/
797
void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
798
u64 num_vis_bytes)
799
{
800
spin_lock(&adev->mm_stats.lock);
801
adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
802
adev->mm_stats.accum_us_vis -= bytes_to_us(adev, num_vis_bytes);
803
spin_unlock(&adev->mm_stats.lock);
804
}
805
806
static int amdgpu_cs_bo_validate(void *param, struct amdgpu_bo *bo)
807
{
808
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
809
struct amdgpu_cs_parser *p = param;
810
struct ttm_operation_ctx ctx = {
811
.interruptible = true,
812
.no_wait_gpu = false,
813
.resv = bo->tbo.base.resv
814
};
815
uint32_t domain;
816
int r;
817
818
if (bo->tbo.pin_count)
819
return 0;
820
821
/* Don't move this buffer if we have depleted our allowance
822
* to move it. Don't move anything if the threshold is zero.
823
*/
824
if (p->bytes_moved < p->bytes_moved_threshold &&
825
(!bo->tbo.base.dma_buf ||
826
list_empty(&bo->tbo.base.dma_buf->attachments))) {
827
if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&
828
(bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)) {
829
/* And don't move a CPU_ACCESS_REQUIRED BO to limited
830
* visible VRAM if we've depleted our allowance to do
831
* that.
832
*/
833
if (p->bytes_moved_vis < p->bytes_moved_vis_threshold)
834
domain = bo->preferred_domains;
835
else
836
domain = bo->allowed_domains;
837
} else {
838
domain = bo->preferred_domains;
839
}
840
} else {
841
domain = bo->allowed_domains;
842
}
843
844
retry:
845
amdgpu_bo_placement_from_domain(bo, domain);
846
r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
847
848
p->bytes_moved += ctx.bytes_moved;
849
if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&
850
amdgpu_res_cpu_visible(adev, bo->tbo.resource))
851
p->bytes_moved_vis += ctx.bytes_moved;
852
853
if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) {
854
domain = bo->allowed_domains;
855
goto retry;
856
}
857
858
return r;
859
}
860
861
static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
862
union drm_amdgpu_cs *cs)
863
{
864
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
865
struct ttm_operation_ctx ctx = { true, false };
866
struct amdgpu_vm *vm = &fpriv->vm;
867
struct amdgpu_bo_list_entry *e;
868
struct drm_gem_object *obj;
869
unsigned long index;
870
unsigned int i;
871
int r;
872
873
/* p->bo_list could already be assigned if AMDGPU_CHUNK_ID_BO_HANDLES is present */
874
if (cs->in.bo_list_handle) {
875
if (p->bo_list)
876
return -EINVAL;
877
878
r = amdgpu_bo_list_get(fpriv, cs->in.bo_list_handle,
879
&p->bo_list);
880
if (r)
881
return r;
882
} else if (!p->bo_list) {
883
/* Create a empty bo_list when no handle is provided */
884
r = amdgpu_bo_list_create(p->adev, p->filp, NULL, 0,
885
&p->bo_list);
886
if (r)
887
return r;
888
}
889
890
mutex_lock(&p->bo_list->bo_list_mutex);
891
892
/* Get userptr backing pages. If pages are updated after registered
893
* in amdgpu_gem_userptr_ioctl(), amdgpu_cs_list_validate() will do
894
* amdgpu_ttm_backend_bind() to flush and invalidate new pages
895
*/
896
amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
897
bool userpage_invalidated = false;
898
struct amdgpu_bo *bo = e->bo;
899
int i;
900
901
e->user_pages = kvcalloc(bo->tbo.ttm->num_pages,
902
sizeof(struct page *),
903
GFP_KERNEL);
904
if (!e->user_pages) {
905
DRM_ERROR("kvmalloc_array failure\n");
906
r = -ENOMEM;
907
goto out_free_user_pages;
908
}
909
910
r = amdgpu_ttm_tt_get_user_pages(bo, e->user_pages, &e->range);
911
if (r) {
912
kvfree(e->user_pages);
913
e->user_pages = NULL;
914
goto out_free_user_pages;
915
}
916
917
for (i = 0; i < bo->tbo.ttm->num_pages; i++) {
918
if (bo->tbo.ttm->pages[i] != e->user_pages[i]) {
919
userpage_invalidated = true;
920
break;
921
}
922
}
923
e->user_invalidated = userpage_invalidated;
924
}
925
926
drm_exec_until_all_locked(&p->exec) {
927
r = amdgpu_vm_lock_pd(&fpriv->vm, &p->exec, 1 + p->gang_size);
928
drm_exec_retry_on_contention(&p->exec);
929
if (unlikely(r))
930
goto out_free_user_pages;
931
932
amdgpu_bo_list_for_each_entry(e, p->bo_list) {
933
/* One fence for TTM and one for each CS job */
934
r = drm_exec_prepare_obj(&p->exec, &e->bo->tbo.base,
935
1 + p->gang_size);
936
drm_exec_retry_on_contention(&p->exec);
937
if (unlikely(r))
938
goto out_free_user_pages;
939
940
e->bo_va = amdgpu_vm_bo_find(vm, e->bo);
941
}
942
943
if (p->uf_bo) {
944
r = drm_exec_prepare_obj(&p->exec, &p->uf_bo->tbo.base,
945
1 + p->gang_size);
946
drm_exec_retry_on_contention(&p->exec);
947
if (unlikely(r))
948
goto out_free_user_pages;
949
}
950
}
951
952
amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
953
struct mm_struct *usermm;
954
955
usermm = amdgpu_ttm_tt_get_usermm(e->bo->tbo.ttm);
956
if (usermm && usermm != current->mm) {
957
r = -EPERM;
958
goto out_free_user_pages;
959
}
960
961
if (amdgpu_ttm_tt_is_userptr(e->bo->tbo.ttm) &&
962
e->user_invalidated && e->user_pages) {
963
amdgpu_bo_placement_from_domain(e->bo,
964
AMDGPU_GEM_DOMAIN_CPU);
965
r = ttm_bo_validate(&e->bo->tbo, &e->bo->placement,
966
&ctx);
967
if (r)
968
goto out_free_user_pages;
969
970
amdgpu_ttm_tt_set_user_pages(e->bo->tbo.ttm,
971
e->user_pages);
972
}
973
974
kvfree(e->user_pages);
975
e->user_pages = NULL;
976
}
977
978
amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
979
&p->bytes_moved_vis_threshold);
980
p->bytes_moved = 0;
981
p->bytes_moved_vis = 0;
982
983
r = amdgpu_vm_validate(p->adev, &fpriv->vm, NULL,
984
amdgpu_cs_bo_validate, p);
985
if (r) {
986
DRM_ERROR("amdgpu_vm_validate() failed.\n");
987
goto out_free_user_pages;
988
}
989
990
drm_exec_for_each_locked_object(&p->exec, index, obj) {
991
r = amdgpu_cs_bo_validate(p, gem_to_amdgpu_bo(obj));
992
if (unlikely(r))
993
goto out_free_user_pages;
994
}
995
996
if (p->uf_bo) {
997
r = amdgpu_ttm_alloc_gart(&p->uf_bo->tbo);
998
if (unlikely(r))
999
goto out_free_user_pages;
1000
1001
p->gang_leader->uf_addr += amdgpu_bo_gpu_offset(p->uf_bo);
1002
}
1003
1004
amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
1005
p->bytes_moved_vis);
1006
1007
for (i = 0; i < p->gang_size; ++i)
1008
amdgpu_job_set_resources(p->jobs[i], p->bo_list->gds_obj,
1009
p->bo_list->gws_obj,
1010
p->bo_list->oa_obj);
1011
return 0;
1012
1013
out_free_user_pages:
1014
amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
1015
struct amdgpu_bo *bo = e->bo;
1016
1017
if (!e->user_pages)
1018
continue;
1019
amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, e->range);
1020
kvfree(e->user_pages);
1021
e->user_pages = NULL;
1022
e->range = NULL;
1023
}
1024
mutex_unlock(&p->bo_list->bo_list_mutex);
1025
return r;
1026
}
1027
1028
static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *p)
1029
{
1030
int i, j;
1031
1032
if (!trace_amdgpu_cs_enabled())
1033
return;
1034
1035
for (i = 0; i < p->gang_size; ++i) {
1036
struct amdgpu_job *job = p->jobs[i];
1037
1038
for (j = 0; j < job->num_ibs; ++j)
1039
trace_amdgpu_cs(p, job, &job->ibs[j]);
1040
}
1041
}
1042
1043
static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p,
1044
struct amdgpu_job *job)
1045
{
1046
struct amdgpu_ring *ring = amdgpu_job_ring(job);
1047
unsigned int i;
1048
int r;
1049
1050
/* Only for UVD/VCE VM emulation */
1051
if (!ring->funcs->parse_cs && !ring->funcs->patch_cs_in_place)
1052
return 0;
1053
1054
for (i = 0; i < job->num_ibs; ++i) {
1055
struct amdgpu_ib *ib = &job->ibs[i];
1056
struct amdgpu_bo_va_mapping *m;
1057
struct amdgpu_bo *aobj;
1058
uint64_t va_start;
1059
uint8_t *kptr;
1060
1061
va_start = ib->gpu_addr & AMDGPU_GMC_HOLE_MASK;
1062
r = amdgpu_cs_find_mapping(p, va_start, &aobj, &m);
1063
if (r) {
1064
DRM_ERROR("IB va_start is invalid\n");
1065
return r;
1066
}
1067
1068
if ((va_start + ib->length_dw * 4) >
1069
(m->last + 1) * AMDGPU_GPU_PAGE_SIZE) {
1070
DRM_ERROR("IB va_start+ib_bytes is invalid\n");
1071
return -EINVAL;
1072
}
1073
1074
/* the IB should be reserved at this point */
1075
r = amdgpu_bo_kmap(aobj, (void **)&kptr);
1076
if (r)
1077
return r;
1078
1079
kptr += va_start - (m->start * AMDGPU_GPU_PAGE_SIZE);
1080
1081
if (ring->funcs->parse_cs) {
1082
memcpy(ib->ptr, kptr, ib->length_dw * 4);
1083
amdgpu_bo_kunmap(aobj);
1084
1085
r = amdgpu_ring_parse_cs(ring, p, job, ib);
1086
if (r)
1087
return r;
1088
1089
if (ib->sa_bo)
1090
ib->gpu_addr = amdgpu_sa_bo_gpu_addr(ib->sa_bo);
1091
} else {
1092
ib->ptr = (uint32_t *)kptr;
1093
r = amdgpu_ring_patch_cs_in_place(ring, p, job, ib);
1094
amdgpu_bo_kunmap(aobj);
1095
if (r)
1096
return r;
1097
}
1098
}
1099
1100
return 0;
1101
}
1102
1103
static int amdgpu_cs_patch_jobs(struct amdgpu_cs_parser *p)
1104
{
1105
unsigned int i;
1106
int r;
1107
1108
for (i = 0; i < p->gang_size; ++i) {
1109
r = amdgpu_cs_patch_ibs(p, p->jobs[i]);
1110
if (r)
1111
return r;
1112
}
1113
return 0;
1114
}
1115
1116
static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
1117
{
1118
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
1119
struct amdgpu_job *job = p->gang_leader;
1120
struct amdgpu_device *adev = p->adev;
1121
struct amdgpu_vm *vm = &fpriv->vm;
1122
struct amdgpu_bo_list_entry *e;
1123
struct amdgpu_bo_va *bo_va;
1124
unsigned int i;
1125
int r;
1126
1127
/*
1128
* We can't use gang submit on with reserved VMIDs when the VM changes
1129
* can't be invalidated by more than one engine at the same time.
1130
*/
1131
if (p->gang_size > 1 && !adev->vm_manager.concurrent_flush) {
1132
for (i = 0; i < p->gang_size; ++i) {
1133
struct drm_sched_entity *entity = p->entities[i];
1134
struct drm_gpu_scheduler *sched = entity->rq->sched;
1135
struct amdgpu_ring *ring = to_amdgpu_ring(sched);
1136
1137
if (amdgpu_vmid_uses_reserved(vm, ring->vm_hub))
1138
return -EINVAL;
1139
}
1140
}
1141
1142
if (!amdgpu_vm_ready(vm))
1143
return -EINVAL;
1144
1145
r = amdgpu_vm_clear_freed(adev, vm, NULL);
1146
if (r)
1147
return r;
1148
1149
r = amdgpu_vm_bo_update(adev, fpriv->prt_va, false);
1150
if (r)
1151
return r;
1152
1153
r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update,
1154
GFP_KERNEL);
1155
if (r)
1156
return r;
1157
1158
if (fpriv->csa_va) {
1159
bo_va = fpriv->csa_va;
1160
BUG_ON(!bo_va);
1161
r = amdgpu_vm_bo_update(adev, bo_va, false);
1162
if (r)
1163
return r;
1164
1165
r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update,
1166
GFP_KERNEL);
1167
if (r)
1168
return r;
1169
}
1170
1171
/* FIXME: In theory this loop shouldn't be needed any more when
1172
* amdgpu_vm_handle_moved handles all moved BOs that are reserved
1173
* with p->ticket. But removing it caused test regressions, so I'm
1174
* leaving it here for now.
1175
*/
1176
amdgpu_bo_list_for_each_entry(e, p->bo_list) {
1177
bo_va = e->bo_va;
1178
if (bo_va == NULL)
1179
continue;
1180
1181
r = amdgpu_vm_bo_update(adev, bo_va, false);
1182
if (r)
1183
return r;
1184
1185
r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update,
1186
GFP_KERNEL);
1187
if (r)
1188
return r;
1189
}
1190
1191
r = amdgpu_vm_handle_moved(adev, vm, &p->exec.ticket);
1192
if (r)
1193
return r;
1194
1195
r = amdgpu_vm_update_pdes(adev, vm, false);
1196
if (r)
1197
return r;
1198
1199
r = amdgpu_sync_fence(&p->sync, vm->last_update, GFP_KERNEL);
1200
if (r)
1201
return r;
1202
1203
for (i = 0; i < p->gang_size; ++i) {
1204
job = p->jobs[i];
1205
1206
if (!job->vm)
1207
continue;
1208
1209
job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo);
1210
}
1211
1212
if (adev->debug_vm) {
1213
/* Invalidate all BOs to test for userspace bugs */
1214
amdgpu_bo_list_for_each_entry(e, p->bo_list) {
1215
struct amdgpu_bo *bo = e->bo;
1216
1217
/* ignore duplicates */
1218
if (!bo)
1219
continue;
1220
1221
amdgpu_vm_bo_invalidate(bo, false);
1222
}
1223
}
1224
1225
return 0;
1226
}
1227
1228
static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
1229
{
1230
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
1231
struct drm_gpu_scheduler *sched;
1232
struct drm_gem_object *obj;
1233
struct dma_fence *fence;
1234
unsigned long index;
1235
unsigned int i;
1236
int r;
1237
1238
r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entities[p->gang_leader_idx]);
1239
if (r) {
1240
if (r != -ERESTARTSYS)
1241
DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
1242
return r;
1243
}
1244
1245
drm_exec_for_each_locked_object(&p->exec, index, obj) {
1246
struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
1247
1248
struct dma_resv *resv = bo->tbo.base.resv;
1249
enum amdgpu_sync_mode sync_mode;
1250
1251
sync_mode = amdgpu_bo_explicit_sync(bo) ?
1252
AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER;
1253
r = amdgpu_sync_resv(p->adev, &p->sync, resv, sync_mode,
1254
&fpriv->vm);
1255
if (r)
1256
return r;
1257
}
1258
1259
for (i = 0; i < p->gang_size; ++i) {
1260
r = amdgpu_sync_push_to_job(&p->sync, p->jobs[i]);
1261
if (r)
1262
return r;
1263
}
1264
1265
sched = p->gang_leader->base.entity->rq->sched;
1266
while ((fence = amdgpu_sync_get_fence(&p->sync))) {
1267
struct drm_sched_fence *s_fence = to_drm_sched_fence(fence);
1268
1269
/*
1270
* When we have an dependency it might be necessary to insert a
1271
* pipeline sync to make sure that all caches etc are flushed and the
1272
* next job actually sees the results from the previous one
1273
* before we start executing on the same scheduler ring.
1274
*/
1275
if (!s_fence || s_fence->sched != sched) {
1276
dma_fence_put(fence);
1277
continue;
1278
}
1279
1280
r = amdgpu_sync_fence(&p->gang_leader->explicit_sync, fence,
1281
GFP_KERNEL);
1282
dma_fence_put(fence);
1283
if (r)
1284
return r;
1285
}
1286
return 0;
1287
}
1288
1289
static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p)
1290
{
1291
int i;
1292
1293
for (i = 0; i < p->num_post_deps; ++i) {
1294
if (p->post_deps[i].chain && p->post_deps[i].point) {
1295
drm_syncobj_add_point(p->post_deps[i].syncobj,
1296
p->post_deps[i].chain,
1297
p->fence, p->post_deps[i].point);
1298
p->post_deps[i].chain = NULL;
1299
} else {
1300
drm_syncobj_replace_fence(p->post_deps[i].syncobj,
1301
p->fence);
1302
}
1303
}
1304
}
1305
1306
static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
1307
union drm_amdgpu_cs *cs)
1308
{
1309
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
1310
struct amdgpu_job *leader = p->gang_leader;
1311
struct amdgpu_bo_list_entry *e;
1312
struct drm_gem_object *gobj;
1313
unsigned long index;
1314
unsigned int i;
1315
uint64_t seq;
1316
int r;
1317
1318
for (i = 0; i < p->gang_size; ++i)
1319
drm_sched_job_arm(&p->jobs[i]->base);
1320
1321
for (i = 0; i < p->gang_size; ++i) {
1322
struct dma_fence *fence;
1323
1324
if (p->jobs[i] == leader)
1325
continue;
1326
1327
fence = &p->jobs[i]->base.s_fence->scheduled;
1328
dma_fence_get(fence);
1329
r = drm_sched_job_add_dependency(&leader->base, fence);
1330
if (r) {
1331
dma_fence_put(fence);
1332
return r;
1333
}
1334
}
1335
1336
if (p->gang_size > 1) {
1337
for (i = 0; i < p->gang_size; ++i)
1338
amdgpu_job_set_gang_leader(p->jobs[i], leader);
1339
}
1340
1341
/* No memory allocation is allowed while holding the notifier lock.
1342
* The lock is held until amdgpu_cs_submit is finished and fence is
1343
* added to BOs.
1344
*/
1345
mutex_lock(&p->adev->notifier_lock);
1346
1347
/* If userptr are invalidated after amdgpu_cs_parser_bos(), return
1348
* -EAGAIN, drmIoctl in libdrm will restart the amdgpu_cs_ioctl.
1349
*/
1350
r = 0;
1351
amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
1352
r |= !amdgpu_ttm_tt_get_user_pages_done(e->bo->tbo.ttm,
1353
e->range);
1354
e->range = NULL;
1355
}
1356
if (r) {
1357
r = -EAGAIN;
1358
mutex_unlock(&p->adev->notifier_lock);
1359
return r;
1360
}
1361
1362
p->fence = dma_fence_get(&leader->base.s_fence->finished);
1363
drm_exec_for_each_locked_object(&p->exec, index, gobj) {
1364
1365
ttm_bo_move_to_lru_tail_unlocked(&gem_to_amdgpu_bo(gobj)->tbo);
1366
1367
/* Everybody except for the gang leader uses READ */
1368
for (i = 0; i < p->gang_size; ++i) {
1369
if (p->jobs[i] == leader)
1370
continue;
1371
1372
dma_resv_add_fence(gobj->resv,
1373
&p->jobs[i]->base.s_fence->finished,
1374
DMA_RESV_USAGE_READ);
1375
}
1376
1377
/* The gang leader as remembered as writer */
1378
dma_resv_add_fence(gobj->resv, p->fence, DMA_RESV_USAGE_WRITE);
1379
}
1380
1381
seq = amdgpu_ctx_add_fence(p->ctx, p->entities[p->gang_leader_idx],
1382
p->fence);
1383
amdgpu_cs_post_dependencies(p);
1384
1385
if ((leader->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) &&
1386
!p->ctx->preamble_presented) {
1387
leader->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST;
1388
p->ctx->preamble_presented = true;
1389
}
1390
1391
cs->out.handle = seq;
1392
leader->uf_sequence = seq;
1393
1394
amdgpu_vm_bo_trace_cs(&fpriv->vm, &p->exec.ticket);
1395
for (i = 0; i < p->gang_size; ++i) {
1396
amdgpu_job_free_resources(p->jobs[i]);
1397
trace_amdgpu_cs_ioctl(p->jobs[i]);
1398
drm_sched_entity_push_job(&p->jobs[i]->base);
1399
p->jobs[i] = NULL;
1400
}
1401
1402
amdgpu_vm_move_to_lru_tail(p->adev, &fpriv->vm);
1403
1404
mutex_unlock(&p->adev->notifier_lock);
1405
mutex_unlock(&p->bo_list->bo_list_mutex);
1406
return 0;
1407
}
1408
1409
/* Cleanup the parser structure */
1410
static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser)
1411
{
1412
unsigned int i;
1413
1414
amdgpu_sync_free(&parser->sync);
1415
drm_exec_fini(&parser->exec);
1416
1417
for (i = 0; i < parser->num_post_deps; i++) {
1418
drm_syncobj_put(parser->post_deps[i].syncobj);
1419
kfree(parser->post_deps[i].chain);
1420
}
1421
kfree(parser->post_deps);
1422
1423
dma_fence_put(parser->fence);
1424
1425
if (parser->ctx)
1426
amdgpu_ctx_put(parser->ctx);
1427
if (parser->bo_list)
1428
amdgpu_bo_list_put(parser->bo_list);
1429
1430
for (i = 0; i < parser->nchunks; i++)
1431
kvfree(parser->chunks[i].kdata);
1432
kvfree(parser->chunks);
1433
for (i = 0; i < parser->gang_size; ++i) {
1434
if (parser->jobs[i])
1435
amdgpu_job_free(parser->jobs[i]);
1436
}
1437
amdgpu_bo_unref(&parser->uf_bo);
1438
}
1439
1440
int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
1441
{
1442
struct amdgpu_device *adev = drm_to_adev(dev);
1443
struct amdgpu_cs_parser parser;
1444
int r;
1445
1446
if (amdgpu_ras_intr_triggered())
1447
return -EHWPOISON;
1448
1449
if (!adev->accel_working)
1450
return -EBUSY;
1451
1452
r = amdgpu_cs_parser_init(&parser, adev, filp, data);
1453
if (r) {
1454
DRM_ERROR_RATELIMITED("Failed to initialize parser %d!\n", r);
1455
return r;
1456
}
1457
1458
r = amdgpu_cs_pass1(&parser, data);
1459
if (r)
1460
goto error_fini;
1461
1462
r = amdgpu_cs_pass2(&parser);
1463
if (r)
1464
goto error_fini;
1465
1466
r = amdgpu_cs_parser_bos(&parser, data);
1467
if (r) {
1468
if (r == -ENOMEM)
1469
DRM_ERROR("Not enough memory for command submission!\n");
1470
else if (r != -ERESTARTSYS && r != -EAGAIN)
1471
DRM_DEBUG("Failed to process the buffer list %d!\n", r);
1472
goto error_fini;
1473
}
1474
1475
r = amdgpu_cs_patch_jobs(&parser);
1476
if (r)
1477
goto error_backoff;
1478
1479
r = amdgpu_cs_vm_handling(&parser);
1480
if (r)
1481
goto error_backoff;
1482
1483
r = amdgpu_cs_sync_rings(&parser);
1484
if (r)
1485
goto error_backoff;
1486
1487
trace_amdgpu_cs_ibs(&parser);
1488
1489
r = amdgpu_cs_submit(&parser, data);
1490
if (r)
1491
goto error_backoff;
1492
1493
amdgpu_cs_parser_fini(&parser);
1494
return 0;
1495
1496
error_backoff:
1497
mutex_unlock(&parser.bo_list->bo_list_mutex);
1498
1499
error_fini:
1500
amdgpu_cs_parser_fini(&parser);
1501
return r;
1502
}
1503
1504
/**
1505
* amdgpu_cs_wait_ioctl - wait for a command submission to finish
1506
*
1507
* @dev: drm device
1508
* @data: data from userspace
1509
* @filp: file private
1510
*
1511
* Wait for the command submission identified by handle to finish.
1512
*/
1513
int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data,
1514
struct drm_file *filp)
1515
{
1516
union drm_amdgpu_wait_cs *wait = data;
1517
unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout);
1518
struct drm_sched_entity *entity;
1519
struct amdgpu_ctx *ctx;
1520
struct dma_fence *fence;
1521
long r;
1522
1523
ctx = amdgpu_ctx_get(filp->driver_priv, wait->in.ctx_id);
1524
if (ctx == NULL)
1525
return -EINVAL;
1526
1527
r = amdgpu_ctx_get_entity(ctx, wait->in.ip_type, wait->in.ip_instance,
1528
wait->in.ring, &entity);
1529
if (r) {
1530
amdgpu_ctx_put(ctx);
1531
return r;
1532
}
1533
1534
fence = amdgpu_ctx_get_fence(ctx, entity, wait->in.handle);
1535
if (IS_ERR(fence))
1536
r = PTR_ERR(fence);
1537
else if (fence) {
1538
r = dma_fence_wait_timeout(fence, true, timeout);
1539
if (r > 0 && fence->error)
1540
r = fence->error;
1541
dma_fence_put(fence);
1542
} else
1543
r = 1;
1544
1545
amdgpu_ctx_put(ctx);
1546
if (r < 0)
1547
return r;
1548
1549
memset(wait, 0, sizeof(*wait));
1550
wait->out.status = (r == 0);
1551
1552
return 0;
1553
}
1554
1555
/**
1556
* amdgpu_cs_get_fence - helper to get fence from drm_amdgpu_fence
1557
*
1558
* @adev: amdgpu device
1559
* @filp: file private
1560
* @user: drm_amdgpu_fence copied from user space
1561
*/
1562
static struct dma_fence *amdgpu_cs_get_fence(struct amdgpu_device *adev,
1563
struct drm_file *filp,
1564
struct drm_amdgpu_fence *user)
1565
{
1566
struct drm_sched_entity *entity;
1567
struct amdgpu_ctx *ctx;
1568
struct dma_fence *fence;
1569
int r;
1570
1571
ctx = amdgpu_ctx_get(filp->driver_priv, user->ctx_id);
1572
if (ctx == NULL)
1573
return ERR_PTR(-EINVAL);
1574
1575
r = amdgpu_ctx_get_entity(ctx, user->ip_type, user->ip_instance,
1576
user->ring, &entity);
1577
if (r) {
1578
amdgpu_ctx_put(ctx);
1579
return ERR_PTR(r);
1580
}
1581
1582
fence = amdgpu_ctx_get_fence(ctx, entity, user->seq_no);
1583
amdgpu_ctx_put(ctx);
1584
1585
return fence;
1586
}
1587
1588
int amdgpu_cs_fence_to_handle_ioctl(struct drm_device *dev, void *data,
1589
struct drm_file *filp)
1590
{
1591
struct amdgpu_device *adev = drm_to_adev(dev);
1592
union drm_amdgpu_fence_to_handle *info = data;
1593
struct dma_fence *fence;
1594
struct drm_syncobj *syncobj;
1595
struct sync_file *sync_file;
1596
int fd, r;
1597
1598
fence = amdgpu_cs_get_fence(adev, filp, &info->in.fence);
1599
if (IS_ERR(fence))
1600
return PTR_ERR(fence);
1601
1602
if (!fence)
1603
fence = dma_fence_get_stub();
1604
1605
switch (info->in.what) {
1606
case AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ:
1607
r = drm_syncobj_create(&syncobj, 0, fence);
1608
dma_fence_put(fence);
1609
if (r)
1610
return r;
1611
r = drm_syncobj_get_handle(filp, syncobj, &info->out.handle);
1612
drm_syncobj_put(syncobj);
1613
return r;
1614
1615
case AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD:
1616
r = drm_syncobj_create(&syncobj, 0, fence);
1617
dma_fence_put(fence);
1618
if (r)
1619
return r;
1620
r = drm_syncobj_get_fd(syncobj, (int *)&info->out.handle);
1621
drm_syncobj_put(syncobj);
1622
return r;
1623
1624
case AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD:
1625
fd = get_unused_fd_flags(O_CLOEXEC);
1626
if (fd < 0) {
1627
dma_fence_put(fence);
1628
return fd;
1629
}
1630
1631
sync_file = sync_file_create(fence);
1632
dma_fence_put(fence);
1633
if (!sync_file) {
1634
put_unused_fd(fd);
1635
return -ENOMEM;
1636
}
1637
1638
fd_install(fd, sync_file->file);
1639
info->out.handle = fd;
1640
return 0;
1641
1642
default:
1643
dma_fence_put(fence);
1644
return -EINVAL;
1645
}
1646
}
1647
1648
/**
1649
* amdgpu_cs_wait_all_fences - wait on all fences to signal
1650
*
1651
* @adev: amdgpu device
1652
* @filp: file private
1653
* @wait: wait parameters
1654
* @fences: array of drm_amdgpu_fence
1655
*/
1656
static int amdgpu_cs_wait_all_fences(struct amdgpu_device *adev,
1657
struct drm_file *filp,
1658
union drm_amdgpu_wait_fences *wait,
1659
struct drm_amdgpu_fence *fences)
1660
{
1661
uint32_t fence_count = wait->in.fence_count;
1662
unsigned int i;
1663
long r = 1;
1664
1665
for (i = 0; i < fence_count; i++) {
1666
struct dma_fence *fence;
1667
unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout_ns);
1668
1669
fence = amdgpu_cs_get_fence(adev, filp, &fences[i]);
1670
if (IS_ERR(fence))
1671
return PTR_ERR(fence);
1672
else if (!fence)
1673
continue;
1674
1675
r = dma_fence_wait_timeout(fence, true, timeout);
1676
if (r > 0 && fence->error)
1677
r = fence->error;
1678
1679
dma_fence_put(fence);
1680
if (r < 0)
1681
return r;
1682
1683
if (r == 0)
1684
break;
1685
}
1686
1687
memset(wait, 0, sizeof(*wait));
1688
wait->out.status = (r > 0);
1689
1690
return 0;
1691
}
1692
1693
/**
1694
* amdgpu_cs_wait_any_fence - wait on any fence to signal
1695
*
1696
* @adev: amdgpu device
1697
* @filp: file private
1698
* @wait: wait parameters
1699
* @fences: array of drm_amdgpu_fence
1700
*/
1701
static int amdgpu_cs_wait_any_fence(struct amdgpu_device *adev,
1702
struct drm_file *filp,
1703
union drm_amdgpu_wait_fences *wait,
1704
struct drm_amdgpu_fence *fences)
1705
{
1706
unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout_ns);
1707
uint32_t fence_count = wait->in.fence_count;
1708
uint32_t first = ~0;
1709
struct dma_fence **array;
1710
unsigned int i;
1711
long r;
1712
1713
/* Prepare the fence array */
1714
array = kcalloc(fence_count, sizeof(struct dma_fence *), GFP_KERNEL);
1715
1716
if (array == NULL)
1717
return -ENOMEM;
1718
1719
for (i = 0; i < fence_count; i++) {
1720
struct dma_fence *fence;
1721
1722
fence = amdgpu_cs_get_fence(adev, filp, &fences[i]);
1723
if (IS_ERR(fence)) {
1724
r = PTR_ERR(fence);
1725
goto err_free_fence_array;
1726
} else if (fence) {
1727
array[i] = fence;
1728
} else { /* NULL, the fence has been already signaled */
1729
r = 1;
1730
first = i;
1731
goto out;
1732
}
1733
}
1734
1735
r = dma_fence_wait_any_timeout(array, fence_count, true, timeout,
1736
&first);
1737
if (r < 0)
1738
goto err_free_fence_array;
1739
1740
out:
1741
memset(wait, 0, sizeof(*wait));
1742
wait->out.status = (r > 0);
1743
wait->out.first_signaled = first;
1744
1745
if (first < fence_count && array[first])
1746
r = array[first]->error;
1747
else
1748
r = 0;
1749
1750
err_free_fence_array:
1751
for (i = 0; i < fence_count; i++)
1752
dma_fence_put(array[i]);
1753
kfree(array);
1754
1755
return r;
1756
}
1757
1758
/**
1759
* amdgpu_cs_wait_fences_ioctl - wait for multiple command submissions to finish
1760
*
1761
* @dev: drm device
1762
* @data: data from userspace
1763
* @filp: file private
1764
*/
1765
int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data,
1766
struct drm_file *filp)
1767
{
1768
struct amdgpu_device *adev = drm_to_adev(dev);
1769
union drm_amdgpu_wait_fences *wait = data;
1770
uint32_t fence_count = wait->in.fence_count;
1771
struct drm_amdgpu_fence *fences_user;
1772
struct drm_amdgpu_fence *fences;
1773
int r;
1774
1775
/* Get the fences from userspace */
1776
fences = kmalloc_array(fence_count, sizeof(struct drm_amdgpu_fence),
1777
GFP_KERNEL);
1778
if (fences == NULL)
1779
return -ENOMEM;
1780
1781
fences_user = u64_to_user_ptr(wait->in.fences);
1782
if (copy_from_user(fences, fences_user,
1783
sizeof(struct drm_amdgpu_fence) * fence_count)) {
1784
r = -EFAULT;
1785
goto err_free_fences;
1786
}
1787
1788
if (wait->in.wait_all)
1789
r = amdgpu_cs_wait_all_fences(adev, filp, wait, fences);
1790
else
1791
r = amdgpu_cs_wait_any_fence(adev, filp, wait, fences);
1792
1793
err_free_fences:
1794
kfree(fences);
1795
1796
return r;
1797
}
1798
1799
/**
1800
* amdgpu_cs_find_mapping - find bo_va for VM address
1801
*
1802
* @parser: command submission parser context
1803
* @addr: VM address
1804
* @bo: resulting BO of the mapping found
1805
* @map: Placeholder to return found BO mapping
1806
*
1807
* Search the buffer objects in the command submission context for a certain
1808
* virtual memory address. Returns allocation structure when found, NULL
1809
* otherwise.
1810
*/
1811
int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser,
1812
uint64_t addr, struct amdgpu_bo **bo,
1813
struct amdgpu_bo_va_mapping **map)
1814
{
1815
struct amdgpu_fpriv *fpriv = parser->filp->driver_priv;
1816
struct ttm_operation_ctx ctx = { false, false };
1817
struct amdgpu_vm *vm = &fpriv->vm;
1818
struct amdgpu_bo_va_mapping *mapping;
1819
int i, r;
1820
1821
addr /= AMDGPU_GPU_PAGE_SIZE;
1822
1823
mapping = amdgpu_vm_bo_lookup_mapping(vm, addr);
1824
if (!mapping || !mapping->bo_va || !mapping->bo_va->base.bo)
1825
return -EINVAL;
1826
1827
*bo = mapping->bo_va->base.bo;
1828
*map = mapping;
1829
1830
/* Double check that the BO is reserved by this CS */
1831
if (dma_resv_locking_ctx((*bo)->tbo.base.resv) != &parser->exec.ticket)
1832
return -EINVAL;
1833
1834
/* Make sure VRAM is allocated contigiously */
1835
(*bo)->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
1836
if ((*bo)->tbo.resource->mem_type == TTM_PL_VRAM &&
1837
!((*bo)->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
1838
1839
amdgpu_bo_placement_from_domain(*bo, (*bo)->allowed_domains);
1840
for (i = 0; i < (*bo)->placement.num_placement; i++)
1841
(*bo)->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
1842
r = ttm_bo_validate(&(*bo)->tbo, &(*bo)->placement, &ctx);
1843
if (r)
1844
return r;
1845
}
1846
1847
return amdgpu_ttm_alloc_gart(&(*bo)->tbo);
1848
}
1849
1850