GitHub Repository: torvalds/linux
Path: blob/master/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
1
/*
2
* Copyright 2008 Jerome Glisse.
3
* All Rights Reserved.
4
*
5
* Permission is hereby granted, free of charge, to any person obtaining a
6
* copy of this software and associated documentation files (the "Software"),
7
* to deal in the Software without restriction, including without limitation
8
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
* and/or sell copies of the Software, and to permit persons to whom the
10
* Software is furnished to do so, subject to the following conditions:
11
*
12
* The above copyright notice and this permission notice (including the next
13
* paragraph) shall be included in all copies or substantial portions of the
14
* Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
* PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22
* DEALINGS IN THE SOFTWARE.
23
*
24
* Authors:
25
* Jerome Glisse <[email protected]>
26
*/
27
28
#include <linux/file.h>
29
#include <linux/pagemap.h>
30
#include <linux/sync_file.h>
31
#include <linux/dma-buf.h>
32
33
#include <drm/amdgpu_drm.h>
34
#include <drm/drm_syncobj.h>
35
#include <drm/ttm/ttm_tt.h>
36
37
#include "amdgpu_cs.h"
38
#include "amdgpu.h"
39
#include "amdgpu_trace.h"
40
#include "amdgpu_gmc.h"
41
#include "amdgpu_gem.h"
42
#include "amdgpu_ras.h"
43
#include "amdgpu_hmm.h"
44
45
static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p,
46
struct amdgpu_device *adev,
47
struct drm_file *filp,
48
union drm_amdgpu_cs *cs)
49
{
50
struct amdgpu_fpriv *fpriv = filp->driver_priv;
51
52
if (cs->in.num_chunks == 0)
53
return -EINVAL;
54
55
memset(p, 0, sizeof(*p));
56
p->adev = adev;
57
p->filp = filp;
58
59
p->ctx = amdgpu_ctx_get(fpriv, cs->in.ctx_id);
60
if (!p->ctx)
61
return -EINVAL;
62
63
if (atomic_read(&p->ctx->guilty)) {
64
amdgpu_ctx_put(p->ctx);
65
return -ECANCELED;
66
}
67
68
amdgpu_sync_create(&p->sync);
69
drm_exec_init(&p->exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
70
DRM_EXEC_IGNORE_DUPLICATES, 0);
71
return 0;
72
}
73
74
static int amdgpu_cs_job_idx(struct amdgpu_cs_parser *p,
75
struct drm_amdgpu_cs_chunk_ib *chunk_ib)
76
{
77
struct drm_sched_entity *entity;
78
unsigned int i;
79
int r;
80
81
r = amdgpu_ctx_get_entity(p->ctx, chunk_ib->ip_type,
82
chunk_ib->ip_instance,
83
chunk_ib->ring, &entity);
84
if (r)
85
return r;
86
87
/*
88
* Abort if there is no run queue associated with this entity.
89
* Possibly because of disabled HW IP.
90
*/
91
if (entity->rq == NULL)
92
return -EINVAL;
93
94
/* Check if we can add this IB to some existing job */
95
for (i = 0; i < p->gang_size; ++i)
96
if (p->entities[i] == entity)
97
return i;
98
99
/* If not, increase the gang size if possible */
100
if (i == AMDGPU_CS_GANG_SIZE)
101
return -EINVAL;
102
103
p->entities[i] = entity;
104
p->gang_size = i + 1;
105
return i;
106
}
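/*
 * Note on the return value above: a non-negative result is the slot of the
 * IB inside the gang, a negative result is an error code.  As a hypothetical
 * example (assuming AMDGPU_CS_GANG_SIZE permits it), IB chunks targeting
 * first a GFX entity and then a compute entity would get the indices 0 and 1
 * and leave gang_size at 2.
 */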
107
108
static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
109
struct drm_amdgpu_cs_chunk_ib *chunk_ib,
110
unsigned int *num_ibs)
111
{
112
int r;
113
114
r = amdgpu_cs_job_idx(p, chunk_ib);
115
if (r < 0)
116
return r;
117
118
if (num_ibs[r] >= amdgpu_ring_max_ibs(chunk_ib->ip_type))
119
return -EINVAL;
120
121
++(num_ibs[r]);
122
p->gang_leader_idx = r;
123
return 0;
124
}
125
126
static int amdgpu_cs_p1_user_fence(struct amdgpu_cs_parser *p,
127
struct drm_amdgpu_cs_chunk_fence *data,
128
uint32_t *offset)
129
{
130
struct drm_gem_object *gobj;
131
unsigned long size;
132
133
gobj = drm_gem_object_lookup(p->filp, data->handle);
134
if (gobj == NULL)
135
return -EINVAL;
136
137
p->uf_bo = amdgpu_bo_ref(gem_to_amdgpu_bo(gobj));
138
drm_gem_object_put(gobj);
139
140
size = amdgpu_bo_size(p->uf_bo);
141
if (size != PAGE_SIZE || data->offset > (size - 8))
142
return -EINVAL;
143
144
if (amdgpu_ttm_tt_get_usermm(p->uf_bo->tbo.ttm))
145
return -EINVAL;
146
147
*offset = data->offset;
148
return 0;
149
}
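/*
 * Worked example for the size check above: the user fence BO must be exactly
 * one page and the offset must leave room for an 8 byte fence value, so with
 * a (typical, assumed) 4 KiB page data->offset may range from 0 to 4088.
 */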
150
151
static int amdgpu_cs_p1_bo_handles(struct amdgpu_cs_parser *p,
152
struct drm_amdgpu_bo_list_in *data)
153
{
154
struct drm_amdgpu_bo_list_entry *info;
155
int r;
156
157
r = amdgpu_bo_create_list_entry_array(data, &info);
158
if (r)
159
return r;
160
161
r = amdgpu_bo_list_create(p->adev, p->filp, info, data->bo_number,
162
&p->bo_list);
163
if (r)
164
goto error_free;
165
166
kvfree(info);
167
return 0;
168
169
error_free:
170
kvfree(info);
171
172
return r;
173
}
174
175
/* Copy the data from userspace and go over it the first time */
176
static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
177
union drm_amdgpu_cs *cs)
178
{
179
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
180
unsigned int num_ibs[AMDGPU_CS_GANG_SIZE] = { };
181
struct amdgpu_vm *vm = &fpriv->vm;
182
uint64_t *chunk_array;
183
uint32_t uf_offset = 0;
184
size_t size;
185
int ret;
186
int i;
187
188
chunk_array = memdup_array_user(u64_to_user_ptr(cs->in.chunks),
189
cs->in.num_chunks,
190
sizeof(uint64_t));
191
if (IS_ERR(chunk_array))
192
return PTR_ERR(chunk_array);
193
194
p->nchunks = cs->in.num_chunks;
195
p->chunks = kvmalloc_array(p->nchunks, sizeof(struct amdgpu_cs_chunk),
196
GFP_KERNEL);
197
if (!p->chunks) {
198
ret = -ENOMEM;
199
goto free_chunk;
200
}
201
202
for (i = 0; i < p->nchunks; i++) {
203
struct drm_amdgpu_cs_chunk __user *chunk_ptr = NULL;
204
struct drm_amdgpu_cs_chunk user_chunk;
205
206
chunk_ptr = u64_to_user_ptr(chunk_array[i]);
207
if (copy_from_user(&user_chunk, chunk_ptr,
208
sizeof(struct drm_amdgpu_cs_chunk))) {
209
ret = -EFAULT;
210
i--;
211
goto free_partial_kdata;
212
}
213
p->chunks[i].chunk_id = user_chunk.chunk_id;
214
p->chunks[i].length_dw = user_chunk.length_dw;
215
216
size = p->chunks[i].length_dw;
217
218
p->chunks[i].kdata = vmemdup_array_user(u64_to_user_ptr(user_chunk.chunk_data),
219
size,
220
sizeof(uint32_t));
221
if (IS_ERR(p->chunks[i].kdata)) {
222
ret = PTR_ERR(p->chunks[i].kdata);
223
i--;
224
goto free_partial_kdata;
225
}
226
size *= sizeof(uint32_t);
227
228
/* Assume the worst on the following checks */
229
ret = -EINVAL;
230
switch (p->chunks[i].chunk_id) {
231
case AMDGPU_CHUNK_ID_IB:
232
if (size < sizeof(struct drm_amdgpu_cs_chunk_ib))
233
goto free_partial_kdata;
234
235
ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, num_ibs);
236
if (ret)
237
goto free_partial_kdata;
238
break;
239
240
case AMDGPU_CHUNK_ID_FENCE:
241
if (size < sizeof(struct drm_amdgpu_cs_chunk_fence))
242
goto free_partial_kdata;
243
244
ret = amdgpu_cs_p1_user_fence(p, p->chunks[i].kdata,
245
&uf_offset);
246
if (ret)
247
goto free_partial_kdata;
248
break;
249
250
case AMDGPU_CHUNK_ID_BO_HANDLES:
251
if (size < sizeof(struct drm_amdgpu_bo_list_in))
252
goto free_partial_kdata;
253
254
/* Only a single BO list is allowed to simplify handling. */
255
if (p->bo_list)
256
goto free_partial_kdata;
257
258
ret = amdgpu_cs_p1_bo_handles(p, p->chunks[i].kdata);
259
if (ret)
260
goto free_partial_kdata;
261
break;
262
263
case AMDGPU_CHUNK_ID_DEPENDENCIES:
264
case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
265
case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
266
case AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES:
267
case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT:
268
case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL:
269
case AMDGPU_CHUNK_ID_CP_GFX_SHADOW:
270
break;
271
272
default:
273
goto free_partial_kdata;
274
}
275
}
276
277
if (!p->gang_size || (amdgpu_sriov_vf(p->adev) && p->gang_size > 1)) {
278
ret = -EINVAL;
279
goto free_all_kdata;
280
}
281
282
for (i = 0; i < p->gang_size; ++i) {
283
ret = amdgpu_job_alloc(p->adev, vm, p->entities[i], vm,
284
num_ibs[i], &p->jobs[i],
285
p->filp->client_id);
286
if (ret)
287
goto free_all_kdata;
288
switch (p->adev->enforce_isolation[fpriv->xcp_id]) {
289
case AMDGPU_ENFORCE_ISOLATION_DISABLE:
290
default:
291
p->jobs[i]->enforce_isolation = false;
292
p->jobs[i]->run_cleaner_shader = false;
293
break;
294
case AMDGPU_ENFORCE_ISOLATION_ENABLE:
295
p->jobs[i]->enforce_isolation = true;
296
p->jobs[i]->run_cleaner_shader = true;
297
break;
298
case AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY:
299
p->jobs[i]->enforce_isolation = true;
300
p->jobs[i]->run_cleaner_shader = false;
301
break;
302
case AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER:
303
p->jobs[i]->enforce_isolation = true;
304
p->jobs[i]->run_cleaner_shader = false;
305
break;
306
}
307
}
308
p->gang_leader = p->jobs[p->gang_leader_idx];
309
310
if (p->ctx->generation != p->gang_leader->generation) {
311
ret = -ECANCELED;
312
goto free_all_kdata;
313
}
314
315
if (p->uf_bo)
316
p->gang_leader->uf_addr = uf_offset;
317
kvfree(chunk_array);
318
319
/* Use this opportunity to fill in task info for the vm */
320
amdgpu_vm_set_task_info(vm);
321
322
return 0;
323
324
free_all_kdata:
325
i = p->nchunks - 1;
326
free_partial_kdata:
327
for (; i >= 0; i--)
328
kvfree(p->chunks[i].kdata);
329
kvfree(p->chunks);
330
p->chunks = NULL;
331
p->nchunks = 0;
332
free_chunk:
333
kvfree(chunk_array);
334
335
return ret;
336
}
337
338
static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
339
struct amdgpu_cs_chunk *chunk,
340
unsigned int *ce_preempt,
341
unsigned int *de_preempt)
342
{
343
struct drm_amdgpu_cs_chunk_ib *chunk_ib = chunk->kdata;
344
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
345
struct amdgpu_vm *vm = &fpriv->vm;
346
struct amdgpu_ring *ring;
347
struct amdgpu_job *job;
348
struct amdgpu_ib *ib;
349
int r;
350
351
r = amdgpu_cs_job_idx(p, chunk_ib);
352
if (r < 0)
353
return r;
354
355
job = p->jobs[r];
356
ring = amdgpu_job_ring(job);
357
ib = &job->ibs[job->num_ibs++];
358
359
/* submissions to kernel queues are disabled */
360
if (ring->no_user_submission)
361
return -EINVAL;
362
363
/* MM engine doesn't support user fences */
364
if (p->uf_bo && ring->funcs->no_user_fence)
365
return -EINVAL;
366
367
if (!p->adev->debug_enable_ce_cs &&
368
chunk_ib->flags & AMDGPU_IB_FLAG_CE) {
369
dev_err_ratelimited(p->adev->dev, "CE CS is blocked, use debug=0x400 to override\n");
370
return -EINVAL;
371
}
372
373
if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX &&
374
chunk_ib->flags & AMDGPU_IB_FLAG_PREEMPT) {
375
if (chunk_ib->flags & AMDGPU_IB_FLAG_CE)
376
(*ce_preempt)++;
377
else
378
(*de_preempt)++;
379
380
/* Each GFX command submit allows at most one preemptible IB
381
* each for CE and DE */
382
if (*ce_preempt > 1 || *de_preempt > 1)
383
return -EINVAL;
384
}
385
386
if (chunk_ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
387
job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT;
388
389
r = amdgpu_ib_get(p->adev, vm, ring->funcs->parse_cs ?
390
chunk_ib->ib_bytes : 0,
391
AMDGPU_IB_POOL_DELAYED, ib);
392
if (r) {
393
drm_err(adev_to_drm(p->adev), "Failed to get ib !\n");
394
return r;
395
}
396
397
ib->gpu_addr = chunk_ib->va_start;
398
ib->length_dw = chunk_ib->ib_bytes / 4;
399
ib->flags = chunk_ib->flags;
400
return 0;
401
}
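/*
 * Illustration of the length_dw conversion above: ib_bytes is a byte count
 * while the ring works in 32 bit dwords, hence the division by 4.  A
 * hypothetical 256 byte IB therefore ends up with length_dw == 64.
 */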
402
403
static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p,
404
struct amdgpu_cs_chunk *chunk)
405
{
406
struct drm_amdgpu_cs_chunk_dep *deps = chunk->kdata;
407
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
408
unsigned int num_deps;
409
int i, r;
410
411
num_deps = chunk->length_dw * 4 /
412
sizeof(struct drm_amdgpu_cs_chunk_dep);
413
414
for (i = 0; i < num_deps; ++i) {
415
struct amdgpu_ctx *ctx;
416
struct drm_sched_entity *entity;
417
struct dma_fence *fence;
418
419
ctx = amdgpu_ctx_get(fpriv, deps[i].ctx_id);
420
if (ctx == NULL)
421
return -EINVAL;
422
423
r = amdgpu_ctx_get_entity(ctx, deps[i].ip_type,
424
deps[i].ip_instance,
425
deps[i].ring, &entity);
426
if (r) {
427
amdgpu_ctx_put(ctx);
428
return r;
429
}
430
431
fence = amdgpu_ctx_get_fence(ctx, entity, deps[i].handle);
432
amdgpu_ctx_put(ctx);
433
434
if (IS_ERR(fence))
435
return PTR_ERR(fence);
436
else if (!fence)
437
continue;
438
439
if (chunk->chunk_id == AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES) {
440
struct drm_sched_fence *s_fence;
441
struct dma_fence *old = fence;
442
443
s_fence = to_drm_sched_fence(fence);
444
fence = dma_fence_get(&s_fence->scheduled);
445
dma_fence_put(old);
446
}
447
448
r = amdgpu_sync_fence(&p->sync, fence, GFP_KERNEL);
449
dma_fence_put(fence);
450
if (r)
451
return r;
452
}
453
return 0;
454
}
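/*
 * Illustration of the num_deps calculation above: chunk->length_dw counts
 * 32 bit dwords, so the byte length is length_dw * 4 and the number of
 * complete drm_amdgpu_cs_chunk_dep entries is that divided by the entry
 * size; a trailing partial entry is simply ignored by the integer division.
 */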
455
456
static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p,
457
uint32_t handle, u64 point,
458
u64 flags)
459
{
460
struct dma_fence *fence;
461
int r;
462
463
r = drm_syncobj_find_fence(p->filp, handle, point, flags, &fence);
464
if (r) {
465
drm_err(adev_to_drm(p->adev), "syncobj %u failed to find fence @ %llu (%d)!\n",
466
handle, point, r);
467
return r;
468
}
469
470
r = amdgpu_sync_fence(&p->sync, fence, GFP_KERNEL);
471
dma_fence_put(fence);
472
return r;
473
}
474
475
static int amdgpu_cs_p2_syncobj_in(struct amdgpu_cs_parser *p,
476
struct amdgpu_cs_chunk *chunk)
477
{
478
struct drm_amdgpu_cs_chunk_sem *deps = chunk->kdata;
479
unsigned int num_deps;
480
int i, r;
481
482
num_deps = chunk->length_dw * 4 /
483
sizeof(struct drm_amdgpu_cs_chunk_sem);
484
for (i = 0; i < num_deps; ++i) {
485
r = amdgpu_syncobj_lookup_and_add(p, deps[i].handle, 0, 0);
486
if (r)
487
return r;
488
}
489
490
return 0;
491
}
492
493
static int amdgpu_cs_p2_syncobj_timeline_wait(struct amdgpu_cs_parser *p,
494
struct amdgpu_cs_chunk *chunk)
495
{
496
struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps = chunk->kdata;
497
unsigned int num_deps;
498
int i, r;
499
500
num_deps = chunk->length_dw * 4 /
501
sizeof(struct drm_amdgpu_cs_chunk_syncobj);
502
for (i = 0; i < num_deps; ++i) {
503
r = amdgpu_syncobj_lookup_and_add(p, syncobj_deps[i].handle,
504
syncobj_deps[i].point,
505
syncobj_deps[i].flags);
506
if (r)
507
return r;
508
}
509
510
return 0;
511
}
512
513
static int amdgpu_cs_p2_syncobj_out(struct amdgpu_cs_parser *p,
514
struct amdgpu_cs_chunk *chunk)
515
{
516
struct drm_amdgpu_cs_chunk_sem *deps = chunk->kdata;
517
unsigned int num_deps;
518
int i;
519
520
num_deps = chunk->length_dw * 4 /
521
sizeof(struct drm_amdgpu_cs_chunk_sem);
522
523
if (p->post_deps)
524
return -EINVAL;
525
526
p->post_deps = kmalloc_array(num_deps, sizeof(*p->post_deps),
527
GFP_KERNEL);
528
p->num_post_deps = 0;
529
530
if (!p->post_deps)
531
return -ENOMEM;
532
533
534
for (i = 0; i < num_deps; ++i) {
535
p->post_deps[i].syncobj =
536
drm_syncobj_find(p->filp, deps[i].handle);
537
if (!p->post_deps[i].syncobj)
538
return -EINVAL;
539
p->post_deps[i].chain = NULL;
540
p->post_deps[i].point = 0;
541
p->num_post_deps++;
542
}
543
544
return 0;
545
}
546
547
static int amdgpu_cs_p2_syncobj_timeline_signal(struct amdgpu_cs_parser *p,
548
struct amdgpu_cs_chunk *chunk)
549
{
550
struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps = chunk->kdata;
551
unsigned int num_deps;
552
int i;
553
554
num_deps = chunk->length_dw * 4 /
555
sizeof(struct drm_amdgpu_cs_chunk_syncobj);
556
557
if (p->post_deps)
558
return -EINVAL;
559
560
p->post_deps = kmalloc_array(num_deps, sizeof(*p->post_deps),
561
GFP_KERNEL);
562
p->num_post_deps = 0;
563
564
if (!p->post_deps)
565
return -ENOMEM;
566
567
for (i = 0; i < num_deps; ++i) {
568
struct amdgpu_cs_post_dep *dep = &p->post_deps[i];
569
570
dep->chain = NULL;
571
if (syncobj_deps[i].point) {
572
dep->chain = dma_fence_chain_alloc();
573
if (!dep->chain)
574
return -ENOMEM;
575
}
576
577
dep->syncobj = drm_syncobj_find(p->filp,
578
syncobj_deps[i].handle);
579
if (!dep->syncobj) {
580
dma_fence_chain_free(dep->chain);
581
return -EINVAL;
582
}
583
dep->point = syncobj_deps[i].point;
584
p->num_post_deps++;
585
}
586
587
return 0;
588
}
589
590
static int amdgpu_cs_p2_shadow(struct amdgpu_cs_parser *p,
591
struct amdgpu_cs_chunk *chunk)
592
{
593
struct drm_amdgpu_cs_chunk_cp_gfx_shadow *shadow = chunk->kdata;
594
int i;
595
596
if (shadow->flags & ~AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW)
597
return -EINVAL;
598
599
for (i = 0; i < p->gang_size; ++i) {
600
p->jobs[i]->shadow_va = shadow->shadow_va;
601
p->jobs[i]->csa_va = shadow->csa_va;
602
p->jobs[i]->gds_va = shadow->gds_va;
603
p->jobs[i]->init_shadow =
604
shadow->flags & AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW;
605
}
606
607
return 0;
608
}
609
610
static int amdgpu_cs_pass2(struct amdgpu_cs_parser *p)
611
{
612
unsigned int ce_preempt = 0, de_preempt = 0;
613
int i, r;
614
615
for (i = 0; i < p->nchunks; ++i) {
616
struct amdgpu_cs_chunk *chunk;
617
618
chunk = &p->chunks[i];
619
620
switch (chunk->chunk_id) {
621
case AMDGPU_CHUNK_ID_IB:
622
r = amdgpu_cs_p2_ib(p, chunk, &ce_preempt, &de_preempt);
623
if (r)
624
return r;
625
break;
626
case AMDGPU_CHUNK_ID_DEPENDENCIES:
627
case AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES:
628
r = amdgpu_cs_p2_dependencies(p, chunk);
629
if (r)
630
return r;
631
break;
632
case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
633
r = amdgpu_cs_p2_syncobj_in(p, chunk);
634
if (r)
635
return r;
636
break;
637
case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
638
r = amdgpu_cs_p2_syncobj_out(p, chunk);
639
if (r)
640
return r;
641
break;
642
case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT:
643
r = amdgpu_cs_p2_syncobj_timeline_wait(p, chunk);
644
if (r)
645
return r;
646
break;
647
case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL:
648
r = amdgpu_cs_p2_syncobj_timeline_signal(p, chunk);
649
if (r)
650
return r;
651
break;
652
case AMDGPU_CHUNK_ID_CP_GFX_SHADOW:
653
r = amdgpu_cs_p2_shadow(p, chunk);
654
if (r)
655
return r;
656
break;
657
}
658
}
659
660
return 0;
661
}
662
663
/* Convert microseconds to bytes. */
664
static u64 us_to_bytes(struct amdgpu_device *adev, s64 us)
665
{
666
if (us <= 0 || !adev->mm_stats.log2_max_MBps)
667
return 0;
668
669
/* Since accum_us is incremented by a million per second, just
670
* multiply it by the number of MB/s to get the number of bytes.
671
*/
672
return us << adev->mm_stats.log2_max_MBps;
673
}
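/*
 * Worked example for the shift above (log2_max_MBps value assumed): with
 * log2_max_MBps == 6, i.e. 64 MB/s, one second of accumulated time
 * (1,000,000 us) converts to 1000000 << 6 = 64,000,000 bytes, so the shift
 * implements "microseconds * MB per second" without a multiplication.
 */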
674
675
static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
676
{
677
if (!adev->mm_stats.log2_max_MBps)
678
return 0;
679
680
return bytes >> adev->mm_stats.log2_max_MBps;
681
}
682
683
/* Returns how many bytes TTM can move right now. If no bytes can be moved,
684
* it returns 0. If it returns non-zero, it's OK to move at least one buffer,
685
* which means it can go over the threshold once. If that happens, the driver
686
* will be in debt and no other buffer migrations can be done until that debt
687
* is repaid.
688
*
689
* This approach allows moving a buffer of any size (it's important to allow
690
* that).
691
*
692
* The currency is simply time in microseconds and it increases as the clock
693
* ticks. The accumulated microseconds (us) are converted to bytes and
694
* returned.
695
*/
696
static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
697
u64 *max_bytes,
698
u64 *max_vis_bytes)
699
{
700
s64 time_us, increment_us;
701
u64 free_vram, total_vram, used_vram;
702
/* Allow a maximum of 200 accumulated ms. This is basically per-IB
703
* throttling.
704
*
705
* It means that in order to get full max MBps, at least 5 IBs per
706
* second must be submitted and not more than 200ms apart from each
707
* other.
708
*/
709
const s64 us_upper_bound = 200000;
710
711
if ((!adev->mm_stats.log2_max_MBps) || !ttm_resource_manager_used(&adev->mman.vram_mgr.manager)) {
712
*max_bytes = 0;
713
*max_vis_bytes = 0;
714
return;
715
}
716
717
total_vram = adev->gmc.real_vram_size - atomic64_read(&adev->vram_pin_size);
718
used_vram = ttm_resource_manager_usage(&adev->mman.vram_mgr.manager);
719
free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
720
721
spin_lock(&adev->mm_stats.lock);
722
723
/* Increase the amount of accumulated us. */
724
time_us = ktime_to_us(ktime_get());
725
increment_us = time_us - adev->mm_stats.last_update_us;
726
adev->mm_stats.last_update_us = time_us;
727
adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
728
us_upper_bound);
729
730
/* This prevents the short period of low performance when the VRAM
731
* usage is low and the driver is in debt or doesn't have enough
732
* accumulated us to fill VRAM quickly.
733
*
734
* The situation can occur in these cases:
735
* - a lot of VRAM is freed by userspace
736
* - the presence of a big buffer causes a lot of evictions
737
* (solution: split buffers into smaller ones)
738
*
739
* If 128 MB or 1/8th of VRAM is free, start filling it now by setting
740
* accum_us to a positive number.
741
*/
742
if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
743
s64 min_us;
744
745
/* Be more aggressive on dGPUs. Try to fill a portion of free
746
* VRAM now.
747
*/
748
if (!(adev->flags & AMD_IS_APU))
749
min_us = bytes_to_us(adev, free_vram / 4);
750
else
751
min_us = 0; /* Reset accum_us on APUs. */
752
753
adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
754
}
755
756
/* This is set to 0 if the driver is in debt to disallow (optional)
757
* buffer moves.
758
*/
759
*max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
760
761
/* Do the same for visible VRAM if half of it is free */
762
if (!amdgpu_gmc_vram_full_visible(&adev->gmc)) {
763
u64 total_vis_vram = adev->gmc.visible_vram_size;
764
u64 used_vis_vram =
765
amdgpu_vram_mgr_vis_usage(&adev->mman.vram_mgr);
766
767
if (used_vis_vram < total_vis_vram) {
768
u64 free_vis_vram = total_vis_vram - used_vis_vram;
769
770
adev->mm_stats.accum_us_vis = min(adev->mm_stats.accum_us_vis +
771
increment_us, us_upper_bound);
772
773
if (free_vis_vram >= total_vis_vram / 2)
774
adev->mm_stats.accum_us_vis =
775
max(bytes_to_us(adev, free_vis_vram / 2),
776
adev->mm_stats.accum_us_vis);
777
}
778
779
*max_vis_bytes = us_to_bytes(adev, adev->mm_stats.accum_us_vis);
780
} else {
781
*max_vis_bytes = 0;
782
}
783
784
spin_unlock(&adev->mm_stats.lock);
785
}
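/*
 * Rough numbers for the throttling above (log2_max_MBps value assumed): with
 * the 200000 us upper bound and log2_max_MBps == 6 (64 MB/s), the per
 * submission move budget tops out at 200000 << 6 = 12,800,000 bytes, about
 * 12 MiB, no matter how long the driver has been idle.
 */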
786
787
/* Report how many bytes have really been moved for the last command
788
* submission. This can result in a debt that can stop buffer migrations
789
* temporarily.
790
*/
791
void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
792
u64 num_vis_bytes)
793
{
794
spin_lock(&adev->mm_stats.lock);
795
adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
796
adev->mm_stats.accum_us_vis -= bytes_to_us(adev, num_vis_bytes);
797
spin_unlock(&adev->mm_stats.lock);
798
}
799
800
static int amdgpu_cs_bo_validate(void *param, struct amdgpu_bo *bo)
801
{
802
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
803
struct amdgpu_cs_parser *p = param;
804
struct ttm_operation_ctx ctx = {
805
.interruptible = true,
806
.no_wait_gpu = false,
807
.resv = bo->tbo.base.resv
808
};
809
uint32_t domain;
810
int r;
811
812
if (bo->tbo.pin_count)
813
return 0;
814
815
/* Don't move this buffer if we have depleted our allowance
816
* to move it. Don't move anything if the threshold is zero.
817
*/
818
if (p->bytes_moved < p->bytes_moved_threshold &&
819
(!bo->tbo.base.dma_buf ||
820
list_empty(&bo->tbo.base.dma_buf->attachments))) {
821
if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&
822
(bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)) {
823
/* And don't move a CPU_ACCESS_REQUIRED BO to limited
824
* visible VRAM if we've depleted our allowance to do
825
* that.
826
*/
827
if (p->bytes_moved_vis < p->bytes_moved_vis_threshold)
828
domain = bo->preferred_domains;
829
else
830
domain = bo->allowed_domains;
831
} else {
832
domain = bo->preferred_domains;
833
}
834
} else {
835
domain = bo->allowed_domains;
836
}
837
838
retry:
839
amdgpu_bo_placement_from_domain(bo, domain);
840
r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
841
842
p->bytes_moved += ctx.bytes_moved;
843
if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&
844
amdgpu_res_cpu_visible(adev, bo->tbo.resource))
845
p->bytes_moved_vis += ctx.bytes_moved;
846
847
if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) {
848
domain = bo->allowed_domains;
849
goto retry;
850
}
851
852
return r;
853
}
854
855
static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
856
union drm_amdgpu_cs *cs)
857
{
858
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
859
struct ttm_operation_ctx ctx = { true, false };
860
struct amdgpu_vm *vm = &fpriv->vm;
861
struct amdgpu_bo_list_entry *e;
862
struct drm_gem_object *obj;
863
unsigned long index;
864
unsigned int i;
865
int r;
866
867
/* p->bo_list could already be assigned if AMDGPU_CHUNK_ID_BO_HANDLES is present */
868
if (cs->in.bo_list_handle) {
869
if (p->bo_list)
870
return -EINVAL;
871
872
r = amdgpu_bo_list_get(fpriv, cs->in.bo_list_handle,
873
&p->bo_list);
874
if (r)
875
return r;
876
} else if (!p->bo_list) {
877
/* Create an empty bo_list when no handle is provided */
878
r = amdgpu_bo_list_create(p->adev, p->filp, NULL, 0,
879
&p->bo_list);
880
if (r)
881
return r;
882
}
883
884
mutex_lock(&p->bo_list->bo_list_mutex);
885
886
/* Get userptr backing pages. If pages are updated after being registered
887
* in amdgpu_gem_userptr_ioctl(), amdgpu_cs_list_validate() will do
888
* amdgpu_ttm_backend_bind() to flush and invalidate new pages
889
*/
890
amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
891
bool userpage_invalidated = false;
892
struct amdgpu_bo *bo = e->bo;
893
894
e->range = amdgpu_hmm_range_alloc(NULL);
895
if (unlikely(!e->range))
896
return -ENOMEM;
897
898
r = amdgpu_ttm_tt_get_user_pages(bo, e->range);
899
if (r)
900
goto out_free_user_pages;
901
902
for (i = 0; i < bo->tbo.ttm->num_pages; i++) {
903
if (bo->tbo.ttm->pages[i] !=
904
hmm_pfn_to_page(e->range->hmm_range.hmm_pfns[i])) {
905
userpage_invalidated = true;
906
break;
907
}
908
}
909
e->user_invalidated = userpage_invalidated;
910
}
911
912
drm_exec_until_all_locked(&p->exec) {
913
r = amdgpu_vm_lock_pd(&fpriv->vm, &p->exec, 1 + p->gang_size);
914
drm_exec_retry_on_contention(&p->exec);
915
if (unlikely(r))
916
goto out_free_user_pages;
917
918
amdgpu_bo_list_for_each_entry(e, p->bo_list) {
919
/* One fence for TTM and one for each CS job */
920
r = drm_exec_prepare_obj(&p->exec, &e->bo->tbo.base,
921
1 + p->gang_size);
922
drm_exec_retry_on_contention(&p->exec);
923
if (unlikely(r))
924
goto out_free_user_pages;
925
926
e->bo_va = amdgpu_vm_bo_find(vm, e->bo);
927
}
928
929
if (p->uf_bo) {
930
r = drm_exec_prepare_obj(&p->exec, &p->uf_bo->tbo.base,
931
1 + p->gang_size);
932
drm_exec_retry_on_contention(&p->exec);
933
if (unlikely(r))
934
goto out_free_user_pages;
935
}
936
}
937
938
amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
939
struct mm_struct *usermm;
940
941
usermm = amdgpu_ttm_tt_get_usermm(e->bo->tbo.ttm);
942
if (usermm && usermm != current->mm) {
943
r = -EPERM;
944
goto out_free_user_pages;
945
}
946
947
if (amdgpu_ttm_tt_is_userptr(e->bo->tbo.ttm) &&
948
e->user_invalidated) {
949
amdgpu_bo_placement_from_domain(e->bo,
950
AMDGPU_GEM_DOMAIN_CPU);
951
r = ttm_bo_validate(&e->bo->tbo, &e->bo->placement,
952
&ctx);
953
if (r)
954
goto out_free_user_pages;
955
956
amdgpu_ttm_tt_set_user_pages(e->bo->tbo.ttm,
957
e->range);
958
}
959
}
960
961
amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
962
&p->bytes_moved_vis_threshold);
963
p->bytes_moved = 0;
964
p->bytes_moved_vis = 0;
965
966
r = amdgpu_vm_validate(p->adev, &fpriv->vm, NULL,
967
amdgpu_cs_bo_validate, p);
968
if (r) {
969
drm_err(adev_to_drm(p->adev), "amdgpu_vm_validate() failed.\n");
970
goto out_free_user_pages;
971
}
972
973
drm_exec_for_each_locked_object(&p->exec, index, obj) {
974
r = amdgpu_cs_bo_validate(p, gem_to_amdgpu_bo(obj));
975
if (unlikely(r))
976
goto out_free_user_pages;
977
}
978
979
if (p->uf_bo) {
980
r = amdgpu_ttm_alloc_gart(&p->uf_bo->tbo);
981
if (unlikely(r))
982
goto out_free_user_pages;
983
984
p->gang_leader->uf_addr += amdgpu_bo_gpu_offset(p->uf_bo);
985
}
986
987
amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
988
p->bytes_moved_vis);
989
990
for (i = 0; i < p->gang_size; ++i)
991
amdgpu_job_set_resources(p->jobs[i], p->bo_list->gds_obj,
992
p->bo_list->gws_obj,
993
p->bo_list->oa_obj);
994
return 0;
995
996
out_free_user_pages:
997
amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
998
amdgpu_hmm_range_free(e->range);
999
e->range = NULL;
1000
}
1001
mutex_unlock(&p->bo_list->bo_list_mutex);
1002
return r;
1003
}
1004
1005
static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *p)
1006
{
1007
int i, j;
1008
1009
if (!trace_amdgpu_cs_enabled())
1010
return;
1011
1012
for (i = 0; i < p->gang_size; ++i) {
1013
struct amdgpu_job *job = p->jobs[i];
1014
1015
for (j = 0; j < job->num_ibs; ++j)
1016
trace_amdgpu_cs(p, job, &job->ibs[j]);
1017
}
1018
}
1019
1020
static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p,
1021
struct amdgpu_job *job)
1022
{
1023
struct amdgpu_ring *ring = amdgpu_job_ring(job);
1024
unsigned int i;
1025
int r;
1026
1027
/* Only for UVD/VCE VM emulation */
1028
if (!ring->funcs->parse_cs && !ring->funcs->patch_cs_in_place)
1029
return 0;
1030
1031
for (i = 0; i < job->num_ibs; ++i) {
1032
struct amdgpu_ib *ib = &job->ibs[i];
1033
struct amdgpu_bo_va_mapping *m;
1034
struct amdgpu_bo *aobj;
1035
uint64_t va_start;
1036
uint8_t *kptr;
1037
1038
va_start = ib->gpu_addr & AMDGPU_GMC_HOLE_MASK;
1039
r = amdgpu_cs_find_mapping(p, va_start, &aobj, &m);
1040
if (r) {
1041
drm_err(adev_to_drm(p->adev), "IB va_start is invalid\n");
1042
return r;
1043
}
1044
1045
if ((va_start + ib->length_dw * 4) >
1046
(m->last + 1) * AMDGPU_GPU_PAGE_SIZE) {
1047
drm_err(adev_to_drm(p->adev), "IB va_start+ib_bytes is invalid\n");
1048
return -EINVAL;
1049
}
1050
1051
/* the IB should be reserved at this point */
1052
r = amdgpu_bo_kmap(aobj, (void **)&kptr);
1053
if (r)
1054
return r;
1055
1056
kptr += va_start - (m->start * AMDGPU_GPU_PAGE_SIZE);
1057
1058
if (ring->funcs->parse_cs) {
1059
memcpy(ib->ptr, kptr, ib->length_dw * 4);
1060
amdgpu_bo_kunmap(aobj);
1061
1062
r = amdgpu_ring_parse_cs(ring, p, job, ib);
1063
if (r)
1064
return r;
1065
1066
if (ib->sa_bo)
1067
ib->gpu_addr = amdgpu_sa_bo_gpu_addr(ib->sa_bo);
1068
} else {
1069
ib->ptr = (uint32_t *)kptr;
1070
r = amdgpu_ring_patch_cs_in_place(ring, p, job, ib);
1071
amdgpu_bo_kunmap(aobj);
1072
if (r)
1073
return r;
1074
}
1075
}
1076
1077
return 0;
1078
}
1079
1080
static int amdgpu_cs_patch_jobs(struct amdgpu_cs_parser *p)
1081
{
1082
unsigned int i;
1083
int r;
1084
1085
for (i = 0; i < p->gang_size; ++i) {
1086
r = amdgpu_cs_patch_ibs(p, p->jobs[i]);
1087
if (r)
1088
return r;
1089
}
1090
return 0;
1091
}
1092
1093
static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
1094
{
1095
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
1096
struct amdgpu_job *job = p->gang_leader;
1097
struct amdgpu_device *adev = p->adev;
1098
struct amdgpu_vm *vm = &fpriv->vm;
1099
struct amdgpu_bo_list_entry *e;
1100
struct amdgpu_bo_va *bo_va;
1101
unsigned int i;
1102
int r;
1103
1104
/*
1105
* We can't use gang submit with reserved VMIDs when the VM changes
1106
* can't be invalidated by more than one engine at the same time.
1107
*/
1108
if (p->gang_size > 1 && !adev->vm_manager.concurrent_flush) {
1109
for (i = 0; i < p->gang_size; ++i) {
1110
struct drm_sched_entity *entity = p->entities[i];
1111
struct drm_gpu_scheduler *sched = entity->rq->sched;
1112
struct amdgpu_ring *ring = to_amdgpu_ring(sched);
1113
1114
if (amdgpu_vmid_uses_reserved(vm, ring->vm_hub))
1115
return -EINVAL;
1116
}
1117
}
1118
1119
if (!amdgpu_vm_ready(vm))
1120
return -EINVAL;
1121
1122
r = amdgpu_vm_clear_freed(adev, vm, NULL);
1123
if (r)
1124
return r;
1125
1126
r = amdgpu_vm_bo_update(adev, fpriv->prt_va, false);
1127
if (r)
1128
return r;
1129
1130
r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update,
1131
GFP_KERNEL);
1132
if (r)
1133
return r;
1134
1135
if (fpriv->csa_va) {
1136
bo_va = fpriv->csa_va;
1137
BUG_ON(!bo_va);
1138
r = amdgpu_vm_bo_update(adev, bo_va, false);
1139
if (r)
1140
return r;
1141
1142
r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update,
1143
GFP_KERNEL);
1144
if (r)
1145
return r;
1146
}
1147
1148
/* FIXME: In theory this loop shouldn't be needed any more when
1149
* amdgpu_vm_handle_moved handles all moved BOs that are reserved
1150
* with p->ticket. But removing it caused test regressions, so I'm
1151
* leaving it here for now.
1152
*/
1153
amdgpu_bo_list_for_each_entry(e, p->bo_list) {
1154
bo_va = e->bo_va;
1155
if (bo_va == NULL)
1156
continue;
1157
1158
r = amdgpu_vm_bo_update(adev, bo_va, false);
1159
if (r)
1160
return r;
1161
1162
r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update,
1163
GFP_KERNEL);
1164
if (r)
1165
return r;
1166
}
1167
1168
r = amdgpu_vm_handle_moved(adev, vm, &p->exec.ticket);
1169
if (r)
1170
return r;
1171
1172
r = amdgpu_vm_update_pdes(adev, vm, false);
1173
if (r)
1174
return r;
1175
1176
r = amdgpu_sync_fence(&p->sync, vm->last_update, GFP_KERNEL);
1177
if (r)
1178
return r;
1179
1180
for (i = 0; i < p->gang_size; ++i) {
1181
job = p->jobs[i];
1182
1183
if (!job->vm)
1184
continue;
1185
1186
job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo);
1187
}
1188
1189
if (adev->debug_vm) {
1190
/* Invalidate all BOs to test for userspace bugs */
1191
amdgpu_bo_list_for_each_entry(e, p->bo_list) {
1192
struct amdgpu_bo *bo = e->bo;
1193
1194
/* ignore duplicates */
1195
if (!bo)
1196
continue;
1197
1198
amdgpu_vm_bo_invalidate(bo, false);
1199
}
1200
}
1201
1202
return 0;
1203
}
1204
1205
static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
1206
{
1207
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
1208
struct drm_gpu_scheduler *sched;
1209
struct drm_gem_object *obj;
1210
struct dma_fence *fence;
1211
unsigned long index;
1212
unsigned int i;
1213
int r;
1214
1215
r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entities[p->gang_leader_idx]);
1216
if (r) {
1217
if (r != -ERESTARTSYS)
1218
drm_err(adev_to_drm(p->adev), "amdgpu_ctx_wait_prev_fence failed.\n");
1219
return r;
1220
}
1221
1222
drm_exec_for_each_locked_object(&p->exec, index, obj) {
1223
struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
1224
1225
struct dma_resv *resv = bo->tbo.base.resv;
1226
enum amdgpu_sync_mode sync_mode;
1227
1228
sync_mode = amdgpu_bo_explicit_sync(bo) ?
1229
AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER;
1230
r = amdgpu_sync_resv(p->adev, &p->sync, resv, sync_mode,
1231
&fpriv->vm);
1232
if (r)
1233
return r;
1234
}
1235
1236
for (i = 0; i < p->gang_size; ++i) {
1237
r = amdgpu_sync_push_to_job(&p->sync, p->jobs[i]);
1238
if (r)
1239
return r;
1240
}
1241
1242
sched = p->gang_leader->base.entity->rq->sched;
1243
while ((fence = amdgpu_sync_get_fence(&p->sync))) {
1244
struct drm_sched_fence *s_fence = to_drm_sched_fence(fence);
1245
1246
/*
1247
* When we have a dependency it might be necessary to insert a
1248
* pipeline sync to make sure that all caches etc are flushed and the
1249
* next job actually sees the results from the previous one
1250
* before we start executing on the same scheduler ring.
1251
*/
1252
if (!s_fence || s_fence->sched != sched) {
1253
dma_fence_put(fence);
1254
continue;
1255
}
1256
1257
r = amdgpu_sync_fence(&p->gang_leader->explicit_sync, fence,
1258
GFP_KERNEL);
1259
dma_fence_put(fence);
1260
if (r)
1261
return r;
1262
}
1263
return 0;
1264
}
1265
1266
static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p)
1267
{
1268
int i;
1269
1270
for (i = 0; i < p->num_post_deps; ++i) {
1271
if (p->post_deps[i].chain && p->post_deps[i].point) {
1272
drm_syncobj_add_point(p->post_deps[i].syncobj,
1273
p->post_deps[i].chain,
1274
p->fence, p->post_deps[i].point);
1275
p->post_deps[i].chain = NULL;
1276
} else {
1277
drm_syncobj_replace_fence(p->post_deps[i].syncobj,
1278
p->fence);
1279
}
1280
}
1281
}
1282
1283
static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
1284
union drm_amdgpu_cs *cs)
1285
{
1286
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
1287
struct amdgpu_job *leader = p->gang_leader;
1288
struct amdgpu_bo_list_entry *e;
1289
struct drm_gem_object *gobj;
1290
unsigned long index;
1291
unsigned int i;
1292
uint64_t seq;
1293
int r;
1294
1295
for (i = 0; i < p->gang_size; ++i)
1296
drm_sched_job_arm(&p->jobs[i]->base);
1297
1298
for (i = 0; i < p->gang_size; ++i) {
1299
struct dma_fence *fence;
1300
1301
if (p->jobs[i] == leader)
1302
continue;
1303
1304
fence = &p->jobs[i]->base.s_fence->scheduled;
1305
dma_fence_get(fence);
1306
r = drm_sched_job_add_dependency(&leader->base, fence);
1307
if (r) {
1308
dma_fence_put(fence);
1309
return r;
1310
}
1311
}
1312
1313
if (p->gang_size > 1) {
1314
for (i = 0; i < p->gang_size; ++i)
1315
amdgpu_job_set_gang_leader(p->jobs[i], leader);
1316
}
1317
1318
/* No memory allocation is allowed while holding the notifier lock.
1319
* The lock is held until amdgpu_cs_submit is finished and the fence is
1320
* added to BOs.
1321
*/
1322
mutex_lock(&p->adev->notifier_lock);
1323
1324
/* If userptrs are invalidated after amdgpu_cs_parser_bos(), return
1325
* -EAGAIN, drmIoctl in libdrm will restart the amdgpu_cs_ioctl.
1326
*/
1327
r = 0;
1328
amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
1329
r |= !amdgpu_hmm_range_valid(e->range);
1330
amdgpu_hmm_range_free(e->range);
1331
e->range = NULL;
1332
}
1333
if (r) {
1334
r = -EAGAIN;
1335
mutex_unlock(&p->adev->notifier_lock);
1336
return r;
1337
}
1338
1339
p->fence = dma_fence_get(&leader->base.s_fence->finished);
1340
drm_exec_for_each_locked_object(&p->exec, index, gobj) {
1341
1342
ttm_bo_move_to_lru_tail_unlocked(&gem_to_amdgpu_bo(gobj)->tbo);
1343
1344
/* Everybody except for the gang leader uses READ */
1345
for (i = 0; i < p->gang_size; ++i) {
1346
if (p->jobs[i] == leader)
1347
continue;
1348
1349
dma_resv_add_fence(gobj->resv,
1350
&p->jobs[i]->base.s_fence->finished,
1351
DMA_RESV_USAGE_READ);
1352
}
1353
1354
/* The gang leader is remembered as writer */
1355
dma_resv_add_fence(gobj->resv, p->fence, DMA_RESV_USAGE_WRITE);
1356
}
1357
1358
seq = amdgpu_ctx_add_fence(p->ctx, p->entities[p->gang_leader_idx],
1359
p->fence);
1360
amdgpu_cs_post_dependencies(p);
1361
1362
if ((leader->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) &&
1363
!p->ctx->preamble_presented) {
1364
leader->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST;
1365
p->ctx->preamble_presented = true;
1366
}
1367
1368
cs->out.handle = seq;
1369
leader->uf_sequence = seq;
1370
1371
amdgpu_vm_bo_trace_cs(&fpriv->vm, &p->exec.ticket);
1372
for (i = 0; i < p->gang_size; ++i) {
1373
amdgpu_job_free_resources(p->jobs[i]);
1374
trace_amdgpu_cs_ioctl(p->jobs[i]);
1375
drm_sched_entity_push_job(&p->jobs[i]->base);
1376
p->jobs[i] = NULL;
1377
}
1378
1379
amdgpu_vm_move_to_lru_tail(p->adev, &fpriv->vm);
1380
1381
mutex_unlock(&p->adev->notifier_lock);
1382
mutex_unlock(&p->bo_list->bo_list_mutex);
1383
return 0;
1384
}
1385
1386
/* Cleanup the parser structure */
1387
static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser)
1388
{
1389
unsigned int i;
1390
1391
amdgpu_sync_free(&parser->sync);
1392
drm_exec_fini(&parser->exec);
1393
1394
for (i = 0; i < parser->num_post_deps; i++) {
1395
drm_syncobj_put(parser->post_deps[i].syncobj);
1396
kfree(parser->post_deps[i].chain);
1397
}
1398
kfree(parser->post_deps);
1399
1400
dma_fence_put(parser->fence);
1401
1402
if (parser->ctx)
1403
amdgpu_ctx_put(parser->ctx);
1404
if (parser->bo_list)
1405
amdgpu_bo_list_put(parser->bo_list);
1406
1407
for (i = 0; i < parser->nchunks; i++)
1408
kvfree(parser->chunks[i].kdata);
1409
kvfree(parser->chunks);
1410
for (i = 0; i < parser->gang_size; ++i) {
1411
if (parser->jobs[i])
1412
amdgpu_job_free(parser->jobs[i]);
1413
}
1414
amdgpu_bo_unref(&parser->uf_bo);
1415
}
1416
1417
int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
1418
{
1419
struct amdgpu_device *adev = drm_to_adev(dev);
1420
struct amdgpu_cs_parser parser;
1421
int r;
1422
1423
if (amdgpu_ras_intr_triggered())
1424
return -EHWPOISON;
1425
1426
if (!adev->accel_working)
1427
return -EBUSY;
1428
1429
r = amdgpu_cs_parser_init(&parser, adev, filp, data);
1430
if (r) {
1431
drm_err_ratelimited(dev, "Failed to initialize parser %d!\n", r);
1432
return r;
1433
}
1434
1435
r = amdgpu_cs_pass1(&parser, data);
1436
if (r)
1437
goto error_fini;
1438
1439
r = amdgpu_cs_pass2(&parser);
1440
if (r)
1441
goto error_fini;
1442
1443
r = amdgpu_cs_parser_bos(&parser, data);
1444
if (r) {
1445
if (r == -ENOMEM)
1446
drm_err(dev, "Not enough memory for command submission!\n");
1447
else if (r != -ERESTARTSYS && r != -EAGAIN)
1448
drm_dbg(dev, "Failed to process the buffer list %d!\n", r);
1449
goto error_fini;
1450
}
1451
1452
r = amdgpu_cs_patch_jobs(&parser);
1453
if (r)
1454
goto error_backoff;
1455
1456
r = amdgpu_cs_vm_handling(&parser);
1457
if (r)
1458
goto error_backoff;
1459
1460
r = amdgpu_cs_sync_rings(&parser);
1461
if (r)
1462
goto error_backoff;
1463
1464
trace_amdgpu_cs_ibs(&parser);
1465
1466
r = amdgpu_cs_submit(&parser, data);
1467
if (r)
1468
goto error_backoff;
1469
1470
amdgpu_cs_parser_fini(&parser);
1471
return 0;
1472
1473
error_backoff:
1474
mutex_unlock(&parser.bo_list->bo_list_mutex);
1475
1476
error_fini:
1477
amdgpu_cs_parser_fini(&parser);
1478
return r;
1479
}
1480
1481
/**
1482
* amdgpu_cs_wait_ioctl - wait for a command submission to finish
1483
*
1484
* @dev: drm device
1485
* @data: data from userspace
1486
* @filp: file private
1487
*
1488
* Wait for the command submission identified by handle to finish.
1489
*/
1490
int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data,
1491
struct drm_file *filp)
1492
{
1493
union drm_amdgpu_wait_cs *wait = data;
1494
unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout);
1495
struct drm_sched_entity *entity;
1496
struct amdgpu_ctx *ctx;
1497
struct dma_fence *fence;
1498
long r;
1499
1500
ctx = amdgpu_ctx_get(filp->driver_priv, wait->in.ctx_id);
1501
if (ctx == NULL)
1502
return -EINVAL;
1503
1504
r = amdgpu_ctx_get_entity(ctx, wait->in.ip_type, wait->in.ip_instance,
1505
wait->in.ring, &entity);
1506
if (r) {
1507
amdgpu_ctx_put(ctx);
1508
return r;
1509
}
1510
1511
fence = amdgpu_ctx_get_fence(ctx, entity, wait->in.handle);
1512
if (IS_ERR(fence))
1513
r = PTR_ERR(fence);
1514
else if (fence) {
1515
r = dma_fence_wait_timeout(fence, true, timeout);
1516
if (r > 0 && fence->error)
1517
r = fence->error;
1518
dma_fence_put(fence);
1519
} else
1520
r = 1;
1521
1522
amdgpu_ctx_put(ctx);
1523
if (r < 0)
1524
return r;
1525
1526
memset(wait, 0, sizeof(*wait));
1527
wait->out.status = (r == 0);
1528
1529
return 0;
1530
}
1531
1532
/**
1533
* amdgpu_cs_get_fence - helper to get fence from drm_amdgpu_fence
1534
*
1535
* @adev: amdgpu device
1536
* @filp: file private
1537
* @user: drm_amdgpu_fence copied from user space
1538
*/
1539
static struct dma_fence *amdgpu_cs_get_fence(struct amdgpu_device *adev,
1540
struct drm_file *filp,
1541
struct drm_amdgpu_fence *user)
1542
{
1543
struct drm_sched_entity *entity;
1544
struct amdgpu_ctx *ctx;
1545
struct dma_fence *fence;
1546
int r;
1547
1548
ctx = amdgpu_ctx_get(filp->driver_priv, user->ctx_id);
1549
if (ctx == NULL)
1550
return ERR_PTR(-EINVAL);
1551
1552
r = amdgpu_ctx_get_entity(ctx, user->ip_type, user->ip_instance,
1553
user->ring, &entity);
1554
if (r) {
1555
amdgpu_ctx_put(ctx);
1556
return ERR_PTR(r);
1557
}
1558
1559
fence = amdgpu_ctx_get_fence(ctx, entity, user->seq_no);
1560
amdgpu_ctx_put(ctx);
1561
1562
return fence;
1563
}
1564
1565
int amdgpu_cs_fence_to_handle_ioctl(struct drm_device *dev, void *data,
1566
struct drm_file *filp)
1567
{
1568
struct amdgpu_device *adev = drm_to_adev(dev);
1569
union drm_amdgpu_fence_to_handle *info = data;
1570
struct dma_fence *fence;
1571
struct drm_syncobj *syncobj;
1572
struct sync_file *sync_file;
1573
int fd, r;
1574
1575
fence = amdgpu_cs_get_fence(adev, filp, &info->in.fence);
1576
if (IS_ERR(fence))
1577
return PTR_ERR(fence);
1578
1579
if (!fence)
1580
fence = dma_fence_get_stub();
1581
1582
switch (info->in.what) {
1583
case AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ:
1584
r = drm_syncobj_create(&syncobj, 0, fence);
1585
dma_fence_put(fence);
1586
if (r)
1587
return r;
1588
r = drm_syncobj_get_handle(filp, syncobj, &info->out.handle);
1589
drm_syncobj_put(syncobj);
1590
return r;
1591
1592
case AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD:
1593
r = drm_syncobj_create(&syncobj, 0, fence);
1594
dma_fence_put(fence);
1595
if (r)
1596
return r;
1597
r = drm_syncobj_get_fd(syncobj, (int *)&info->out.handle);
1598
drm_syncobj_put(syncobj);
1599
return r;
1600
1601
case AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD:
1602
fd = get_unused_fd_flags(O_CLOEXEC);
1603
if (fd < 0) {
1604
dma_fence_put(fence);
1605
return fd;
1606
}
1607
1608
sync_file = sync_file_create(fence);
1609
dma_fence_put(fence);
1610
if (!sync_file) {
1611
put_unused_fd(fd);
1612
return -ENOMEM;
1613
}
1614
1615
fd_install(fd, sync_file->file);
1616
info->out.handle = fd;
1617
return 0;
1618
1619
default:
1620
dma_fence_put(fence);
1621
return -EINVAL;
1622
}
1623
}
1624
1625
/**
1626
* amdgpu_cs_wait_all_fences - wait on all fences to signal
1627
*
1628
* @adev: amdgpu device
1629
* @filp: file private
1630
* @wait: wait parameters
1631
* @fences: array of drm_amdgpu_fence
1632
*/
1633
static int amdgpu_cs_wait_all_fences(struct amdgpu_device *adev,
1634
struct drm_file *filp,
1635
union drm_amdgpu_wait_fences *wait,
1636
struct drm_amdgpu_fence *fences)
1637
{
1638
uint32_t fence_count = wait->in.fence_count;
1639
unsigned int i;
1640
long r = 1;
1641
1642
for (i = 0; i < fence_count; i++) {
1643
struct dma_fence *fence;
1644
unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout_ns);
1645
1646
fence = amdgpu_cs_get_fence(adev, filp, &fences[i]);
1647
if (IS_ERR(fence))
1648
return PTR_ERR(fence);
1649
else if (!fence)
1650
continue;
1651
1652
r = dma_fence_wait_timeout(fence, true, timeout);
1653
if (r > 0 && fence->error)
1654
r = fence->error;
1655
1656
dma_fence_put(fence);
1657
if (r < 0)
1658
return r;
1659
1660
if (r == 0)
1661
break;
1662
}
1663
1664
memset(wait, 0, sizeof(*wait));
1665
wait->out.status = (r > 0);
1666
1667
return 0;
1668
}
1669
1670
/**
1671
* amdgpu_cs_wait_any_fence - wait on any fence to signal
1672
*
1673
* @adev: amdgpu device
1674
* @filp: file private
1675
* @wait: wait parameters
1676
* @fences: array of drm_amdgpu_fence
1677
*/
1678
static int amdgpu_cs_wait_any_fence(struct amdgpu_device *adev,
1679
struct drm_file *filp,
1680
union drm_amdgpu_wait_fences *wait,
1681
struct drm_amdgpu_fence *fences)
1682
{
1683
unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout_ns);
1684
uint32_t fence_count = wait->in.fence_count;
1685
uint32_t first = ~0;
1686
struct dma_fence **array;
1687
unsigned int i;
1688
long r;
1689
1690
/* Prepare the fence array */
1691
array = kcalloc(fence_count, sizeof(struct dma_fence *), GFP_KERNEL);
1692
1693
if (array == NULL)
1694
return -ENOMEM;
1695
1696
for (i = 0; i < fence_count; i++) {
1697
struct dma_fence *fence;
1698
1699
fence = amdgpu_cs_get_fence(adev, filp, &fences[i]);
1700
if (IS_ERR(fence)) {
1701
r = PTR_ERR(fence);
1702
goto err_free_fence_array;
1703
} else if (fence) {
1704
array[i] = fence;
1705
} else { /* NULL, the fence has been already signaled */
1706
r = 1;
1707
first = i;
1708
goto out;
1709
}
1710
}
1711
1712
r = dma_fence_wait_any_timeout(array, fence_count, true, timeout,
1713
&first);
1714
if (r < 0)
1715
goto err_free_fence_array;
1716
1717
out:
1718
memset(wait, 0, sizeof(*wait));
1719
wait->out.status = (r > 0);
1720
wait->out.first_signaled = first;
1721
1722
if (first < fence_count && array[first])
1723
r = array[first]->error;
1724
else
1725
r = 0;
1726
1727
err_free_fence_array:
1728
for (i = 0; i < fence_count; i++)
1729
dma_fence_put(array[i]);
1730
kfree(array);
1731
1732
return r;
1733
}
1734
1735
/**
1736
* amdgpu_cs_wait_fences_ioctl - wait for multiple command submissions to finish
1737
*
1738
* @dev: drm device
1739
* @data: data from userspace
1740
* @filp: file private
1741
*/
1742
int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data,
1743
struct drm_file *filp)
1744
{
1745
struct amdgpu_device *adev = drm_to_adev(dev);
1746
union drm_amdgpu_wait_fences *wait = data;
1747
struct drm_amdgpu_fence *fences;
1748
int r;
1749
1750
/* Get the fences from userspace */
1751
fences = memdup_array_user(u64_to_user_ptr(wait->in.fences),
1752
wait->in.fence_count,
1753
sizeof(struct drm_amdgpu_fence));
1754
if (IS_ERR(fences))
1755
return PTR_ERR(fences);
1756
1757
if (wait->in.wait_all)
1758
r = amdgpu_cs_wait_all_fences(adev, filp, wait, fences);
1759
else
1760
r = amdgpu_cs_wait_any_fence(adev, filp, wait, fences);
1761
1762
kfree(fences);
1763
1764
return r;
1765
}
1766
1767
/**
1768
* amdgpu_cs_find_mapping - find bo_va for VM address
1769
*
1770
* @parser: command submission parser context
1771
* @addr: VM address
1772
* @bo: resulting BO of the mapping found
1773
* @map: Placeholder to return found BO mapping
1774
*
1775
* Search the buffer objects in the command submission context for a certain
1776
* virtual memory address. Returns 0 when the mapping was found, a negative
1777
* error code otherwise.
1778
*/
1779
int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser,
1780
uint64_t addr, struct amdgpu_bo **bo,
1781
struct amdgpu_bo_va_mapping **map)
1782
{
1783
struct amdgpu_fpriv *fpriv = parser->filp->driver_priv;
1784
struct ttm_operation_ctx ctx = { false, false };
1785
struct amdgpu_vm *vm = &fpriv->vm;
1786
struct amdgpu_bo_va_mapping *mapping;
1787
int i, r;
1788
1789
addr /= AMDGPU_GPU_PAGE_SIZE;
1790
1791
mapping = amdgpu_vm_bo_lookup_mapping(vm, addr);
1792
if (!mapping || !mapping->bo_va || !mapping->bo_va->base.bo)
1793
return -EINVAL;
1794
1795
*bo = mapping->bo_va->base.bo;
1796
*map = mapping;
1797
1798
/* Double check that the BO is reserved by this CS */
1799
if (dma_resv_locking_ctx((*bo)->tbo.base.resv) != &parser->exec.ticket)
1800
return -EINVAL;
1801
1802
/* Make sure VRAM is allocated contiguously */
1803
(*bo)->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
1804
if ((*bo)->tbo.resource->mem_type == TTM_PL_VRAM &&
1805
!((*bo)->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
1806
1807
amdgpu_bo_placement_from_domain(*bo, (*bo)->allowed_domains);
1808
for (i = 0; i < (*bo)->placement.num_placement; i++)
1809
(*bo)->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
1810
r = ttm_bo_validate(&(*bo)->tbo, &(*bo)->placement, &ctx);
1811
if (r)
1812
return r;
1813
}
1814
1815
return amdgpu_ttm_alloc_gart(&(*bo)->tbo);
1816
}
1817
1818