Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/drivers/gpu/drm/amd/amdkfd/kfd_events.c
26516 views
1
// SPDX-License-Identifier: GPL-2.0 OR MIT
2
/*
3
* Copyright 2014-2022 Advanced Micro Devices, Inc.
4
*
5
* Permission is hereby granted, free of charge, to any person obtaining a
6
* copy of this software and associated documentation files (the "Software"),
7
* to deal in the Software without restriction, including without limitation
8
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
* and/or sell copies of the Software, and to permit persons to whom the
10
* Software is furnished to do so, subject to the following conditions:
11
*
12
* The above copyright notice and this permission notice shall be included in
13
* all copies or substantial portions of the Software.
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21
* OTHER DEALINGS IN THE SOFTWARE.
22
*/
23
24
#include <linux/mm_types.h>
25
#include <linux/slab.h>
26
#include <linux/types.h>
27
#include <linux/sched/signal.h>
28
#include <linux/sched/mm.h>
29
#include <linux/uaccess.h>
30
#include <linux/mman.h>
31
#include <linux/memory.h>
32
#include "kfd_priv.h"
33
#include "kfd_events.h"
34
#include "kfd_device_queue_manager.h"
35
#include <linux/device.h>
36
37
/*
38
* Wrapper around wait_queue_entry_t
39
*/
40
struct kfd_event_waiter {
41
wait_queue_entry_t wait;
42
struct kfd_event *event; /* Event to wait for */
43
bool activated; /* Becomes true when event is signaled */
44
bool event_age_enabled; /* set to true when last_event_age is non-zero */
45
};
46
47
/*
48
* Each signal event needs a 64-bit signal slot where the signaler will write
49
* a 1 before sending an interrupt. (This is needed because some interrupts
50
* do not contain enough spare data bits to identify an event.)
51
* We get whole pages and map them to the process VA.
52
* Individual signal events use their event_id as slot index.
53
*/
54
struct kfd_signal_page {
55
uint64_t *kernel_address;
56
uint64_t __user *user_address;
57
bool need_to_free_pages;
58
};
59
60
static uint64_t *page_slots(struct kfd_signal_page *page)
61
{
62
return page->kernel_address;
63
}
64
65
static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p)
66
{
67
void *backing_store;
68
struct kfd_signal_page *page;
69
70
page = kzalloc(sizeof(*page), GFP_KERNEL);
71
if (!page)
72
return NULL;
73
74
backing_store = (void *) __get_free_pages(GFP_KERNEL,
75
get_order(KFD_SIGNAL_EVENT_LIMIT * 8));
76
if (!backing_store)
77
goto fail_alloc_signal_store;
78
79
/* Initialize all events to unsignaled */
80
memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT,
81
KFD_SIGNAL_EVENT_LIMIT * 8);
82
83
page->kernel_address = backing_store;
84
page->need_to_free_pages = true;
85
pr_debug("Allocated new event signal page at %p, for process %p\n",
86
page, p);
87
88
return page;
89
90
fail_alloc_signal_store:
91
kfree(page);
92
return NULL;
93
}
94
95
static int allocate_event_notification_slot(struct kfd_process *p,
96
struct kfd_event *ev,
97
const int *restore_id)
98
{
99
int id;
100
101
if (!p->signal_page) {
102
p->signal_page = allocate_signal_page(p);
103
if (!p->signal_page)
104
return -ENOMEM;
105
/* Oldest user mode expects 256 event slots */
106
p->signal_mapped_size = 256*8;
107
}
108
109
if (restore_id) {
110
id = idr_alloc(&p->event_idr, ev, *restore_id, *restore_id + 1,
111
GFP_KERNEL);
112
} else {
113
/*
114
* Compatibility with old user mode: Only use signal slots
115
* user mode has mapped, may be less than
116
* KFD_SIGNAL_EVENT_LIMIT. This also allows future increase
117
* of the event limit without breaking user mode.
118
*/
119
id = idr_alloc(&p->event_idr, ev, 0, p->signal_mapped_size / 8,
120
GFP_KERNEL);
121
}
122
if (id < 0)
123
return id;
124
125
ev->event_id = id;
126
page_slots(p->signal_page)[id] = UNSIGNALED_EVENT_SLOT;
127
128
return 0;
129
}
130
131
/*
132
* Assumes that p->event_mutex or rcu_readlock is held and of course that p is
133
* not going away.
134
*/
135
static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id)
136
{
137
return idr_find(&p->event_idr, id);
138
}
139
140
/**
141
* lookup_signaled_event_by_partial_id - Lookup signaled event from partial ID
142
* @p: Pointer to struct kfd_process
143
* @id: ID to look up
144
* @bits: Number of valid bits in @id
145
*
146
* Finds the first signaled event with a matching partial ID. If no
147
* matching signaled event is found, returns NULL. In that case the
148
* caller should assume that the partial ID is invalid and do an
149
* exhaustive search of all siglaned events.
150
*
151
* If multiple events with the same partial ID signal at the same
152
* time, they will be found one interrupt at a time, not necessarily
153
* in the same order the interrupts occurred. As long as the number of
154
* interrupts is correct, all signaled events will be seen by the
155
* driver.
156
*/
157
static struct kfd_event *lookup_signaled_event_by_partial_id(
158
struct kfd_process *p, uint32_t id, uint32_t bits)
159
{
160
struct kfd_event *ev;
161
162
if (!p->signal_page || id >= KFD_SIGNAL_EVENT_LIMIT)
163
return NULL;
164
165
/* Fast path for the common case that @id is not a partial ID
166
* and we only need a single lookup.
167
*/
168
if (bits > 31 || (1U << bits) >= KFD_SIGNAL_EVENT_LIMIT) {
169
if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT)
170
return NULL;
171
172
return idr_find(&p->event_idr, id);
173
}
174
175
/* General case for partial IDs: Iterate over all matching IDs
176
* and find the first one that has signaled.
177
*/
178
for (ev = NULL; id < KFD_SIGNAL_EVENT_LIMIT && !ev; id += 1U << bits) {
179
if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT)
180
continue;
181
182
ev = idr_find(&p->event_idr, id);
183
}
184
185
return ev;
186
}
187
188
static int create_signal_event(struct file *devkfd, struct kfd_process *p,
189
struct kfd_event *ev, const int *restore_id)
190
{
191
int ret;
192
193
if (p->signal_mapped_size &&
194
p->signal_event_count == p->signal_mapped_size / 8) {
195
if (!p->signal_event_limit_reached) {
196
pr_debug("Signal event wasn't created because limit was reached\n");
197
p->signal_event_limit_reached = true;
198
}
199
return -ENOSPC;
200
}
201
202
ret = allocate_event_notification_slot(p, ev, restore_id);
203
if (ret) {
204
pr_warn("Signal event wasn't created because out of kernel memory\n");
205
return ret;
206
}
207
208
p->signal_event_count++;
209
210
ev->user_signal_address = &p->signal_page->user_address[ev->event_id];
211
pr_debug("Signal event number %zu created with id %d, address %p\n",
212
p->signal_event_count, ev->event_id,
213
ev->user_signal_address);
214
215
return 0;
216
}
217
218
static int create_other_event(struct kfd_process *p, struct kfd_event *ev, const int *restore_id)
219
{
220
int id;
221
222
if (restore_id)
223
id = idr_alloc(&p->event_idr, ev, *restore_id, *restore_id + 1,
224
GFP_KERNEL);
225
else
226
/* Cast KFD_LAST_NONSIGNAL_EVENT to uint32_t. This allows an
227
* intentional integer overflow to -1 without a compiler
228
* warning. idr_alloc treats a negative value as "maximum
229
* signed integer".
230
*/
231
id = idr_alloc(&p->event_idr, ev, KFD_FIRST_NONSIGNAL_EVENT_ID,
232
(uint32_t)KFD_LAST_NONSIGNAL_EVENT_ID + 1,
233
GFP_KERNEL);
234
235
if (id < 0)
236
return id;
237
ev->event_id = id;
238
239
return 0;
240
}
241
242
int kfd_event_init_process(struct kfd_process *p)
243
{
244
int id;
245
246
mutex_init(&p->event_mutex);
247
idr_init(&p->event_idr);
248
p->signal_page = NULL;
249
p->signal_event_count = 1;
250
/* Allocate event ID 0. It is used for a fast path to ignore bogus events
251
* that are sent by the CP without a context ID
252
*/
253
id = idr_alloc(&p->event_idr, NULL, 0, 1, GFP_KERNEL);
254
if (id < 0) {
255
idr_destroy(&p->event_idr);
256
mutex_destroy(&p->event_mutex);
257
return id;
258
}
259
return 0;
260
}
261
262
static void destroy_event(struct kfd_process *p, struct kfd_event *ev)
263
{
264
struct kfd_event_waiter *waiter;
265
266
/* Wake up pending waiters. They will return failure */
267
spin_lock(&ev->lock);
268
list_for_each_entry(waiter, &ev->wq.head, wait.entry)
269
WRITE_ONCE(waiter->event, NULL);
270
wake_up_all(&ev->wq);
271
spin_unlock(&ev->lock);
272
273
if (ev->type == KFD_EVENT_TYPE_SIGNAL ||
274
ev->type == KFD_EVENT_TYPE_DEBUG)
275
p->signal_event_count--;
276
277
idr_remove(&p->event_idr, ev->event_id);
278
kfree_rcu(ev, rcu);
279
}
280
281
static void destroy_events(struct kfd_process *p)
282
{
283
struct kfd_event *ev;
284
uint32_t id;
285
286
idr_for_each_entry(&p->event_idr, ev, id)
287
if (ev)
288
destroy_event(p, ev);
289
idr_destroy(&p->event_idr);
290
mutex_destroy(&p->event_mutex);
291
}
292
293
/*
294
* We assume that the process is being destroyed and there is no need to
295
* unmap the pages or keep bookkeeping data in order.
296
*/
297
static void shutdown_signal_page(struct kfd_process *p)
298
{
299
struct kfd_signal_page *page = p->signal_page;
300
301
if (page) {
302
if (page->need_to_free_pages)
303
free_pages((unsigned long)page->kernel_address,
304
get_order(KFD_SIGNAL_EVENT_LIMIT * 8));
305
kfree(page);
306
}
307
}
308
309
/* Free all event state of @p: first the events, then the signal page. */
void kfd_event_free_process(struct kfd_process *p)
{
	/* Events go first; the signal page is released afterwards. */
	destroy_events(p);
	shutdown_signal_page(p);
}
314
315
static bool event_can_be_gpu_signaled(const struct kfd_event *ev)
316
{
317
return ev->type == KFD_EVENT_TYPE_SIGNAL ||
318
ev->type == KFD_EVENT_TYPE_DEBUG;
319
}
320
321
static bool event_can_be_cpu_signaled(const struct kfd_event *ev)
322
{
323
return ev->type == KFD_EVENT_TYPE_SIGNAL;
324
}
325
326
/*
 * Install caller-provided memory as the signal page of @p.
 *
 * @kernel_address: kernel mapping of the slot memory; its first
 *                  KFD_SIGNAL_EVENT_LIMIT * 8 bytes are reset to unsignaled
 * @size:           recorded as p->signal_mapped_size
 * @user_handle:    recorded as p->signal_handle
 *
 * need_to_free_pages is left false by kzalloc(), so shutdown_signal_page()
 * will not call free_pages() on this memory.
 *
 * Returns 0, -EBUSY if a signal page is already set, or -ENOMEM.
 */
static int kfd_event_page_set(struct kfd_process *p, void *kernel_address,
			      uint64_t size, uint64_t user_handle)
{
	struct kfd_signal_page *new_page;

	if (p->signal_page)
		return -EBUSY;

	new_page = kzalloc(sizeof(*new_page), GFP_KERNEL);
	if (!new_page)
		return -ENOMEM;

	/* Every slot starts out unsignaled */
	memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT,
	       KFD_SIGNAL_EVENT_LIMIT * 8);
	new_page->kernel_address = kernel_address;

	p->signal_handle = user_handle;
	p->signal_mapped_size = size;
	p->signal_page = new_page;

	return 0;
}
349
350
/*
 * Map the buffer object identified by @event_page_offset into the kernel
 * and install it as the signal page of @p.
 *
 * The GPU ID and the BO handle are both decoded from @event_page_offset
 * (GET_GPU_ID / GET_IDR_HANDLE).
 *
 * Returns 0 on success, -EINVAL if a signal page already exists or the
 * device/BO cannot be found, or the error of the bind, map or set step.
 */
int kfd_kmap_event_page(struct kfd_process *p, uint64_t event_page_offset)
{
	struct kfd_process_device *pdd;
	struct kfd_node *node;
	void *bo, *kaddr;
	uint64_t bo_size;
	int err = 0;

	if (p->signal_page) {
		pr_err("Event page is already set\n");
		return -EINVAL;
	}

	pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(event_page_offset));
	if (!pdd) {
		pr_err("Getting device by id failed in %s\n", __func__);
		return -EINVAL;
	}
	node = pdd->dev;

	pdd = kfd_bind_process_to_device(node, p);
	if (IS_ERR(pdd))
		return PTR_ERR(pdd);

	bo = kfd_process_device_translate_handle(pdd,
			GET_IDR_HANDLE(event_page_offset));
	if (!bo) {
		pr_err("Can't find BO, offset is 0x%llx\n", event_page_offset);
		return -EINVAL;
	}

	err = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(bo, &kaddr, &bo_size);
	if (err) {
		pr_err("Failed to map event page to kernel\n");
		return err;
	}

	err = kfd_event_page_set(p, kaddr, bo_size, event_page_offset);
	if (err) {
		pr_err("Failed to set event page\n");
		/* Undo the kernel mapping created above */
		amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(bo);
	}

	return err;
}
395
396
int kfd_event_create(struct file *devkfd, struct kfd_process *p,
397
uint32_t event_type, bool auto_reset, uint32_t node_id,
398
uint32_t *event_id, uint32_t *event_trigger_data,
399
uint64_t *event_page_offset, uint32_t *event_slot_index)
400
{
401
int ret = 0;
402
struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL);
403
404
if (!ev)
405
return -ENOMEM;
406
407
ev->type = event_type;
408
ev->auto_reset = auto_reset;
409
ev->signaled = false;
410
411
spin_lock_init(&ev->lock);
412
init_waitqueue_head(&ev->wq);
413
414
*event_page_offset = 0;
415
416
mutex_lock(&p->event_mutex);
417
418
switch (event_type) {
419
case KFD_EVENT_TYPE_SIGNAL:
420
case KFD_EVENT_TYPE_DEBUG:
421
ret = create_signal_event(devkfd, p, ev, NULL);
422
if (!ret) {
423
*event_page_offset = KFD_MMAP_TYPE_EVENTS;
424
*event_slot_index = ev->event_id;
425
}
426
break;
427
default:
428
ret = create_other_event(p, ev, NULL);
429
break;
430
}
431
432
if (!ret) {
433
*event_id = ev->event_id;
434
*event_trigger_data = ev->event_id;
435
ev->event_age = 1;
436
} else {
437
kfree(ev);
438
}
439
440
mutex_unlock(&p->event_mutex);
441
442
return ret;
443
}
444
445
int kfd_criu_restore_event(struct file *devkfd,
446
struct kfd_process *p,
447
uint8_t __user *user_priv_ptr,
448
uint64_t *priv_data_offset,
449
uint64_t max_priv_data_size)
450
{
451
struct kfd_criu_event_priv_data *ev_priv;
452
struct kfd_event *ev = NULL;
453
int ret = 0;
454
455
ev_priv = kmalloc(sizeof(*ev_priv), GFP_KERNEL);
456
if (!ev_priv)
457
return -ENOMEM;
458
459
ev = kzalloc(sizeof(*ev), GFP_KERNEL);
460
if (!ev) {
461
ret = -ENOMEM;
462
goto exit;
463
}
464
465
if (*priv_data_offset + sizeof(*ev_priv) > max_priv_data_size) {
466
ret = -EINVAL;
467
goto exit;
468
}
469
470
ret = copy_from_user(ev_priv, user_priv_ptr + *priv_data_offset, sizeof(*ev_priv));
471
if (ret) {
472
ret = -EFAULT;
473
goto exit;
474
}
475
*priv_data_offset += sizeof(*ev_priv);
476
477
if (ev_priv->user_handle) {
478
ret = kfd_kmap_event_page(p, ev_priv->user_handle);
479
if (ret)
480
goto exit;
481
}
482
483
ev->type = ev_priv->type;
484
ev->auto_reset = ev_priv->auto_reset;
485
ev->signaled = ev_priv->signaled;
486
487
spin_lock_init(&ev->lock);
488
init_waitqueue_head(&ev->wq);
489
490
mutex_lock(&p->event_mutex);
491
switch (ev->type) {
492
case KFD_EVENT_TYPE_SIGNAL:
493
case KFD_EVENT_TYPE_DEBUG:
494
ret = create_signal_event(devkfd, p, ev, &ev_priv->event_id);
495
break;
496
case KFD_EVENT_TYPE_MEMORY:
497
memcpy(&ev->memory_exception_data,
498
&ev_priv->memory_exception_data,
499
sizeof(struct kfd_hsa_memory_exception_data));
500
501
ret = create_other_event(p, ev, &ev_priv->event_id);
502
break;
503
case KFD_EVENT_TYPE_HW_EXCEPTION:
504
memcpy(&ev->hw_exception_data,
505
&ev_priv->hw_exception_data,
506
sizeof(struct kfd_hsa_hw_exception_data));
507
508
ret = create_other_event(p, ev, &ev_priv->event_id);
509
break;
510
}
511
mutex_unlock(&p->event_mutex);
512
513
exit:
514
if (ret)
515
kfree(ev);
516
517
kfree(ev_priv);
518
519
return ret;
520
}
521
522
int kfd_criu_checkpoint_events(struct kfd_process *p,
523
uint8_t __user *user_priv_data,
524
uint64_t *priv_data_offset)
525
{
526
struct kfd_criu_event_priv_data *ev_privs;
527
int i = 0;
528
int ret = 0;
529
struct kfd_event *ev;
530
uint32_t ev_id;
531
532
uint32_t num_events = kfd_get_num_events(p);
533
534
if (!num_events)
535
return 0;
536
537
ev_privs = kvzalloc(num_events * sizeof(*ev_privs), GFP_KERNEL);
538
if (!ev_privs)
539
return -ENOMEM;
540
541
542
idr_for_each_entry(&p->event_idr, ev, ev_id) {
543
struct kfd_criu_event_priv_data *ev_priv;
544
545
/*
546
* Currently, all events have same size of private_data, but the current ioctl's
547
* and CRIU plugin supports private_data of variable sizes
548
*/
549
ev_priv = &ev_privs[i];
550
551
ev_priv->object_type = KFD_CRIU_OBJECT_TYPE_EVENT;
552
553
/* We store the user_handle with the first event */
554
if (i == 0 && p->signal_page)
555
ev_priv->user_handle = p->signal_handle;
556
557
ev_priv->event_id = ev->event_id;
558
ev_priv->auto_reset = ev->auto_reset;
559
ev_priv->type = ev->type;
560
ev_priv->signaled = ev->signaled;
561
562
if (ev_priv->type == KFD_EVENT_TYPE_MEMORY)
563
memcpy(&ev_priv->memory_exception_data,
564
&ev->memory_exception_data,
565
sizeof(struct kfd_hsa_memory_exception_data));
566
else if (ev_priv->type == KFD_EVENT_TYPE_HW_EXCEPTION)
567
memcpy(&ev_priv->hw_exception_data,
568
&ev->hw_exception_data,
569
sizeof(struct kfd_hsa_hw_exception_data));
570
571
pr_debug("Checkpointed event[%d] id = 0x%08x auto_reset = %x type = %x signaled = %x\n",
572
i,
573
ev_priv->event_id,
574
ev_priv->auto_reset,
575
ev_priv->type,
576
ev_priv->signaled);
577
i++;
578
}
579
580
ret = copy_to_user(user_priv_data + *priv_data_offset,
581
ev_privs, num_events * sizeof(*ev_privs));
582
if (ret) {
583
pr_err("Failed to copy events priv to user\n");
584
ret = -EFAULT;
585
}
586
587
*priv_data_offset += num_events * sizeof(*ev_privs);
588
589
kvfree(ev_privs);
590
return ret;
591
}
592
593
int kfd_get_num_events(struct kfd_process *p)
594
{
595
struct kfd_event *ev;
596
uint32_t id;
597
u32 num_events = 0;
598
599
idr_for_each_entry(&p->event_idr, ev, id)
600
num_events++;
601
602
return num_events;
603
}
604
605
/* Assumes that p is current. */
606
int kfd_event_destroy(struct kfd_process *p, uint32_t event_id)
607
{
608
struct kfd_event *ev;
609
int ret = 0;
610
611
mutex_lock(&p->event_mutex);
612
613
ev = lookup_event_by_id(p, event_id);
614
615
if (ev)
616
destroy_event(p, ev);
617
else
618
ret = -EINVAL;
619
620
mutex_unlock(&p->event_mutex);
621
return ret;
622
}
623
624
static void set_event(struct kfd_event *ev)
625
{
626
struct kfd_event_waiter *waiter;
627
628
/* Auto reset if the list is non-empty and we're waking
629
* someone. waitqueue_active is safe here because we're
630
* protected by the ev->lock, which is also held when
631
* updating the wait queues in kfd_wait_on_events.
632
*/
633
ev->signaled = !ev->auto_reset || !waitqueue_active(&ev->wq);
634
if (!(++ev->event_age)) {
635
/* Never wrap back to reserved/default event age 0/1 */
636
ev->event_age = 2;
637
WARN_ONCE(1, "event_age wrap back!");
638
}
639
640
list_for_each_entry(waiter, &ev->wq.head, wait.entry)
641
WRITE_ONCE(waiter->activated, true);
642
643
wake_up_all(&ev->wq);
644
}
645
646
/* Assumes that p is current. */
647
int kfd_set_event(struct kfd_process *p, uint32_t event_id)
648
{
649
int ret = 0;
650
struct kfd_event *ev;
651
652
rcu_read_lock();
653
654
ev = lookup_event_by_id(p, event_id);
655
if (!ev) {
656
ret = -EINVAL;
657
goto unlock_rcu;
658
}
659
spin_lock(&ev->lock);
660
661
if (event_can_be_cpu_signaled(ev))
662
set_event(ev);
663
else
664
ret = -EINVAL;
665
666
spin_unlock(&ev->lock);
667
unlock_rcu:
668
rcu_read_unlock();
669
return ret;
670
}
671
672
static void reset_event(struct kfd_event *ev)
673
{
674
ev->signaled = false;
675
}
676
677
/* Assumes that p is current. */
678
int kfd_reset_event(struct kfd_process *p, uint32_t event_id)
679
{
680
int ret = 0;
681
struct kfd_event *ev;
682
683
rcu_read_lock();
684
685
ev = lookup_event_by_id(p, event_id);
686
if (!ev) {
687
ret = -EINVAL;
688
goto unlock_rcu;
689
}
690
spin_lock(&ev->lock);
691
692
if (event_can_be_cpu_signaled(ev))
693
reset_event(ev);
694
else
695
ret = -EINVAL;
696
697
spin_unlock(&ev->lock);
698
unlock_rcu:
699
rcu_read_unlock();
700
return ret;
701
702
}
703
704
static void acknowledge_signal(struct kfd_process *p, struct kfd_event *ev)
705
{
706
WRITE_ONCE(page_slots(p->signal_page)[ev->event_id], UNSIGNALED_EVENT_SLOT);
707
}
708
709
static void set_event_from_interrupt(struct kfd_process *p,
710
struct kfd_event *ev)
711
{
712
if (ev && event_can_be_gpu_signaled(ev)) {
713
acknowledge_signal(p, ev);
714
spin_lock(&ev->lock);
715
set_event(ev);
716
spin_unlock(&ev->lock);
717
}
718
}
719
720
/*
 * Handle a signal-event interrupt for the process identified by @pasid.
 *
 * @partial_id:    event ID (or its low bits) taken from the interrupt payload
 * @valid_id_bits: number of valid bits in @partial_id; 0 means "no ID"
 *
 * First tries a direct lookup through the partial ID. When that yields
 * nothing and the process has a signal page, all slots are searched for
 * signaled events instead.
 */
void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
				uint32_t valid_id_bits)
{
	struct kfd_event *ev = NULL;
	struct kfd_process *p;
	uint64_t *slots;
	uint32_t id;
	bool full_id;

	/*
	 * We run in arbitrary (workqueue) context rather than process
	 * context, so the kfd_process could try to exit while we are
	 * running. The lookup therefore takes a reference on the process.
	 */
	p = kfd_lookup_process_by_pasid(pasid, NULL);
	if (!p)
		return; /* Presumably process exited. */

	rcu_read_lock();

	if (valid_id_bits)
		ev = lookup_signaled_event_by_partial_id(p, partial_id,
							 valid_id_bits);
	if (ev) {
		set_event_from_interrupt(p, ev);
		goto out_unlock;
	}

	if (!p->signal_page)
		goto out_unlock;

	/*
	 * Partial ID lookup failed. Assume that the event ID in the
	 * interrupt payload was invalid and do an exhaustive search of
	 * signaled events.
	 */
	slots = page_slots(p->signal_page);

	/*
	 * A valid full ID whose slot is not signaled: the GPU may have
	 * signaled the same event twice before the driver got to process the
	 * first interrupt, and the slot was reset when set_event woke up
	 * user space. Drop the second interrupt, since the application only
	 * needs to be woken once.
	 */
	full_id = valid_id_bits > 31 ||
		  (1U << valid_id_bits) >= KFD_SIGNAL_EVENT_LIMIT;
	if (full_id && partial_id < KFD_SIGNAL_EVENT_LIMIT &&
	    slots[partial_id] == UNSIGNALED_EVENT_SLOT)
		goto out_unlock;

	if (valid_id_bits)
		pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n",
				     partial_id, valid_id_bits);

	if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT / 64) {
		/* With relatively few events, it's faster to
		 * iterate over the event IDR
		 */
		idr_for_each_entry(&p->event_idr, ev, id) {
			if (id >= KFD_SIGNAL_EVENT_LIMIT)
				break;

			if (READ_ONCE(slots[id]) != UNSIGNALED_EVENT_SLOT)
				set_event_from_interrupt(p, ev);
		}
	} else {
		/* With relatively many events, it's faster to
		 * iterate over the signal slots and lookup
		 * only signaled events from the IDR.
		 */
		for (id = 1; id < KFD_SIGNAL_EVENT_LIMIT; id++) {
			if (READ_ONCE(slots[id]) == UNSIGNALED_EVENT_SLOT)
				continue;

			ev = lookup_event_by_id(p, id);
			set_event_from_interrupt(p, ev);
		}
	}

out_unlock:
	rcu_read_unlock();
	kfd_unref_process(p);
}
793
794
/*
 * Allocate a zeroed array of @num_events waiters and initialise the wait
 * queue entry of each. Returns NULL on allocation failure.
 */
static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events)
{
	struct kfd_event_waiter *waiters;
	uint32_t idx = 0;

	waiters = kcalloc(num_events, sizeof(struct kfd_event_waiter),
			  GFP_KERNEL);
	if (!waiters)
		return NULL;

	while (idx < num_events) {
		init_wait(&waiters[idx].wait);
		idx++;
	}

	return waiters;
}
809
810
static int init_event_waiter(struct kfd_process *p,
811
struct kfd_event_waiter *waiter,
812
struct kfd_event_data *event_data)
813
{
814
struct kfd_event *ev = lookup_event_by_id(p, event_data->event_id);
815
816
if (!ev)
817
return -EINVAL;
818
819
spin_lock(&ev->lock);
820
waiter->event = ev;
821
waiter->activated = ev->signaled;
822
ev->signaled = ev->signaled && !ev->auto_reset;
823
824
/* last_event_age = 0 reserved for backward compatible */
825
if (waiter->event->type == KFD_EVENT_TYPE_SIGNAL &&
826
event_data->signal_event_data.last_event_age) {
827
waiter->event_age_enabled = true;
828
if (ev->event_age != event_data->signal_event_data.last_event_age)
829
waiter->activated = true;
830
}
831
832
if (!waiter->activated)
833
add_wait_queue(&ev->wq, &waiter->wait);
834
spin_unlock(&ev->lock);
835
836
return 0;
837
}
838
839
/* test_event_condition - Test condition of events being waited for
840
* @all: Return completion only if all events have signaled
841
* @num_events: Number of events to wait for
842
* @event_waiters: Array of event waiters, one per event
843
*
844
* Returns KFD_IOC_WAIT_RESULT_COMPLETE if all (or one) event(s) have
845
* signaled. Returns KFD_IOC_WAIT_RESULT_TIMEOUT if no (or not all)
846
* events have signaled. Returns KFD_IOC_WAIT_RESULT_FAIL if any of
847
* the events have been destroyed.
848
*/
849
static uint32_t test_event_condition(bool all, uint32_t num_events,
850
struct kfd_event_waiter *event_waiters)
851
{
852
uint32_t i;
853
uint32_t activated_count = 0;
854
855
for (i = 0; i < num_events; i++) {
856
if (!READ_ONCE(event_waiters[i].event))
857
return KFD_IOC_WAIT_RESULT_FAIL;
858
859
if (READ_ONCE(event_waiters[i].activated)) {
860
if (!all)
861
return KFD_IOC_WAIT_RESULT_COMPLETE;
862
863
activated_count++;
864
}
865
}
866
867
return activated_count == num_events ?
868
KFD_IOC_WAIT_RESULT_COMPLETE : KFD_IOC_WAIT_RESULT_TIMEOUT;
869
}
870
871
/*
872
* Copy event specific data, if defined.
873
* Currently only memory exception events have additional data to copy to user
874
*/
875
static int copy_signaled_event_data(uint32_t num_events,
876
struct kfd_event_waiter *event_waiters,
877
struct kfd_event_data __user *data)
878
{
879
void *src;
880
void __user *dst;
881
struct kfd_event_waiter *waiter;
882
struct kfd_event *event;
883
uint32_t i, size = 0;
884
885
for (i = 0; i < num_events; i++) {
886
waiter = &event_waiters[i];
887
event = waiter->event;
888
if (!event)
889
return -EINVAL; /* event was destroyed */
890
if (waiter->activated) {
891
if (event->type == KFD_EVENT_TYPE_MEMORY) {
892
dst = &data[i].memory_exception_data;
893
src = &event->memory_exception_data;
894
size = sizeof(struct kfd_hsa_memory_exception_data);
895
} else if (event->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
896
dst = &data[i].memory_exception_data;
897
src = &event->hw_exception_data;
898
size = sizeof(struct kfd_hsa_hw_exception_data);
899
} else if (event->type == KFD_EVENT_TYPE_SIGNAL &&
900
waiter->event_age_enabled) {
901
dst = &data[i].signal_event_data.last_event_age;
902
src = &event->event_age;
903
size = sizeof(u64);
904
}
905
if (size && copy_to_user(dst, src, size))
906
return -EFAULT;
907
}
908
}
909
910
return 0;
911
}
912
913
/*
 * Convert a user-supplied timeout in milliseconds to jiffies.
 * KFD_EVENT_TIMEOUT_IMMEDIATE maps to 0 and KFD_EVENT_TIMEOUT_INFINITE to
 * MAX_SCHEDULE_TIMEOUT; any other value is clamped to 0x7FFFFFFF ms,
 * converted, and incremented by one jiffy.
 */
static long user_timeout_to_jiffies(uint32_t user_timeout_ms)
{
	uint32_t clamped_ms = user_timeout_ms;

	if (user_timeout_ms == KFD_EVENT_TIMEOUT_IMMEDIATE)
		return 0;

	if (user_timeout_ms == KFD_EVENT_TIMEOUT_INFINITE)
		return MAX_SCHEDULE_TIMEOUT;

	/*
	 * msecs_to_jiffies interprets all values above 2^31-1 as infinite,
	 * but we consider them finite.
	 * This hack is wrong, but nobody is likely to notice.
	 */
	if (clamped_ms > 0x7FFFFFFF)
		clamped_ms = 0x7FFFFFFF;

	return msecs_to_jiffies(clamped_ms) + 1;
}
930
931
static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters,
932
bool undo_auto_reset)
933
{
934
uint32_t i;
935
936
for (i = 0; i < num_events; i++)
937
if (waiters[i].event) {
938
spin_lock(&waiters[i].event->lock);
939
remove_wait_queue(&waiters[i].event->wq,
940
&waiters[i].wait);
941
if (undo_auto_reset && waiters[i].activated &&
942
waiters[i].event && waiters[i].event->auto_reset)
943
set_event(waiters[i].event);
944
spin_unlock(&waiters[i].event->lock);
945
}
946
947
kfree(waiters);
948
}
949
950
int kfd_wait_on_events(struct kfd_process *p,
951
uint32_t num_events, void __user *data,
952
bool all, uint32_t *user_timeout_ms,
953
uint32_t *wait_result)
954
{
955
struct kfd_event_data __user *events =
956
(struct kfd_event_data __user *) data;
957
uint32_t i;
958
int ret = 0;
959
960
struct kfd_event_waiter *event_waiters = NULL;
961
long timeout = user_timeout_to_jiffies(*user_timeout_ms);
962
963
event_waiters = alloc_event_waiters(num_events);
964
if (!event_waiters) {
965
ret = -ENOMEM;
966
goto out;
967
}
968
969
/* Use p->event_mutex here to protect against concurrent creation and
970
* destruction of events while we initialize event_waiters.
971
*/
972
mutex_lock(&p->event_mutex);
973
974
for (i = 0; i < num_events; i++) {
975
struct kfd_event_data event_data;
976
977
if (copy_from_user(&event_data, &events[i],
978
sizeof(struct kfd_event_data))) {
979
ret = -EFAULT;
980
goto out_unlock;
981
}
982
983
ret = init_event_waiter(p, &event_waiters[i], &event_data);
984
if (ret)
985
goto out_unlock;
986
}
987
988
/* Check condition once. */
989
*wait_result = test_event_condition(all, num_events, event_waiters);
990
if (*wait_result == KFD_IOC_WAIT_RESULT_COMPLETE) {
991
ret = copy_signaled_event_data(num_events,
992
event_waiters, events);
993
goto out_unlock;
994
} else if (WARN_ON(*wait_result == KFD_IOC_WAIT_RESULT_FAIL)) {
995
/* This should not happen. Events shouldn't be
996
* destroyed while we're holding the event_mutex
997
*/
998
goto out_unlock;
999
}
1000
1001
mutex_unlock(&p->event_mutex);
1002
1003
while (true) {
1004
if (fatal_signal_pending(current)) {
1005
ret = -EINTR;
1006
break;
1007
}
1008
1009
if (signal_pending(current)) {
1010
ret = -ERESTARTSYS;
1011
if (*user_timeout_ms != KFD_EVENT_TIMEOUT_IMMEDIATE &&
1012
*user_timeout_ms != KFD_EVENT_TIMEOUT_INFINITE)
1013
*user_timeout_ms = jiffies_to_msecs(
1014
max(0l, timeout-1));
1015
break;
1016
}
1017
1018
/* Set task state to interruptible sleep before
1019
* checking wake-up conditions. A concurrent wake-up
1020
* will put the task back into runnable state. In that
1021
* case schedule_timeout will not put the task to
1022
* sleep and we'll get a chance to re-check the
1023
* updated conditions almost immediately. Otherwise,
1024
* this race condition would lead to a soft hang or a
1025
* very long sleep.
1026
*/
1027
set_current_state(TASK_INTERRUPTIBLE);
1028
1029
*wait_result = test_event_condition(all, num_events,
1030
event_waiters);
1031
if (*wait_result != KFD_IOC_WAIT_RESULT_TIMEOUT)
1032
break;
1033
1034
if (timeout <= 0)
1035
break;
1036
1037
timeout = schedule_timeout(timeout);
1038
}
1039
__set_current_state(TASK_RUNNING);
1040
1041
mutex_lock(&p->event_mutex);
1042
/* copy_signaled_event_data may sleep. So this has to happen
1043
* after the task state is set back to RUNNING.
1044
*
1045
* The event may also have been destroyed after signaling. So
1046
* copy_signaled_event_data also must confirm that the event
1047
* still exists. Therefore this must be under the p->event_mutex
1048
* which is also held when events are destroyed.
1049
*/
1050
if (!ret && *wait_result == KFD_IOC_WAIT_RESULT_COMPLETE)
1051
ret = copy_signaled_event_data(num_events,
1052
event_waiters, events);
1053
1054
out_unlock:
1055
free_waiters(num_events, event_waiters, ret == -ERESTARTSYS);
1056
mutex_unlock(&p->event_mutex);
1057
out:
1058
if (ret)
1059
*wait_result = KFD_IOC_WAIT_RESULT_FAIL;
1060
else if (*wait_result == KFD_IOC_WAIT_RESULT_FAIL)
1061
ret = -EIO;
1062
1063
return ret;
1064
}
1065
1066
int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma)
1067
{
1068
unsigned long pfn;
1069
struct kfd_signal_page *page;
1070
int ret;
1071
1072
/* check required size doesn't exceed the allocated size */
1073
if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) <
1074
get_order(vma->vm_end - vma->vm_start)) {
1075
pr_err("Event page mmap requested illegal size\n");
1076
return -EINVAL;
1077
}
1078
1079
page = p->signal_page;
1080
if (!page) {
1081
/* Probably KFD bug, but mmap is user-accessible. */
1082
pr_debug("Signal page could not be found\n");
1083
return -EINVAL;
1084
}
1085
1086
pfn = __pa(page->kernel_address);
1087
pfn >>= PAGE_SHIFT;
1088
1089
vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE
1090
| VM_DONTDUMP | VM_PFNMAP);
1091
1092
pr_debug("Mapping signal page\n");
1093
pr_debug(" start user address == 0x%08lx\n", vma->vm_start);
1094
pr_debug(" end user address == 0x%08lx\n", vma->vm_end);
1095
pr_debug(" pfn == 0x%016lX\n", pfn);
1096
pr_debug(" vm_flags == 0x%08lX\n", vma->vm_flags);
1097
pr_debug(" size == 0x%08lX\n",
1098
vma->vm_end - vma->vm_start);
1099
1100
page->user_address = (uint64_t __user *)vma->vm_start;
1101
1102
/* mapping the page to user process */
1103
ret = remap_pfn_range(vma, vma->vm_start, pfn,
1104
vma->vm_end - vma->vm_start, vma->vm_page_prot);
1105
if (!ret)
1106
p->signal_mapped_size = vma->vm_end - vma->vm_start;
1107
1108
return ret;
1109
}
1110
1111
/*
1112
* Assumes that p is not going away.
1113
*/
1114
static void lookup_events_by_type_and_signal(struct kfd_process *p,
1115
int type, void *event_data)
1116
{
1117
struct kfd_hsa_memory_exception_data *ev_data;
1118
struct kfd_event *ev;
1119
uint32_t id;
1120
bool send_signal = true;
1121
1122
ev_data = (struct kfd_hsa_memory_exception_data *) event_data;
1123
1124
rcu_read_lock();
1125
1126
id = KFD_FIRST_NONSIGNAL_EVENT_ID;
1127
idr_for_each_entry_continue(&p->event_idr, ev, id)
1128
if (ev->type == type) {
1129
send_signal = false;
1130
dev_dbg(kfd_device,
1131
"Event found: id %X type %d",
1132
ev->event_id, ev->type);
1133
spin_lock(&ev->lock);
1134
set_event(ev);
1135
if (ev->type == KFD_EVENT_TYPE_MEMORY && ev_data)
1136
ev->memory_exception_data = *ev_data;
1137
spin_unlock(&ev->lock);
1138
}
1139
1140
if (type == KFD_EVENT_TYPE_MEMORY) {
1141
dev_warn(kfd_device,
1142
"Sending SIGSEGV to process pid %d",
1143
p->lead_thread->pid);
1144
send_sig(SIGSEGV, p->lead_thread, 0);
1145
}
1146
1147
/* Send SIGTERM no event of type "type" has been found*/
1148
if (send_signal) {
1149
if (send_sigterm) {
1150
dev_warn(kfd_device,
1151
"Sending SIGTERM to process pid %d",
1152
p->lead_thread->pid);
1153
send_sig(SIGTERM, p->lead_thread, 0);
1154
} else {
1155
dev_err(kfd_device,
1156
"Process pid %d got unhandled exception",
1157
p->lead_thread->pid);
1158
}
1159
}
1160
1161
rcu_read_unlock();
1162
}
1163
1164
/*
 * kfd_signal_hw_exception_event - signal HW-exception events for a PASID.
 * @pasid: PASID used to look up the owning kfd_process
 *
 * Does nothing when no process is found for @pasid.
 */
void kfd_signal_hw_exception_event(u32 pasid)
{
	struct kfd_process *proc;

	/*
	 * We are called from arbitrary context (workqueue) rather than
	 * process context, so the kfd_process could try to exit while we
	 * run. The lookup takes a process reference, which is dropped
	 * below once the events have been signaled.
	 */
	proc = kfd_lookup_process_by_pasid(pasid, NULL);
	if (proc) {
		lookup_events_by_type_and_signal(proc,
						 KFD_EVENT_TYPE_HW_EXCEPTION,
						 NULL);
		kfd_unref_process(proc);
	}
	/* No process found: presumably it has already exited. */
}
1179
1180
/*
 * kfd_signal_vm_fault_event_with_userptr - report a not-present fault at
 * a GPU virtual address on every device of a process.
 * @p:      process to notify
 * @gpu_va: faulting GPU virtual address, stored in the exception's va field
 *
 * For each process device the device is evicted via
 * kfd_evict_process_device() and a VM fault event is then signaled with
 * that device's user_gpu_id.
 */
void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p, uint64_t gpu_va)
{
	struct kfd_hsa_memory_exception_data fault;
	struct kfd_process_device *cur;
	int n;

	/* Zero the whole record first; only va/NotPresent/gpu_id are set. */
	memset(&fault, 0, sizeof(fault));
	fault.failure.NotPresent = 1;
	fault.va = gpu_va;

	/* Send the VM fault to every kfd process device. */
	for (n = 0; n < p->n_pdds; n++) {
		cur = p->pdds[n];
		fault.gpu_id = cur->user_gpu_id;
		kfd_evict_process_device(cur);
		kfd_signal_vm_fault_event(cur, NULL, &fault);
	}
}
1198
1199
void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
1200
struct kfd_vm_fault_info *info,
1201
struct kfd_hsa_memory_exception_data *data)
1202
{
1203
struct kfd_event *ev;
1204
uint32_t id;
1205
struct kfd_process *p = pdd->process;
1206
struct kfd_hsa_memory_exception_data memory_exception_data;
1207
int user_gpu_id;
1208
1209
user_gpu_id = kfd_process_get_user_gpu_id(p, pdd->dev->id);
1210
if (unlikely(user_gpu_id == -EINVAL)) {
1211
WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n",
1212
pdd->dev->id);
1213
return;
1214
}
1215
1216
/* SoC15 chips and onwards will pass in data from now on. */
1217
if (!data) {
1218
memset(&memory_exception_data, 0, sizeof(memory_exception_data));
1219
memory_exception_data.gpu_id = user_gpu_id;
1220
memory_exception_data.failure.imprecise = true;
1221
1222
/* Set failure reason */
1223
if (info) {
1224
memory_exception_data.va = (info->page_addr) <<
1225
PAGE_SHIFT;
1226
memory_exception_data.failure.NotPresent =
1227
info->prot_valid ? 1 : 0;
1228
memory_exception_data.failure.NoExecute =
1229
info->prot_exec ? 1 : 0;
1230
memory_exception_data.failure.ReadOnly =
1231
info->prot_write ? 1 : 0;
1232
memory_exception_data.failure.imprecise = 0;
1233
}
1234
}
1235
1236
rcu_read_lock();
1237
1238
id = KFD_FIRST_NONSIGNAL_EVENT_ID;
1239
idr_for_each_entry_continue(&p->event_idr, ev, id)
1240
if (ev->type == KFD_EVENT_TYPE_MEMORY) {
1241
spin_lock(&ev->lock);
1242
ev->memory_exception_data = data ? *data :
1243
memory_exception_data;
1244
set_event(ev);
1245
spin_unlock(&ev->lock);
1246
}
1247
1248
rcu_read_unlock();
1249
}
1250
1251
void kfd_signal_reset_event(struct kfd_node *dev)
1252
{
1253
struct kfd_hsa_hw_exception_data hw_exception_data;
1254
struct kfd_hsa_memory_exception_data memory_exception_data;
1255
struct kfd_process *p;
1256
struct kfd_event *ev;
1257
unsigned int temp;
1258
uint32_t id, idx;
1259
int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
1260
KFD_HW_EXCEPTION_ECC :
1261
KFD_HW_EXCEPTION_GPU_HANG;
1262
1263
/* Whole gpu reset caused by GPU hang and memory is lost */
1264
memset(&hw_exception_data, 0, sizeof(hw_exception_data));
1265
hw_exception_data.memory_lost = 1;
1266
hw_exception_data.reset_cause = reset_cause;
1267
1268
memset(&memory_exception_data, 0, sizeof(memory_exception_data));
1269
memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
1270
memory_exception_data.failure.imprecise = true;
1271
1272
idx = srcu_read_lock(&kfd_processes_srcu);
1273
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
1274
int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
1275
struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p);
1276
1277
if (unlikely(user_gpu_id == -EINVAL)) {
1278
WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
1279
continue;
1280
}
1281
1282
if (unlikely(!pdd)) {
1283
WARN_ONCE(1, "Could not get device data from process pid:%d\n",
1284
p->lead_thread->pid);
1285
continue;
1286
}
1287
1288
if (dev->dqm->detect_hang_count && !pdd->has_reset_queue)
1289
continue;
1290
1291
if (dev->dqm->detect_hang_count) {
1292
struct amdgpu_task_info *ti;
1293
struct amdgpu_fpriv *drv_priv;
1294
1295
if (unlikely(amdgpu_file_to_fpriv(pdd->drm_file, &drv_priv))) {
1296
WARN_ONCE(1, "Could not get vm for device %x from pid:%d\n",
1297
dev->id, p->lead_thread->pid);
1298
continue;
1299
}
1300
1301
ti = amdgpu_vm_get_task_info_vm(&drv_priv->vm);
1302
if (ti) {
1303
dev_err(dev->adev->dev,
1304
"Queues reset on process %s tid %d thread %s pid %d\n",
1305
ti->process_name, ti->tgid, ti->task.comm, ti->task.pid);
1306
amdgpu_vm_put_task_info(ti);
1307
}
1308
}
1309
1310
rcu_read_lock();
1311
1312
id = KFD_FIRST_NONSIGNAL_EVENT_ID;
1313
idr_for_each_entry_continue(&p->event_idr, ev, id) {
1314
if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
1315
spin_lock(&ev->lock);
1316
ev->hw_exception_data = hw_exception_data;
1317
ev->hw_exception_data.gpu_id = user_gpu_id;
1318
set_event(ev);
1319
spin_unlock(&ev->lock);
1320
}
1321
if (ev->type == KFD_EVENT_TYPE_MEMORY &&
1322
reset_cause == KFD_HW_EXCEPTION_ECC) {
1323
spin_lock(&ev->lock);
1324
ev->memory_exception_data = memory_exception_data;
1325
ev->memory_exception_data.gpu_id = user_gpu_id;
1326
set_event(ev);
1327
spin_unlock(&ev->lock);
1328
}
1329
}
1330
1331
rcu_read_unlock();
1332
}
1333
srcu_read_unlock(&kfd_processes_srcu, idx);
1334
}
1335
1336
/*
 * kfd_signal_poison_consumed_event - notify a process that it consumed
 * poisoned memory.
 * @dev:   node on which the poison was consumed
 * @pasid: PASID used to look up the owning kfd_process
 *
 * Signals all KFD_EVENT_TYPE_HW_EXCEPTION events (memory_lost set, reset
 * cause KFD_HW_EXCEPTION_ECC) and all KFD_EVENT_TYPE_MEMORY events
 * (ErrorType KFD_MEM_ERR_POISON_CONSUMED) of the process, then sends SIGBUS
 * to its lead thread. Returns early, with a warning, if the process or its
 * user GPU id cannot be resolved.
 */
void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
{
	struct kfd_hsa_memory_exception_data mem_data;
	struct kfd_hsa_hw_exception_data hw_data;
	struct kfd_process *p;
	struct kfd_event *ev;
	uint32_t id;
	int user_gpu_id;

	p = kfd_lookup_process_by_pasid(pasid, NULL);
	if (!p) {
		dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", pasid);
		return; /* Presumably process exited. */
	}

	user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
	if (unlikely(user_gpu_id == -EINVAL)) {
		WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
		goto out_unref;
	}

	memset(&hw_data, 0, sizeof(hw_data));
	hw_data.gpu_id = user_gpu_id;
	hw_data.memory_lost = 1;
	hw_data.reset_cause = KFD_HW_EXCEPTION_ECC;

	memset(&mem_data, 0, sizeof(mem_data));
	mem_data.ErrorType = KFD_MEM_ERR_POISON_CONSUMED;
	mem_data.gpu_id = user_gpu_id;
	mem_data.failure.imprecise = true;

	rcu_read_lock();

	id = KFD_FIRST_NONSIGNAL_EVENT_ID;
	idr_for_each_entry_continue(&p->event_idr, ev, id) {
		if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
			spin_lock(&ev->lock);
			ev->hw_exception_data = hw_data;
			set_event(ev);
			spin_unlock(&ev->lock);
		}

		if (ev->type == KFD_EVENT_TYPE_MEMORY) {
			spin_lock(&ev->lock);
			ev->memory_exception_data = mem_data;
			set_event(ev);
			spin_unlock(&ev->lock);
		}
	}

	dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n",
		 p->lead_thread->comm, pasid);
	rcu_read_unlock();

	/* The user application is expected to handle the SIGBUS signal. */
	send_sig(SIGBUS, p->lead_thread, 0);

out_unref:
	/* Drop the process reference held since the PASID lookup. */
	kfd_unref_process(p);
}
1394
1395