CoCalc -- kfd_events.c

GitHub Repository: torvalds/linux
Path: blob/master/drivers/gpu/drm/amd/amdkfd/kfd_events.c
²⁶⁵¹⁶ views
1
// SPDX-License-Identifier: GPL-2.0 OR MIT
2
/*
3
 * Copyright 2014-2022 Advanced Micro Devices, Inc.
4
 *
5
 * Permission is hereby granted, free of charge, to any person obtaining a
6
 * copy of this software and associated documentation files (the "Software"),
7
 * to deal in the Software without restriction, including without limitation
8
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
 * and/or sell copies of the Software, and to permit persons to whom the
10
 * Software is furnished to do so, subject to the following conditions:
11
 *
12
 * The above copyright notice and this permission notice shall be included in
13
 * all copies or substantial portions of the Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21
 * OTHER DEALINGS IN THE SOFTWARE.
22
 */
23

24
#include <linux/mm_types.h>
25
#include <linux/slab.h>
26
#include <linux/types.h>
27
#include <linux/sched/signal.h>
28
#include <linux/sched/mm.h>
29
#include <linux/uaccess.h>
30
#include <linux/mman.h>
31
#include <linux/memory.h>
32
#include "kfd_priv.h"
33
#include "kfd_events.h"
34
#include "kfd_device_queue_manager.h"
35
#include <linux/device.h>
36

37
/*
38
 * Wrapper around wait_queue_entry_t
39
 */
40
struct kfd_event_waiter {
41
	wait_queue_entry_t wait;
42
	struct kfd_event *event; /* Event to wait for */
43
	bool activated;		 /* Becomes true when event is signaled */
44
	bool event_age_enabled;  /* set to true when last_event_age is non-zero */
45
};
46

47
/*
48
 * Each signal event needs a 64-bit signal slot where the signaler will write
49
 * a 1 before sending an interrupt. (This is needed because some interrupts
50
 * do not contain enough spare data bits to identify an event.)
51
 * We get whole pages and map them to the process VA.
52
 * Individual signal events use their event_id as slot index.
53
 */
54
struct kfd_signal_page {
55
	uint64_t *kernel_address;
56
	uint64_t __user *user_address;
57
	bool need_to_free_pages;
58
};
59

60
static uint64_t *page_slots(struct kfd_signal_page *page)
61
{
62
	return page->kernel_address;
63
}
64

65
static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p)
66
{
67
	void *backing_store;
68
	struct kfd_signal_page *page;
69

70
	page = kzalloc(sizeof(*page), GFP_KERNEL);
71
	if (!page)
72
		return NULL;
73

74
	backing_store = (void *) __get_free_pages(GFP_KERNEL,
75
					get_order(KFD_SIGNAL_EVENT_LIMIT * 8));
76
	if (!backing_store)
77
		goto fail_alloc_signal_store;
78

79
	/* Initialize all events to unsignaled */
80
	memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT,
81
	       KFD_SIGNAL_EVENT_LIMIT * 8);
82

83
	page->kernel_address = backing_store;
84
	page->need_to_free_pages = true;
85
	pr_debug("Allocated new event signal page at %p, for process %p\n",
86
			page, p);
87

88
	return page;
89

90
fail_alloc_signal_store:
91
	kfree(page);
92
	return NULL;
93
}
94

95
static int allocate_event_notification_slot(struct kfd_process *p,
96
					    struct kfd_event *ev,
97
					    const int *restore_id)
98
{
99
	int id;
100

101
	if (!p->signal_page) {
102
		p->signal_page = allocate_signal_page(p);
103
		if (!p->signal_page)
104
			return -ENOMEM;
105
		/* Oldest user mode expects 256 event slots */
106
		p->signal_mapped_size = 256*8;
107
	}
108

109
	if (restore_id) {
110
		id = idr_alloc(&p->event_idr, ev, *restore_id, *restore_id + 1,
111
				GFP_KERNEL);
112
	} else {
113
		/*
114
		 * Compatibility with old user mode: Only use signal slots
115
		 * user mode has mapped, may be less than
116
		 * KFD_SIGNAL_EVENT_LIMIT. This also allows future increase
117
		 * of the event limit without breaking user mode.
118
		 */
119
		id = idr_alloc(&p->event_idr, ev, 0, p->signal_mapped_size / 8,
120
				GFP_KERNEL);
121
	}
122
	if (id < 0)
123
		return id;
124

125
	ev->event_id = id;
126
	page_slots(p->signal_page)[id] = UNSIGNALED_EVENT_SLOT;
127

128
	return 0;
129
}
130

131
/*
132
 * Assumes that p->event_mutex or rcu_read_lock() is held and of course that p is
133
 * not going away.
134
 */
135
static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id)
{
	/* NULL when no event is registered under @id */
	struct kfd_event *ev = idr_find(&p->event_idr, id);

	return ev;
}
139

140
/**
141
 * lookup_signaled_event_by_partial_id - Lookup signaled event from partial ID
142
 * @p:     Pointer to struct kfd_process
143
 * @id:    ID to look up
144
 * @bits:  Number of valid bits in @id
145
 *
146
 * Finds the first signaled event with a matching partial ID. If no
147
 * matching signaled event is found, returns NULL. In that case the
148
 * caller should assume that the partial ID is invalid and do an
149
 * exhaustive search of all signaled events.
150
 *
151
 * If multiple events with the same partial ID signal at the same
152
 * time, they will be found one interrupt at a time, not necessarily
153
 * in the same order the interrupts occurred. As long as the number of
154
 * interrupts is correct, all signaled events will be seen by the
155
 * driver.
156
 */
157
static struct kfd_event *lookup_signaled_event_by_partial_id(
	struct kfd_process *p, uint32_t id, uint32_t bits)
{
	uint32_t stride;

	if (!p->signal_page || id >= KFD_SIGNAL_EVENT_LIMIT)
		return NULL;

	/*
	 * Fast path: @bits covers the whole ID range, so @id names exactly
	 * one slot and a single lookup is enough.
	 */
	if (bits > 31 || (1U << bits) >= KFD_SIGNAL_EVENT_LIMIT) {
		if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT)
			return NULL;

		return idr_find(&p->event_idr, id);
	}

	/*
	 * Partial ID: walk every ID that shares the low @bits bits with @id
	 * and return the first signaled slot that has an event behind it.
	 */
	stride = 1U << bits;
	while (id < KFD_SIGNAL_EVENT_LIMIT) {
		if (page_slots(p->signal_page)[id] != UNSIGNALED_EVENT_SLOT) {
			struct kfd_event *ev = idr_find(&p->event_idr, id);

			if (ev)
				return ev;
		}
		id += stride;
	}

	return NULL;
}
187

188
static int create_signal_event(struct file *devkfd, struct kfd_process *p,
189
				struct kfd_event *ev, const int *restore_id)
190
{
191
	int ret;
192

193
	if (p->signal_mapped_size &&
194
	    p->signal_event_count == p->signal_mapped_size / 8) {
195
		if (!p->signal_event_limit_reached) {
196
			pr_debug("Signal event wasn't created because limit was reached\n");
197
			p->signal_event_limit_reached = true;
198
		}
199
		return -ENOSPC;
200
	}
201

202
	ret = allocate_event_notification_slot(p, ev, restore_id);
203
	if (ret) {
204
		pr_warn("Signal event wasn't created because out of kernel memory\n");
205
		return ret;
206
	}
207

208
	p->signal_event_count++;
209

210
	ev->user_signal_address = &p->signal_page->user_address[ev->event_id];
211
	pr_debug("Signal event number %zu created with id %d, address %p\n",
212
			p->signal_event_count, ev->event_id,
213
			ev->user_signal_address);
214

215
	return 0;
216
}
217

218
static int create_other_event(struct kfd_process *p, struct kfd_event *ev, const int *restore_id)
219
{
220
	int id;
221

222
	if (restore_id)
223
		id = idr_alloc(&p->event_idr, ev, *restore_id, *restore_id + 1,
224
			GFP_KERNEL);
225
	else
226
		/* Cast KFD_LAST_NONSIGNAL_EVENT to uint32_t. This allows an
227
		 * intentional integer overflow to -1 without a compiler
228
		 * warning. idr_alloc treats a negative value as "maximum
229
		 * signed integer".
230
		 */
231
		id = idr_alloc(&p->event_idr, ev, KFD_FIRST_NONSIGNAL_EVENT_ID,
232
				(uint32_t)KFD_LAST_NONSIGNAL_EVENT_ID + 1,
233
				GFP_KERNEL);
234

235
	if (id < 0)
236
		return id;
237
	ev->event_id = id;
238

239
	return 0;
240
}
241

242
int kfd_event_init_process(struct kfd_process *p)
243
{
244
	int id;
245

246
	mutex_init(&p->event_mutex);
247
	idr_init(&p->event_idr);
248
	p->signal_page = NULL;
249
	p->signal_event_count = 1;
250
	/* Allocate event ID 0. It is used for a fast path to ignore bogus events
251
	 * that are sent by the CP without a context ID
252
	 */
253
	id = idr_alloc(&p->event_idr, NULL, 0, 1, GFP_KERNEL);
254
	if (id < 0) {
255
		idr_destroy(&p->event_idr);
256
		mutex_destroy(&p->event_mutex);
257
		return id;
258
	}
259
	return 0;
260
}
261

262
static void destroy_event(struct kfd_process *p, struct kfd_event *ev)
263
{
264
	struct kfd_event_waiter *waiter;
265

266
	/* Wake up pending waiters. They will return failure */
267
	spin_lock(&ev->lock);
268
	list_for_each_entry(waiter, &ev->wq.head, wait.entry)
269
		WRITE_ONCE(waiter->event, NULL);
270
	wake_up_all(&ev->wq);
271
	spin_unlock(&ev->lock);
272

273
	if (ev->type == KFD_EVENT_TYPE_SIGNAL ||
274
	    ev->type == KFD_EVENT_TYPE_DEBUG)
275
		p->signal_event_count--;
276

277
	idr_remove(&p->event_idr, ev->event_id);
278
	kfree_rcu(ev, rcu);
279
}
280

281
static void destroy_events(struct kfd_process *p)
282
{
283
	struct kfd_event *ev;
284
	uint32_t id;
285

286
	idr_for_each_entry(&p->event_idr, ev, id)
287
		if (ev)
288
			destroy_event(p, ev);
289
	idr_destroy(&p->event_idr);
290
	mutex_destroy(&p->event_mutex);
291
}
292

293
/*
294
 * We assume that the process is being destroyed and there is no need to
295
 * unmap the pages or keep bookkeeping data in order.
296
 */
297
static void shutdown_signal_page(struct kfd_process *p)
298
{
299
	struct kfd_signal_page *page = p->signal_page;
300

301
	if (page) {
302
		if (page->need_to_free_pages)
303
			free_pages((unsigned long)page->kernel_address,
304
				   get_order(KFD_SIGNAL_EVENT_LIMIT * 8));
305
		kfree(page);
306
	}
307
}
308

309
/* Release all event state of @p: first the events, then the signal page. */
void kfd_event_free_process(struct kfd_process *p)
{
	/* Events go first; the signal page is released afterwards */
	destroy_events(p);

	shutdown_signal_page(p);
}
314

315
static bool event_can_be_gpu_signaled(const struct kfd_event *ev)
316
{
317
	return ev->type == KFD_EVENT_TYPE_SIGNAL ||
318
					ev->type == KFD_EVENT_TYPE_DEBUG;
319
}
320

321
static bool event_can_be_cpu_signaled(const struct kfd_event *ev)
322
{
323
	return ev->type == KFD_EVENT_TYPE_SIGNAL;
324
}
325

326
/*
 * Install an externally provided kernel mapping as the signal page of @p.
 *
 * @kernel_address: kernel mapping of the buffer that will hold the slots
 * @size:           size of that mapping in bytes
 * @user_handle:    handle stored in p->signal_handle for later reference
 *
 * The buffer is initialized to "all unsignaled" over the full
 * KFD_SIGNAL_EVENT_LIMIT slots, and other code in this file indexes slots
 * up to that limit, so a mapping smaller than KFD_SIGNAL_EVENT_LIMIT * 8
 * bytes is rejected instead of being written past its end.
 *
 * Returns 0 on success, -EBUSY if a signal page is already set, -EINVAL if
 * @size is too small, or -ENOMEM if the descriptor cannot be allocated.
 */
static int kfd_event_page_set(struct kfd_process *p, void *kernel_address,
		       uint64_t size, uint64_t user_handle)
{
	struct kfd_signal_page *page;

	if (p->signal_page)
		return -EBUSY;

	/* The memset below covers every slot; the buffer must hold them all */
	if (size < KFD_SIGNAL_EVENT_LIMIT * 8) {
		pr_err("Event page size %llu is too small, need at least %u bytes\n",
		       size, (unsigned int)(KFD_SIGNAL_EVENT_LIMIT * 8));
		return -EINVAL;
	}

	page = kzalloc(sizeof(*page), GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	/* Initialize all events to unsignaled */
	memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT,
	       KFD_SIGNAL_EVENT_LIMIT * 8);

	/* need_to_free_pages stays false: the buffer is not owned here */
	page->kernel_address = kernel_address;

	p->signal_page = page;
	p->signal_mapped_size = size;
	p->signal_handle = user_handle;
	return 0;
}
349

350
/*
 * Resolve @event_page_offset to a buffer object of @p, map it into the
 * kernel and install that mapping as the process signal page.
 *
 * Returns 0 on success. Returns -EINVAL if a signal page is already set,
 * the device cannot be found or the handle does not translate; otherwise
 * the error from binding, mapping or kfd_event_page_set().
 */
int kfd_kmap_event_page(struct kfd_process *p, uint64_t event_page_offset)
{
	struct kfd_process_device *pdd;
	struct kfd_node *node;
	void *bo_mem, *kaddr;
	uint64_t bo_size;
	int ret = 0;

	if (p->signal_page) {
		pr_err("Event page is already set\n");
		return -EINVAL;
	}

	/* The GPU ID is encoded in the offset */
	pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(event_page_offset));
	if (!pdd) {
		pr_err("Getting device by id failed in %s\n", __func__);
		return -EINVAL;
	}
	node = pdd->dev;

	pdd = kfd_bind_process_to_device(node, p);
	if (IS_ERR(pdd))
		return PTR_ERR(pdd);

	/* The remaining part of the offset is the buffer handle */
	bo_mem = kfd_process_device_translate_handle(pdd,
			GET_IDR_HANDLE(event_page_offset));
	if (!bo_mem) {
		pr_err("Can't find BO, offset is 0x%llx\n", event_page_offset);
		return -EINVAL;
	}

	ret = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(bo_mem, &kaddr, &bo_size);
	if (ret) {
		pr_err("Failed to map event page to kernel\n");
		return ret;
	}

	ret = kfd_event_page_set(p, kaddr, bo_size, event_page_offset);
	if (ret) {
		pr_err("Failed to set event page\n");
		/* Undo the kernel mapping created above */
		amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(bo_mem);
	}

	return ret;
}
395

396
int kfd_event_create(struct file *devkfd, struct kfd_process *p,
397
		     uint32_t event_type, bool auto_reset, uint32_t node_id,
398
		     uint32_t *event_id, uint32_t *event_trigger_data,
399
		     uint64_t *event_page_offset, uint32_t *event_slot_index)
400
{
401
	int ret = 0;
402
	struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL);
403

404
	if (!ev)
405
		return -ENOMEM;
406

407
	ev->type = event_type;
408
	ev->auto_reset = auto_reset;
409
	ev->signaled = false;
410

411
	spin_lock_init(&ev->lock);
412
	init_waitqueue_head(&ev->wq);
413

414
	*event_page_offset = 0;
415

416
	mutex_lock(&p->event_mutex);
417

418
	switch (event_type) {
419
	case KFD_EVENT_TYPE_SIGNAL:
420
	case KFD_EVENT_TYPE_DEBUG:
421
		ret = create_signal_event(devkfd, p, ev, NULL);
422
		if (!ret) {
423
			*event_page_offset = KFD_MMAP_TYPE_EVENTS;
424
			*event_slot_index = ev->event_id;
425
		}
426
		break;
427
	default:
428
		ret = create_other_event(p, ev, NULL);
429
		break;
430
	}
431

432
	if (!ret) {
433
		*event_id = ev->event_id;
434
		*event_trigger_data = ev->event_id;
435
		ev->event_age = 1;
436
	} else {
437
		kfree(ev);
438
	}
439

440
	mutex_unlock(&p->event_mutex);
441

442
	return ret;
443
}
444

445
int kfd_criu_restore_event(struct file *devkfd,
446
			   struct kfd_process *p,
447
			   uint8_t __user *user_priv_ptr,
448
			   uint64_t *priv_data_offset,
449
			   uint64_t max_priv_data_size)
450
{
451
	struct kfd_criu_event_priv_data *ev_priv;
452
	struct kfd_event *ev = NULL;
453
	int ret = 0;
454

455
	ev_priv = kmalloc(sizeof(*ev_priv), GFP_KERNEL);
456
	if (!ev_priv)
457
		return -ENOMEM;
458

459
	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
460
	if (!ev) {
461
		ret = -ENOMEM;
462
		goto exit;
463
	}
464

465
	if (*priv_data_offset + sizeof(*ev_priv) > max_priv_data_size) {
466
		ret = -EINVAL;
467
		goto exit;
468
	}
469

470
	ret = copy_from_user(ev_priv, user_priv_ptr + *priv_data_offset, sizeof(*ev_priv));
471
	if (ret) {
472
		ret = -EFAULT;
473
		goto exit;
474
	}
475
	*priv_data_offset += sizeof(*ev_priv);
476

477
	if (ev_priv->user_handle) {
478
		ret = kfd_kmap_event_page(p, ev_priv->user_handle);
479
		if (ret)
480
			goto exit;
481
	}
482

483
	ev->type = ev_priv->type;
484
	ev->auto_reset = ev_priv->auto_reset;
485
	ev->signaled = ev_priv->signaled;
486

487
	spin_lock_init(&ev->lock);
488
	init_waitqueue_head(&ev->wq);
489

490
	mutex_lock(&p->event_mutex);
491
	switch (ev->type) {
492
	case KFD_EVENT_TYPE_SIGNAL:
493
	case KFD_EVENT_TYPE_DEBUG:
494
		ret = create_signal_event(devkfd, p, ev, &ev_priv->event_id);
495
		break;
496
	case KFD_EVENT_TYPE_MEMORY:
497
		memcpy(&ev->memory_exception_data,
498
			&ev_priv->memory_exception_data,
499
			sizeof(struct kfd_hsa_memory_exception_data));
500

501
		ret = create_other_event(p, ev, &ev_priv->event_id);
502
		break;
503
	case KFD_EVENT_TYPE_HW_EXCEPTION:
504
		memcpy(&ev->hw_exception_data,
505
			&ev_priv->hw_exception_data,
506
			sizeof(struct kfd_hsa_hw_exception_data));
507

508
		ret = create_other_event(p, ev, &ev_priv->event_id);
509
		break;
510
	}
511
	mutex_unlock(&p->event_mutex);
512

513
exit:
514
	if (ret)
515
		kfree(ev);
516

517
	kfree(ev_priv);
518

519
	return ret;
520
}
521

522
int kfd_criu_checkpoint_events(struct kfd_process *p,
523
			 uint8_t __user *user_priv_data,
524
			 uint64_t *priv_data_offset)
525
{
526
	struct kfd_criu_event_priv_data *ev_privs;
527
	int i = 0;
528
	int ret =  0;
529
	struct kfd_event *ev;
530
	uint32_t ev_id;
531

532
	uint32_t num_events = kfd_get_num_events(p);
533

534
	if (!num_events)
535
		return 0;
536

537
	ev_privs = kvzalloc(num_events * sizeof(*ev_privs), GFP_KERNEL);
538
	if (!ev_privs)
539
		return -ENOMEM;
540

541

542
	idr_for_each_entry(&p->event_idr, ev, ev_id) {
543
		struct kfd_criu_event_priv_data *ev_priv;
544

545
		/*
546
		 * Currently, all events have same size of private_data, but the current ioctl's
547
		 * and CRIU plugin supports private_data of variable sizes
548
		 */
549
		ev_priv = &ev_privs[i];
550

551
		ev_priv->object_type = KFD_CRIU_OBJECT_TYPE_EVENT;
552

553
		/* We store the user_handle with the first event */
554
		if (i == 0 && p->signal_page)
555
			ev_priv->user_handle = p->signal_handle;
556

557
		ev_priv->event_id = ev->event_id;
558
		ev_priv->auto_reset = ev->auto_reset;
559
		ev_priv->type = ev->type;
560
		ev_priv->signaled = ev->signaled;
561

562
		if (ev_priv->type == KFD_EVENT_TYPE_MEMORY)
563
			memcpy(&ev_priv->memory_exception_data,
564
				&ev->memory_exception_data,
565
				sizeof(struct kfd_hsa_memory_exception_data));
566
		else if (ev_priv->type == KFD_EVENT_TYPE_HW_EXCEPTION)
567
			memcpy(&ev_priv->hw_exception_data,
568
				&ev->hw_exception_data,
569
				sizeof(struct kfd_hsa_hw_exception_data));
570

571
		pr_debug("Checkpointed event[%d] id = 0x%08x auto_reset = %x type = %x signaled = %x\n",
572
			  i,
573
			  ev_priv->event_id,
574
			  ev_priv->auto_reset,
575
			  ev_priv->type,
576
			  ev_priv->signaled);
577
		i++;
578
	}
579

580
	ret = copy_to_user(user_priv_data + *priv_data_offset,
581
			   ev_privs, num_events * sizeof(*ev_privs));
582
	if (ret) {
583
		pr_err("Failed to copy events priv to user\n");
584
		ret = -EFAULT;
585
	}
586

587
	*priv_data_offset += num_events * sizeof(*ev_privs);
588

589
	kvfree(ev_privs);
590
	return ret;
591
}
592

593
int kfd_get_num_events(struct kfd_process *p)
594
{
595
	struct kfd_event *ev;
596
	uint32_t id;
597
	u32 num_events = 0;
598

599
	idr_for_each_entry(&p->event_idr, ev, id)
600
		num_events++;
601

602
	return num_events;
603
}
604

605
/* Assumes that p is current. */
606
int kfd_event_destroy(struct kfd_process *p, uint32_t event_id)
{
	struct kfd_event *ev;
	int ret = -EINVAL;	/* stays -EINVAL if no such event exists */

	mutex_lock(&p->event_mutex);

	ev = lookup_event_by_id(p, event_id);
	if (ev) {
		destroy_event(p, ev);
		ret = 0;
	}

	mutex_unlock(&p->event_mutex);
	return ret;
}
623

624
static void set_event(struct kfd_event *ev)
625
{
626
	struct kfd_event_waiter *waiter;
627

628
	/* Auto reset if the list is non-empty and we're waking
629
	 * someone. waitqueue_active is safe here because we're
630
	 * protected by the ev->lock, which is also held when
631
	 * updating the wait queues in kfd_wait_on_events.
632
	 */
633
	ev->signaled = !ev->auto_reset || !waitqueue_active(&ev->wq);
634
	if (!(++ev->event_age)) {
635
		/* Never wrap back to reserved/default event age 0/1 */
636
		ev->event_age = 2;
637
		WARN_ONCE(1, "event_age wrap back!");
638
	}
639

640
	list_for_each_entry(waiter, &ev->wq.head, wait.entry)
641
		WRITE_ONCE(waiter->activated, true);
642

643
	wake_up_all(&ev->wq);
644
}
645

646
/* Assumes that p is current. */
647
int kfd_set_event(struct kfd_process *p, uint32_t event_id)
{
	struct kfd_event *ev;
	int ret = 0;

	/* The lookup is done under RCU rather than the event mutex */
	rcu_read_lock();

	ev = lookup_event_by_id(p, event_id);
	if (ev) {
		spin_lock(&ev->lock);

		/* Only events that allow CPU signaling may be set here */
		if (!event_can_be_cpu_signaled(ev))
			ret = -EINVAL;
		else
			set_event(ev);

		spin_unlock(&ev->lock);
	} else {
		ret = -EINVAL;
	}

	rcu_read_unlock();
	return ret;
}
671

672
static void reset_event(struct kfd_event *ev)
673
{
674
	ev->signaled = false;
675
}
676

677
/* Assumes that p is current. */
678
int kfd_reset_event(struct kfd_process *p, uint32_t event_id)
{
	struct kfd_event *ev;
	int ret = 0;

	/* The lookup is done under RCU rather than the event mutex */
	rcu_read_lock();

	ev = lookup_event_by_id(p, event_id);
	if (ev) {
		spin_lock(&ev->lock);

		/* Only events that allow CPU signaling may be reset here */
		if (!event_can_be_cpu_signaled(ev))
			ret = -EINVAL;
		else
			reset_event(ev);

		spin_unlock(&ev->lock);
	} else {
		ret = -EINVAL;
	}

	rcu_read_unlock();
	return ret;
}
703

704
static void acknowledge_signal(struct kfd_process *p, struct kfd_event *ev)
705
{
706
	WRITE_ONCE(page_slots(p->signal_page)[ev->event_id], UNSIGNALED_EVENT_SLOT);
707
}
708

709
static void set_event_from_interrupt(struct kfd_process *p,
710
					struct kfd_event *ev)
711
{
712
	if (ev && event_can_be_gpu_signaled(ev)) {
713
		acknowledge_signal(p, ev);
714
		spin_lock(&ev->lock);
715
		set_event(ev);
716
		spin_unlock(&ev->lock);
717
	}
718
}
719

720
/*
 * Handle an event interrupt for the process identified by @pasid.
 *
 * @partial_id:    event ID bits carried by the interrupt
 * @valid_id_bits: number of valid bits in @partial_id; 0 means no ID
 *
 * A targeted lookup is tried first. If it finds nothing and the process has
 * a signal page, all signaled slots are searched instead.
 */
void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
				uint32_t valid_id_bits)
{
	struct kfd_event *ev = NULL;
	struct kfd_process *p;
	uint64_t *slots;
	uint32_t id;

	/*
	 * This runs from arbitrary context (workqueue), not process context,
	 * so the process could try to exit meanwhile. The lookup therefore
	 * takes a process reference, dropped again at the end.
	 */
	p = kfd_lookup_process_by_pasid(pasid, NULL);
	if (!p)
		return; /* Presumably process exited. */

	rcu_read_lock();

	if (valid_id_bits)
		ev = lookup_signaled_event_by_partial_id(p, partial_id,
							 valid_id_bits);
	if (ev) {
		set_event_from_interrupt(p, ev);
		goto out_unlock;
	}

	if (!p->signal_page)
		goto out_unlock;

	/*
	 * The partial ID lookup found nothing. Treat the ID from the
	 * interrupt payload as invalid and search all signaled events.
	 */
	slots = page_slots(p->signal_page);

	/*
	 * Exception: the ID is complete and in range, but its slot is not
	 * signaled. The GPU may have signaled the same event twice before
	 * the driver handled the first interrupt; the slot was then reset
	 * when set_event woke user space. The second interrupt is dropped,
	 * since the application only needs to be woken once.
	 */
	if ((valid_id_bits > 31 || (1U << valid_id_bits) >= KFD_SIGNAL_EVENT_LIMIT) &&
	    partial_id < KFD_SIGNAL_EVENT_LIMIT &&
	    slots[partial_id] == UNSIGNALED_EVENT_SLOT)
		goto out_unlock;

	if (valid_id_bits)
		pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n",
				     partial_id, valid_id_bits);

	if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT / 64) {
		/* Few events: walking the event IDR is the faster option */
		idr_for_each_entry(&p->event_idr, ev, id) {
			if (id >= KFD_SIGNAL_EVENT_LIMIT)
				break;

			if (READ_ONCE(slots[id]) == UNSIGNALED_EVENT_SLOT)
				continue;

			set_event_from_interrupt(p, ev);
		}
	} else {
		/*
		 * Many events: scan the signal slots and look up only the
		 * signaled ones in the IDR. Slot 0 is skipped.
		 */
		for (id = 1; id < KFD_SIGNAL_EVENT_LIMIT; id++) {
			if (READ_ONCE(slots[id]) == UNSIGNALED_EVENT_SLOT)
				continue;

			ev = lookup_event_by_id(p, id);
			set_event_from_interrupt(p, ev);
		}
	}

out_unlock:
	rcu_read_unlock();
	kfd_unref_process(p);
}
793

794
/*
 * Allocate a zeroed array of @num_events waiters and initialize the wait
 * entry of each one.
 *
 * Returns the array, or NULL if the allocation fails.
 */
static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events)
{
	struct kfd_event_waiter *waiters;
	uint32_t n;

	waiters = kcalloc(num_events, sizeof(struct kfd_event_waiter),
			  GFP_KERNEL);
	if (!waiters)
		return NULL;

	for (n = 0; n < num_events; n++) {
		struct kfd_event_waiter *w = &waiters[n];

		init_wait(&w->wait);
	}

	return waiters;
}
809

810
static int init_event_waiter(struct kfd_process *p,
811
		struct kfd_event_waiter *waiter,
812
		struct kfd_event_data *event_data)
813
{
814
	struct kfd_event *ev = lookup_event_by_id(p, event_data->event_id);
815

816
	if (!ev)
817
		return -EINVAL;
818

819
	spin_lock(&ev->lock);
820
	waiter->event = ev;
821
	waiter->activated = ev->signaled;
822
	ev->signaled = ev->signaled && !ev->auto_reset;
823

824
	/* last_event_age = 0 reserved for backward compatible */
825
	if (waiter->event->type == KFD_EVENT_TYPE_SIGNAL &&
826
		event_data->signal_event_data.last_event_age) {
827
		waiter->event_age_enabled = true;
828
		if (ev->event_age != event_data->signal_event_data.last_event_age)
829
			waiter->activated = true;
830
	}
831

832
	if (!waiter->activated)
833
		add_wait_queue(&ev->wq, &waiter->wait);
834
	spin_unlock(&ev->lock);
835

836
	return 0;
837
}
838

839
/* test_event_condition - Test condition of events being waited for
840
 * @all:           Return completion only if all events have signaled
841
 * @num_events:    Number of events to wait for
842
 * @event_waiters: Array of event waiters, one per event
843
 *
844
 * Returns KFD_IOC_WAIT_RESULT_COMPLETE if all (or one) event(s) have
845
 * signaled. Returns KFD_IOC_WAIT_RESULT_TIMEOUT if no (or not all)
846
 * events have signaled. Returns KFD_IOC_WAIT_RESULT_FAIL if any of
847
 * the events have been destroyed.
848
 */
849
static uint32_t test_event_condition(bool all, uint32_t num_events,
				struct kfd_event_waiter *event_waiters)
{
	uint32_t idx;
	uint32_t n_activated = 0;

	for (idx = 0; idx < num_events; idx++) {
		struct kfd_event_waiter *w = &event_waiters[idx];

		/* A cleared event pointer means the event was destroyed */
		if (!READ_ONCE(w->event))
			return KFD_IOC_WAIT_RESULT_FAIL;

		if (!READ_ONCE(w->activated))
			continue;

		/* In "any" mode the first activated waiter is enough */
		if (!all)
			return KFD_IOC_WAIT_RESULT_COMPLETE;

		n_activated++;
	}

	if (n_activated == num_events)
		return KFD_IOC_WAIT_RESULT_COMPLETE;

	return KFD_IOC_WAIT_RESULT_TIMEOUT;
}
870

871
/*
872
 * Copy event specific data, if defined.
873
 * Currently only memory exception events have additional data to copy to user
874
 */
875
static int copy_signaled_event_data(uint32_t num_events,
876
		struct kfd_event_waiter *event_waiters,
877
		struct kfd_event_data __user *data)
878
{
879
	void *src;
880
	void __user *dst;
881
	struct kfd_event_waiter *waiter;
882
	struct kfd_event *event;
883
	uint32_t i, size = 0;
884

885
	for (i = 0; i < num_events; i++) {
886
		waiter = &event_waiters[i];
887
		event = waiter->event;
888
		if (!event)
889
			return -EINVAL; /* event was destroyed */
890
		if (waiter->activated) {
891
			if (event->type == KFD_EVENT_TYPE_MEMORY) {
892
				dst = &data[i].memory_exception_data;
893
				src = &event->memory_exception_data;
894
				size = sizeof(struct kfd_hsa_memory_exception_data);
895
			} else if (event->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
896
				dst = &data[i].memory_exception_data;
897
				src = &event->hw_exception_data;
898
				size = sizeof(struct kfd_hsa_hw_exception_data);
899
			} else if (event->type == KFD_EVENT_TYPE_SIGNAL &&
900
				waiter->event_age_enabled) {
901
				dst = &data[i].signal_event_data.last_event_age;
902
				src = &event->event_age;
903
				size = sizeof(u64);
904
			}
905
			if (size && copy_to_user(dst, src, size))
906
				return -EFAULT;
907
		}
908
	}
909

910
	return 0;
911
}
912

913
/*
 * Convert a user timeout in milliseconds to jiffies.
 *
 * KFD_EVENT_TIMEOUT_IMMEDIATE maps to 0 and KFD_EVENT_TIMEOUT_INFINITE to
 * MAX_SCHEDULE_TIMEOUT. Any other value is clamped to 0x7FFFFFFF ms,
 * converted, and increased by one jiffy.
 */
static long user_timeout_to_jiffies(uint32_t user_timeout_ms)
{
	uint32_t clamped_ms;

	if (user_timeout_ms == KFD_EVENT_TIMEOUT_IMMEDIATE)
		return 0;

	if (user_timeout_ms == KFD_EVENT_TIMEOUT_INFINITE)
		return MAX_SCHEDULE_TIMEOUT;

	/*
	 * msecs_to_jiffies treats everything above 2^31-1 as infinite, while
	 * such values are meant to be finite here. Clamping is not exact,
	 * but the difference is unlikely to be noticed.
	 */
	clamped_ms = min_t(uint32_t, user_timeout_ms, 0x7FFFFFFF);

	return msecs_to_jiffies(clamped_ms) + 1;
}
930

931
static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters,
932
			 bool undo_auto_reset)
933
{
934
	uint32_t i;
935

936
	for (i = 0; i < num_events; i++)
937
		if (waiters[i].event) {
938
			spin_lock(&waiters[i].event->lock);
939
			remove_wait_queue(&waiters[i].event->wq,
940
					  &waiters[i].wait);
941
			if (undo_auto_reset && waiters[i].activated &&
942
			    waiters[i].event && waiters[i].event->auto_reset)
943
				set_event(waiters[i].event);
944
			spin_unlock(&waiters[i].event->lock);
945
		}
946

947
	kfree(waiters);
948
}
949

950
int kfd_wait_on_events(struct kfd_process *p,
951
		       uint32_t num_events, void __user *data,
952
		       bool all, uint32_t *user_timeout_ms,
953
		       uint32_t *wait_result)
954
{
955
	struct kfd_event_data __user *events =
956
			(struct kfd_event_data __user *) data;
957
	uint32_t i;
958
	int ret = 0;
959

960
	struct kfd_event_waiter *event_waiters = NULL;
961
	long timeout = user_timeout_to_jiffies(*user_timeout_ms);
962

963
	event_waiters = alloc_event_waiters(num_events);
964
	if (!event_waiters) {
965
		ret = -ENOMEM;
966
		goto out;
967
	}
968

969
	/* Use p->event_mutex here to protect against concurrent creation and
970
	 * destruction of events while we initialize event_waiters.
971
	 */
972
	mutex_lock(&p->event_mutex);
973

974
	for (i = 0; i < num_events; i++) {
975
		struct kfd_event_data event_data;
976

977
		if (copy_from_user(&event_data, &events[i],
978
				sizeof(struct kfd_event_data))) {
979
			ret = -EFAULT;
980
			goto out_unlock;
981
		}
982

983
		ret = init_event_waiter(p, &event_waiters[i], &event_data);
984
		if (ret)
985
			goto out_unlock;
986
	}
987

988
	/* Check condition once. */
989
	*wait_result = test_event_condition(all, num_events, event_waiters);
990
	if (*wait_result == KFD_IOC_WAIT_RESULT_COMPLETE) {
991
		ret = copy_signaled_event_data(num_events,
992
					       event_waiters, events);
993
		goto out_unlock;
994
	} else if (WARN_ON(*wait_result == KFD_IOC_WAIT_RESULT_FAIL)) {
995
		/* This should not happen. Events shouldn't be
996
		 * destroyed while we're holding the event_mutex
997
		 */
998
		goto out_unlock;
999
	}
1000

1001
	mutex_unlock(&p->event_mutex);
1002

1003
	while (true) {
1004
		if (fatal_signal_pending(current)) {
1005
			ret = -EINTR;
1006
			break;
1007
		}
1008

1009
		if (signal_pending(current)) {
1010
			ret = -ERESTARTSYS;
1011
			if (*user_timeout_ms != KFD_EVENT_TIMEOUT_IMMEDIATE &&
1012
			    *user_timeout_ms != KFD_EVENT_TIMEOUT_INFINITE)
1013
				*user_timeout_ms = jiffies_to_msecs(
1014
					max(0l, timeout-1));
1015
			break;
1016
		}
1017

1018
		/* Set task state to interruptible sleep before
1019
		 * checking wake-up conditions. A concurrent wake-up
1020
		 * will put the task back into runnable state. In that
1021
		 * case schedule_timeout will not put the task to
1022
		 * sleep and we'll get a chance to re-check the
1023
		 * updated conditions almost immediately. Otherwise,
1024
		 * this race condition would lead to a soft hang or a
1025
		 * very long sleep.
1026
		 */
1027
		set_current_state(TASK_INTERRUPTIBLE);
1028

1029
		*wait_result = test_event_condition(all, num_events,
1030
						    event_waiters);
1031
		if (*wait_result != KFD_IOC_WAIT_RESULT_TIMEOUT)
1032
			break;
1033

1034
		if (timeout <= 0)
1035
			break;
1036

1037
		timeout = schedule_timeout(timeout);
1038
	}
1039
	__set_current_state(TASK_RUNNING);
1040

1041
	mutex_lock(&p->event_mutex);
1042
	/* copy_signaled_event_data may sleep. So this has to happen
1043
	 * after the task state is set back to RUNNING.
1044
	 *
1045
	 * The event may also have been destroyed after signaling. So
1046
	 * copy_signaled_event_data also must confirm that the event
1047
	 * still exists. Therefore this must be under the p->event_mutex
1048
	 * which is also held when events are destroyed.
1049
	 */
1050
	if (!ret && *wait_result == KFD_IOC_WAIT_RESULT_COMPLETE)
1051
		ret = copy_signaled_event_data(num_events,
1052
					       event_waiters, events);
1053

1054
out_unlock:
1055
	free_waiters(num_events, event_waiters, ret == -ERESTARTSYS);
1056
	mutex_unlock(&p->event_mutex);
1057
out:
1058
	if (ret)
1059
		*wait_result = KFD_IOC_WAIT_RESULT_FAIL;
1060
	else if (*wait_result == KFD_IOC_WAIT_RESULT_FAIL)
1061
		ret = -EIO;
1062

1063
	return ret;
1064
}
1065

1066
int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma)
1067
{
1068
	unsigned long pfn;
1069
	struct kfd_signal_page *page;
1070
	int ret;
1071

1072
	/* check required size doesn't exceed the allocated size */
1073
	if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) <
1074
			get_order(vma->vm_end - vma->vm_start)) {
1075
		pr_err("Event page mmap requested illegal size\n");
1076
		return -EINVAL;
1077
	}
1078

1079
	page = p->signal_page;
1080
	if (!page) {
1081
		/* Probably KFD bug, but mmap is user-accessible. */
1082
		pr_debug("Signal page could not be found\n");
1083
		return -EINVAL;
1084
	}
1085

1086
	pfn = __pa(page->kernel_address);
1087
	pfn >>= PAGE_SHIFT;
1088

1089
	vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE
1090
		       | VM_DONTDUMP | VM_PFNMAP);
1091

1092
	pr_debug("Mapping signal page\n");
1093
	pr_debug("     start user address  == 0x%08lx\n", vma->vm_start);
1094
	pr_debug("     end user address    == 0x%08lx\n", vma->vm_end);
1095
	pr_debug("     pfn                 == 0x%016lX\n", pfn);
1096
	pr_debug("     vm_flags            == 0x%08lX\n", vma->vm_flags);
1097
	pr_debug("     size                == 0x%08lX\n",
1098
			vma->vm_end - vma->vm_start);
1099

1100
	page->user_address = (uint64_t __user *)vma->vm_start;
1101

1102
	/* mapping the page to user process */
1103
	ret = remap_pfn_range(vma, vma->vm_start, pfn,
1104
			vma->vm_end - vma->vm_start, vma->vm_page_prot);
1105
	if (!ret)
1106
		p->signal_mapped_size = vma->vm_end - vma->vm_start;
1107

1108
	return ret;
1109
}
1110

1111
/*
1112
 * Assumes that p is not going away.
1113
 */
1114
static void lookup_events_by_type_and_signal(struct kfd_process *p,
1115
		int type, void *event_data)
1116
{
1117
	struct kfd_hsa_memory_exception_data *ev_data;
1118
	struct kfd_event *ev;
1119
	uint32_t id;
1120
	bool send_signal = true;
1121

1122
	ev_data = (struct kfd_hsa_memory_exception_data *) event_data;
1123

1124
	rcu_read_lock();
1125

1126
	id = KFD_FIRST_NONSIGNAL_EVENT_ID;
1127
	idr_for_each_entry_continue(&p->event_idr, ev, id)
1128
		if (ev->type == type) {
1129
			send_signal = false;
1130
			dev_dbg(kfd_device,
1131
					"Event found: id %X type %d",
1132
					ev->event_id, ev->type);
1133
			spin_lock(&ev->lock);
1134
			set_event(ev);
1135
			if (ev->type == KFD_EVENT_TYPE_MEMORY && ev_data)
1136
				ev->memory_exception_data = *ev_data;
1137
			spin_unlock(&ev->lock);
1138
		}
1139

1140
	if (type == KFD_EVENT_TYPE_MEMORY) {
1141
		dev_warn(kfd_device,
1142
			"Sending SIGSEGV to process pid %d",
1143
				p->lead_thread->pid);
1144
		send_sig(SIGSEGV, p->lead_thread, 0);
1145
	}
1146

1147
	/* Send SIGTERM no event of type "type" has been found*/
1148
	if (send_signal) {
1149
		if (send_sigterm) {
1150
			dev_warn(kfd_device,
1151
				"Sending SIGTERM to process pid %d",
1152
					p->lead_thread->pid);
1153
			send_sig(SIGTERM, p->lead_thread, 0);
1154
		} else {
1155
			dev_err(kfd_device,
1156
				"Process pid %d got unhandled exception",
1157
				p->lead_thread->pid);
1158
		}
1159
	}
1160

1161
	rcu_read_unlock();
1162
}
1163

1164
/*
 * Signal all KFD_EVENT_TYPE_HW_EXCEPTION events of the process identified
 * by pasid. Does nothing if no such process is found.
 */
void kfd_signal_hw_exception_event(u32 pasid)
{
	struct kfd_process *proc;

	/*
	 * Because we are called from arbitrary context (workqueue) as opposed
	 * to process context, kfd_process could attempt to exit while we are
	 * running so the lookup function increments the process ref count.
	 */
	proc = kfd_lookup_process_by_pasid(pasid, NULL);
	if (proc) {
		lookup_events_by_type_and_signal(proc,
						 KFD_EVENT_TYPE_HW_EXCEPTION,
						 NULL);
		/* Drop the reference taken by the lookup. */
		kfd_unref_process(proc);
	}
	/* else: presumably process exited. */
}
1179

1180
/*
 * Report a "not present" memory fault at gpu_va on every device of
 * process p: each process device is evicted and then has its memory
 * events signalled with the fault description.
 */
void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p, uint64_t gpu_va)
{
	struct kfd_hsa_memory_exception_data fault;
	int idx;

	/* Zero first so that every field not set below is cleared. */
	memset(&fault, 0, sizeof(fault));
	fault.failure.NotPresent = 1;
	fault.va = gpu_va;

	/* Send VM seg fault to all kfd process devices */
	for (idx = 0; idx < p->n_pdds; idx++) {
		struct kfd_process_device *cur = p->pdds[idx];

		/* Only the gpu_id differs from one device to the next. */
		fault.gpu_id = cur->user_gpu_id;
		kfd_evict_process_device(cur);
		kfd_signal_vm_fault_event(cur, NULL, &fault);
	}
}
1198

1199
void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
1200
				struct kfd_vm_fault_info *info,
1201
				struct kfd_hsa_memory_exception_data *data)
1202
{
1203
	struct kfd_event *ev;
1204
	uint32_t id;
1205
	struct kfd_process *p = pdd->process;
1206
	struct kfd_hsa_memory_exception_data memory_exception_data;
1207
	int user_gpu_id;
1208

1209
	user_gpu_id = kfd_process_get_user_gpu_id(p, pdd->dev->id);
1210
	if (unlikely(user_gpu_id == -EINVAL)) {
1211
		WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n",
1212
			  pdd->dev->id);
1213
		return;
1214
	}
1215

1216
	/* SoC15 chips and onwards will pass in data from now on. */
1217
	if (!data) {
1218
		memset(&memory_exception_data, 0, sizeof(memory_exception_data));
1219
		memory_exception_data.gpu_id = user_gpu_id;
1220
		memory_exception_data.failure.imprecise = true;
1221

1222
		/* Set failure reason */
1223
		if (info) {
1224
			memory_exception_data.va = (info->page_addr) <<
1225
								PAGE_SHIFT;
1226
			memory_exception_data.failure.NotPresent =
1227
				info->prot_valid ? 1 : 0;
1228
			memory_exception_data.failure.NoExecute =
1229
				info->prot_exec ? 1 : 0;
1230
			memory_exception_data.failure.ReadOnly =
1231
				info->prot_write ? 1 : 0;
1232
			memory_exception_data.failure.imprecise = 0;
1233
		}
1234
	}
1235

1236
	rcu_read_lock();
1237

1238
	id = KFD_FIRST_NONSIGNAL_EVENT_ID;
1239
	idr_for_each_entry_continue(&p->event_idr, ev, id)
1240
		if (ev->type == KFD_EVENT_TYPE_MEMORY) {
1241
			spin_lock(&ev->lock);
1242
			ev->memory_exception_data = data ? *data :
1243
							memory_exception_data;
1244
			set_event(ev);
1245
			spin_unlock(&ev->lock);
1246
		}
1247

1248
	rcu_read_unlock();
1249
}
1250

1251
void kfd_signal_reset_event(struct kfd_node *dev)
1252
{
1253
	struct kfd_hsa_hw_exception_data hw_exception_data;
1254
	struct kfd_hsa_memory_exception_data memory_exception_data;
1255
	struct kfd_process *p;
1256
	struct kfd_event *ev;
1257
	unsigned int temp;
1258
	uint32_t id, idx;
1259
	int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
1260
			KFD_HW_EXCEPTION_ECC :
1261
			KFD_HW_EXCEPTION_GPU_HANG;
1262

1263
	/* Whole gpu reset caused by GPU hang and memory is lost */
1264
	memset(&hw_exception_data, 0, sizeof(hw_exception_data));
1265
	hw_exception_data.memory_lost = 1;
1266
	hw_exception_data.reset_cause = reset_cause;
1267

1268
	memset(&memory_exception_data, 0, sizeof(memory_exception_data));
1269
	memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
1270
	memory_exception_data.failure.imprecise = true;
1271

1272
	idx = srcu_read_lock(&kfd_processes_srcu);
1273
	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
1274
		int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
1275
		struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p);
1276

1277
		if (unlikely(user_gpu_id == -EINVAL)) {
1278
			WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
1279
			continue;
1280
		}
1281

1282
		if (unlikely(!pdd)) {
1283
			WARN_ONCE(1, "Could not get device data from process pid:%d\n",
1284
				  p->lead_thread->pid);
1285
			continue;
1286
		}
1287

1288
		if (dev->dqm->detect_hang_count && !pdd->has_reset_queue)
1289
			continue;
1290

1291
		if (dev->dqm->detect_hang_count) {
1292
			struct amdgpu_task_info *ti;
1293
			struct amdgpu_fpriv *drv_priv;
1294

1295
			if (unlikely(amdgpu_file_to_fpriv(pdd->drm_file, &drv_priv))) {
1296
				WARN_ONCE(1, "Could not get vm for device %x from pid:%d\n",
1297
					  dev->id, p->lead_thread->pid);
1298
				continue;
1299
			}
1300

1301
			ti = amdgpu_vm_get_task_info_vm(&drv_priv->vm);
1302
			if (ti) {
1303
				dev_err(dev->adev->dev,
1304
					"Queues reset on process %s tid %d thread %s pid %d\n",
1305
					ti->process_name, ti->tgid, ti->task.comm, ti->task.pid);
1306
				amdgpu_vm_put_task_info(ti);
1307
			}
1308
		}
1309

1310
		rcu_read_lock();
1311

1312
		id = KFD_FIRST_NONSIGNAL_EVENT_ID;
1313
		idr_for_each_entry_continue(&p->event_idr, ev, id) {
1314
			if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
1315
				spin_lock(&ev->lock);
1316
				ev->hw_exception_data = hw_exception_data;
1317
				ev->hw_exception_data.gpu_id = user_gpu_id;
1318
				set_event(ev);
1319
				spin_unlock(&ev->lock);
1320
			}
1321
			if (ev->type == KFD_EVENT_TYPE_MEMORY &&
1322
			    reset_cause == KFD_HW_EXCEPTION_ECC) {
1323
				spin_lock(&ev->lock);
1324
				ev->memory_exception_data = memory_exception_data;
1325
				ev->memory_exception_data.gpu_id = user_gpu_id;
1326
				set_event(ev);
1327
				spin_unlock(&ev->lock);
1328
			}
1329
		}
1330

1331
		rcu_read_unlock();
1332
	}
1333
	srcu_read_unlock(&kfd_processes_srcu, idx);
1334
}
1335

1336
/*
 * Report consumption of poisoned memory on dev to the process identified
 * by pasid: all of its KFD_EVENT_TYPE_HW_EXCEPTION and
 * KFD_EVENT_TYPE_MEMORY events are signalled with matching payloads, and
 * SIGBUS is then sent to the process's lead thread.
 *
 * Does nothing beyond logging if the process cannot be found or no user
 * gpu id can be resolved for dev.
 */
void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
{
	struct kfd_hsa_memory_exception_data mem_payload;
	struct kfd_hsa_hw_exception_data hw_payload;
	struct kfd_process *proc;
	struct kfd_event *ev;
	uint32_t id;
	int user_gpu_id;

	proc = kfd_lookup_process_by_pasid(pasid, NULL);
	if (!proc) {
		dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", pasid);
		return; /* Presumably process exited. */
	}

	user_gpu_id = kfd_process_get_user_gpu_id(proc, dev->id);
	if (unlikely(user_gpu_id == -EINVAL)) {
		WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
		goto out_unref;
	}

	memset(&hw_payload, 0, sizeof(hw_payload));
	hw_payload.reset_cause = KFD_HW_EXCEPTION_ECC;
	hw_payload.memory_lost = 1;
	hw_payload.gpu_id = user_gpu_id;

	memset(&mem_payload, 0, sizeof(mem_payload));
	mem_payload.failure.imprecise = true;
	mem_payload.gpu_id = user_gpu_id;
	mem_payload.ErrorType = KFD_MEM_ERR_POISON_CONSUMED;

	id = KFD_FIRST_NONSIGNAL_EVENT_ID;

	rcu_read_lock();

	idr_for_each_entry_continue(&proc->event_idr, ev, id) {
		bool is_hw = ev->type == KFD_EVENT_TYPE_HW_EXCEPTION;
		bool is_mem = ev->type == KFD_EVENT_TYPE_MEMORY;

		if (!is_hw && !is_mem)
			continue;

		/* Store the payload for this event's type, then signal it. */
		spin_lock(&ev->lock);
		if (is_hw)
			ev->hw_exception_data = hw_payload;
		else
			ev->memory_exception_data = mem_payload;
		set_event(ev);
		spin_unlock(&ev->lock);
	}

	dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n",
		proc->lead_thread->comm, pasid);
	rcu_read_unlock();

	/* user application will handle SIGBUS signal */
	send_sig(SIGBUS, proc->lead_thread, 0);

out_unref:
	/* Release the reference obtained from the pasid lookup. */
	kfd_unref_process(proc);
}
1394

1395
Product

Resources

Company