GitHub Repository: torvalds/linux
Path: blob/master/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include "kfd_topology.h"
#include <linux/file.h>
#include <uapi/linux/kfd_ioctl.h>
#include <uapi/linux/kfd_sysfs.h>

#define MAX_WATCH_ADDRESSES 4

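/* kfd_dbg_ev_query_debug_event:
 * Report the next pending debug event for a debug-enabled process.  Queue
 * events are checked first, then per-device events, then process-level
 * events; bits in exception_clear_mask are cleared on report.  Returns 0 if
 * an event was reported, -EAGAIN if none is pending and -ENODATA if
 * debugging is not enabled.
 */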
int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
				 unsigned int *queue_id,
				 unsigned int *gpu_id,
				 uint64_t exception_clear_mask,
				 uint64_t *event_status)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	if (!(process && process->debug_trap_enabled))
		return -ENODATA;

	mutex_lock(&process->event_mutex);
	*event_status = 0;
	*queue_id = 0;
	*gpu_id = 0;

	/* find and report queue events */
	pqm = &process->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		uint64_t tmp = process->exception_enable_mask;

		if (!pqn->q)
			continue;

		tmp &= pqn->q->properties.exception_status;

		if (!tmp)
			continue;

		*event_status = pqn->q->properties.exception_status;
		*queue_id = pqn->q->properties.queue_id;
		*gpu_id = pqn->q->device->id;
		pqn->q->properties.exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* find and report device events */
	for (i = 0; i < process->n_pdds; i++) {
		struct kfd_process_device *pdd = process->pdds[i];
		uint64_t tmp = process->exception_enable_mask
						& pdd->exception_status;

		if (!tmp)
			continue;

		*event_status = pdd->exception_status;
		*gpu_id = pdd->dev->id;
		pdd->exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* report process events */
	if (process->exception_enable_mask & process->exception_status) {
		*event_status = process->exception_status;
		process->exception_status &= ~exception_clear_mask;
	}

out:
	mutex_unlock(&process->event_mutex);
	return *event_status ? 0 : -EAGAIN;
}

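/* Deferred work that pokes the debugger's polled event file by writing a
 * single byte, signalling that a subscribed exception was raised from a
 * context that could not write to the file directly.
 */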
void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	if (process->debug_trap_enabled && process->dbg_ev_file)
		kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* update process/device/queue exception status, write to descriptor
 * only if exception_status is enabled.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
		      struct kfd_process *process, struct kfd_node *dev,
		      unsigned int source_id, bool use_worker,
		      void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory\n");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
					pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}

/* set pending event queue entry from ring entry */
bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
				   unsigned int pasid,
				   uint32_t doorbell_id,
				   uint64_t trap_mask,
				   void *exception_data,
				   size_t exception_data_size)
{
	struct kfd_process *p;
	struct kfd_process_device *pdd = NULL;
	bool signaled_to_debugger_or_runtime = false;

	p = kfd_lookup_process_by_pasid(pasid, &pdd);

	if (!pdd)
		return false;

	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
			      exception_data, exception_data_size)) {
		struct process_queue_manager *pqm;
		struct process_queue_node *pqn;

		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
		    p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
			mutex_lock(&p->mutex);

			pqm = &p->pqm;
			list_for_each_entry(pqn, &pqm->queues,
					process_queue_list) {

				if (!(pqn->q && pqn->q->device == dev &&
				      pqn->q->doorbell_id == doorbell_id))
					continue;

				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
							      trap_mask);

				signaled_to_debugger_or_runtime = true;

				break;
			}

			mutex_unlock(&p->mutex);
		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
			kfd_evict_process_device(pdd);
			kfd_signal_vm_fault_event(pdd, NULL, exception_data);

			signaled_to_debugger_or_runtime = true;
		}
	} else {
		signaled_to_debugger_or_runtime = true;
	}

	kfd_unref_process(p);

	return signaled_to_debugger_or_runtime;
}

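/* kfd_dbg_send_exception_to_runtime:
 * Forward exceptions the debugger hands back to the HSA runtime: re-signal
 * memory violations on the matching device, release the runtime-enable
 * semaphore for EC_PROCESS_RUNTIME and pass any remaining reasons on to
 * kfd_send_exception_to_runtime().
 */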
int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
				      unsigned int dev_id,
				      unsigned int queue_id,
				      uint64_t error_reason)
{
	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
		struct kfd_process_device *pdd = NULL;
		struct kfd_hsa_memory_exception_data *data;
		int i;

		for (i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->id == dev_id) {
				pdd = p->pdds[i];
				break;
			}
		}

		if (!pdd)
			return -ENODEV;

		data = (struct kfd_hsa_memory_exception_data *)
						pdd->vm_fault_exc_data;

		kfd_evict_process_device(pdd);
		kfd_signal_vm_fault_event(pdd, NULL, data);
		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
	}

	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
		/*
		 * block should only happen after the debugger receives runtime
		 * enable notice.
		 */
		up(&p->runtime_enable_sema);
		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
	}

	if (error_reason)
		return kfd_send_exception_to_runtime(p, queue_id, error_reason);

	return 0;
}

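/* Apply or remove the CWSR debug workaround on a single queue.  The MQD is
 * updated through the device queue manager; enabling fails with -EBUSY if
 * the queue already carries a user CU mask.
 */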
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (!kfd_dbg_has_cwsr_workaround(q->device))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}

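/* Push the current per-process debug configuration (SPI debug control,
 * watch points and debug flags) to the MES scheduler.  The process context
 * BO is allocated on first use.
 */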
int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;
	struct amdgpu_device *adev = pdd->dev->adev;
	int r;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	if (!pdd->proc_ctx_cpu_ptr) {
		r = amdgpu_amdkfd_alloc_gtt_mem(adev,
				AMDGPU_MES_PROC_CTX_SIZE,
				&pdd->proc_ctx_bo,
				&pdd->proc_ctx_gpu_addr,
				&pdd->proc_ctx_cpu_ptr,
				false);
		if (r) {
			dev_err(adev->dev,
				"failed to allocate process context bo\n");
			return r;
		}
		memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
	}

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags, sq_trap_en);
}

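/* Watch point slots are a per-device resource shared by all debugged
 * processes; allocations are tracked in both the device and process
 * bitmasks under watch_points_lock.
 */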
#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
{
	int i;

	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;

	spin_lock(&pdd->dev->watch_points_lock);

	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
		/* device watchpoint in use so skip */
		if ((pdd->dev->alloc_watch_ids >> i) & 0x1)
			continue;

		pdd->alloc_watch_ids |= 0x1 << i;
		pdd->dev->alloc_watch_ids |= 0x1 << i;
		*watch_id = i;
		spin_unlock(&pdd->dev->watch_points_lock);
		return 0;
	}

	spin_unlock(&pdd->dev->watch_points_lock);

	return -ENOMEM;
}

static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	spin_lock(&pdd->dev->watch_points_lock);

	/* process owns device watch point so safe to clear */
	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
		pdd->dev->alloc_watch_ids &= ~(0x1 << watch_id);
	}

	spin_unlock(&pdd->dev->watch_points_lock);
}

static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	bool owns_watch_id = false;

	spin_lock(&pdd->dev->watch_points_lock);
	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
			((pdd->alloc_watch_ids >> watch_id) & 0x1);

	spin_unlock(&pdd->dev->watch_points_lock);

	return owns_watch_id;
}

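/* kfd_dbg_trap_clear_dev_address_watch:
 * Tear down one address watch point owned by the calling process: queues
 * are unmapped (non-MES) around the register clear, MES is updated
 * otherwise, and the watch ID slot is released.
 */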
int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
					 uint32_t watch_id)
{
	int r;

	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
		return -EINVAL;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r)
			return r;
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
							pdd->dev->adev,
							watch_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	kfd_dbg_clear_dev_watch_id(pdd, watch_id);

	return r;
}

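/* kfd_dbg_trap_set_dev_address_watch:
 * Allocate a watch ID and program the address watch registers on every XCC
 * instance of the device, then remap queues or refresh the MES debug state.
 */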
int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
				       uint64_t watch_address,
				       uint32_t watch_address_mask,
				       uint32_t *watch_id,
				       uint32_t watch_mode)
{
	int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
	uint32_t xcc_mask = pdd->dev->xcc_mask;

	if (r)
		return r;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r) {
			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
			return r;
		}
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	for_each_inst(xcc_id, xcc_mask)
		pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
				pdd->dev->adev,
				watch_address,
				watch_address_mask,
				*watch_id,
				watch_mode,
				pdd->dev->vm_info.last_vmid_kfd,
				xcc_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	/* HWS is broken so no point in HW rollback but release the watchpoint anyway */
	if (r)
		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);

	return 0;
}

static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
{
	int i, j;

	for (i = 0; i < target->n_pdds; i++)
		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
}

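/* kfd_dbg_trap_set_flags:
 * Validate that every device supports the requested precise memory/ALU
 * operation flags, then apply the new flags on all per-VMID capable
 * devices.  Devices already updated are rewound on failure and the
 * previous flags are returned through *flags.
 */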
int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
{
	uint32_t prev_flags = target->dbg_flags;
	int i, r = 0, rewind_count = 0;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_topology_device *topo_dev =
				kfd_topology_device_by_id(target->pdds[i]->dev->id);
		uint32_t caps = topo_dev->node_props.capability;

		if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) &&
		    (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}

		if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED) &&
		    (*flags & KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}
	}

	target->dbg_flags = *flags;
	*flags = prev_flags;
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
			continue;

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->dbg_flags = prev_flags;
			break;
		}

		rewind_count++;
	}

	/* Rewind flags */
	if (r) {
		target->dbg_flags = prev_flags;

		for (i = 0; i < rewind_count; i++) {
			struct kfd_process_device *pdd = target->pdds[i];

			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
				continue;

			if (!pdd->dev->kfd->shared_resources.enable_mes)
				debug_refresh_runlist(pdd->dev->dqm);
			else
				kfd_dbg_set_mes_debug_mode(pdd, true);
		}
	}

	return r;
}

/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count:
 *		If unwind == true, how far down the pdd list we need
 *		to unwind
 *		else: ignored
 */
void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	if (!unwind) {
		uint32_t flags = 0;
		int resume_count = resume_queues(target, 0, NULL);

		if (resume_count)
			pr_debug("Resumed %d queues\n", resume_count);

		cancel_work_sync(&target->debug_event_workarea);
		kfd_dbg_clear_process_address_watch(target);
		kfd_dbg_trap_set_wave_launch_mode(target, 0);

		kfd_dbg_trap_set_flags(target, &flags);
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind, and we have unwound the required
		 * enable calls on the pdd list, we need to stop now
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		kfd_process_set_trap_debug_flag(&pdd->qpd, false);

		/* GFX off is already disabled by debug activate if not RLC restore supported. */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
	}

	kfd_dbg_set_workaround(target, false);
}

static void kfd_dbg_clean_exception_status(struct kfd_process *target)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		kfd_process_drain_interrupts(pdd);

		pdd->exception_status = 0;
	}

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		pqn->q->properties.exception_status = 0;
	}

	target->exception_status = 0;
}

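/* kfd_dbg_trap_disable:
 * Detach the debugger from the target: deactivate trap handling (or defer
 * it to runtime enable), release the event file, drop the extra process
 * reference and clear all recorded exception state.
 */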
int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * Defer deactivation to runtime if runtime not enabled otherwise reset
	 * attached running target runtime state to enable for re-attach.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	cancel_work_sync(&target->debug_event_workarea);
	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_dbg_clean_exception_status(target);
	kfd_unref_process(target);

	return 0;
}

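/* kfd_dbg_trap_activate:
 * Enable trap handling on every device of the target: apply the CWSR
 * workaround, reserve a debug VMID where per-VMID debugging is not
 * supported, program the SPI debug registers and trap handler flag, and
 * refresh the runlist or MES state.  Any failure unwinds the devices that
 * were already enabled.
 */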
int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
							DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
		 * If RLC restore of debug registers is not supported and runtime enable
		 * hasn't done so already on ttmp setup request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
		 * the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
						target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
								pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		/*
		 * Setting the debug flag in the trap handler requires that the TMA has been
		 * allocated, which occurs during CWSR initialization.
		 * In the event that CWSR has not been initialized at this point, setting the
		 * flag will be called again during CWSR initialization if the target process
		 * is still debug enabled.
		 */
		kfd_process_set_trap_debug_flag(&pdd->qpd, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed, we need to disable on
	 * all GPUs so the enable is all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}

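/* kfd_dbg_trap_enable:
 * Attach the debugger to the target process: run per-device pre-checks,
 * take the event file descriptor, activate trap handling if the runtime is
 * already enabled and copy the runtime info back to user space.
 */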
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
					 kfd_dbg_has_cwsr_workaround(pdd->dev)))
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime if not runtime enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}

static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
						  uint32_t trap_override,
						  uint32_t trap_mask_request,
						  uint32_t *trap_mask_supported)
{
	int i = 0;

	*trap_mask_supported = 0xffffffff;

	for (i = 0; i < p->n_pdds; i++) {
		struct kfd_process_device *pdd = p->pdds[i];
		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
				pdd->dev->adev,
				trap_override,
				trap_mask_supported);

		if (err)
			return err;
	}

	if (trap_mask_request & ~*trap_mask_supported)
		return -EACCES;

	return 0;
}

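/* kfd_dbg_trap_set_wave_launch_override:
 * Validate the requested trap mask override against every device, then
 * program it per device and refresh the runlist or MES debug state.
 */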
int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
					uint32_t trap_override,
					uint32_t trap_mask_bits,
					uint32_t trap_mask_request,
					uint32_t *trap_mask_prev,
					uint32_t *trap_mask_supported)
{
	int r = 0, i;

	r = kfd_dbg_validate_trap_override_request(target,
						   trap_override,
						   trap_mask_request,
						   trap_mask_supported);

	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
				pdd->dev->adev,
				pdd->dev->vm_info.last_vmid_kfd,
				trap_override,
				trap_mask_bits,
				trap_mask_request,
				trap_mask_prev,
				pdd->spi_dbg_override);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}

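/* kfd_dbg_trap_set_wave_launch_mode:
 * Set the wave launch mode (normal, halt or debug) on every device of the
 * target process.
 */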
int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
				      uint8_t wave_launch_mode)
{
	int r = 0, i;

	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
	    wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
	    wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
		return -EINVAL;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
				pdd->dev->adev,
				wave_launch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}

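/* kfd_dbg_trap_query_exception_info:
 * Copy the details of one raised exception (queue, device or process
 * scope, selected by exception_code and source_id) to user space and
 * optionally clear it.
 */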
int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
		uint32_t source_id,
		uint32_t exception_code,
		bool clear_exception,
		void __user *info,
		uint32_t *info_size)
{
	bool found = false;
	int r = 0;
	uint32_t copy_size, actual_info_size = 0;
	uint64_t *exception_status_ptr = NULL;

	if (!target)
		return -EINVAL;

	if (!info || !info_size)
		return -EINVAL;

	mutex_lock(&target->event_mutex);

	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
		/* Per queue exceptions */
		struct queue *queue = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			struct kfd_process_device *pdd = target->pdds[i];
			struct qcm_process_device *qpd = &pdd->qpd;

			list_for_each_entry(queue, &qpd->queues_list, list) {
				if (!found && queue->properties.queue_id == source_id) {
					found = true;
					break;
				}
			}
			if (found)
				break;
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}
		exception_status_ptr = &queue->properties.exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
		/* Per device exceptions */
		struct kfd_process_device *pdd = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			pdd = target->pdds[i];
			if (pdd->dev->id == source_id) {
				found = true;
				break;
			}
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);

			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
				r = -EFAULT;
				goto out;
			}
			actual_info_size = pdd->vm_fault_exc_data_size;
			if (clear_exception) {
				kfree(pdd->vm_fault_exc_data);
				pdd->vm_fault_exc_data = NULL;
				pdd->vm_fault_exc_data_size = 0;
			}
		}
		exception_status_ptr = &pdd->exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
		/* Per process exceptions */
		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_PROCESS_RUNTIME) {
			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));

			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
				r = -EFAULT;
				goto out;
			}

			actual_info_size = sizeof(target->runtime_info);
		}

		exception_status_ptr = &target->exception_status;
	} else {
		pr_debug("Bad exception type [%i]\n", exception_code);
		r = -EINVAL;
		goto out;
	}

	*info_size = actual_info_size;
	if (clear_exception)
		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
out:
	mutex_unlock(&target->event_mutex);
	return r;
}

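/* kfd_dbg_trap_device_snapshot:
 * Fill one kfd_dbg_device_info_entry per device of the target and copy the
 * entries to user space, optionally clearing device exception bits.  The
 * actual number of devices and entry size are returned to the caller.
 */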
int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
		uint64_t exception_clear_mask,
		void __user *user_info,
		uint32_t *number_of_device_infos,
		uint32_t *entry_size)
{
	struct kfd_dbg_device_info_entry device_info;
	uint32_t tmp_entry_size, tmp_num_devices;
	int i, r = 0;

	if (!(target && user_info && number_of_device_infos && entry_size))
		return -EINVAL;

	tmp_entry_size = *entry_size;

	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
	*number_of_device_infos = target->n_pdds;
	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));

	if (!tmp_num_devices)
		return 0;

	memset(&device_info, 0, sizeof(device_info));

	mutex_lock(&target->event_mutex);

	/* Run over all pdd of the process */
	for (i = 0; i < tmp_num_devices; i++) {
		struct kfd_process_device *pdd = target->pdds[i];
		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);

		device_info.gpu_id = pdd->dev->id;
		device_info.exception_status = pdd->exception_status;
		device_info.lds_base = pdd->lds_base;
		device_info.lds_limit = pdd->lds_limit;
		device_info.scratch_base = pdd->scratch_base;
		device_info.scratch_limit = pdd->scratch_limit;
		device_info.gpuvm_base = pdd->gpuvm_base;
		device_info.gpuvm_limit = pdd->gpuvm_limit;
		device_info.location_id = topo_dev->node_props.location_id;
		device_info.vendor_id = topo_dev->node_props.vendor_id;
		device_info.device_id = topo_dev->node_props.device_id;
		device_info.revision_id = pdd->dev->adev->pdev->revision;
		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
		device_info.fw_version = pdd->dev->kfd->mec_fw_version;
		device_info.gfx_target_version =
			topo_dev->node_props.gfx_target_version;
		device_info.simd_count = topo_dev->node_props.simd_count;
		device_info.max_waves_per_simd =
			topo_dev->node_props.max_waves_per_simd;
		device_info.array_count = topo_dev->node_props.array_count;
		device_info.simd_arrays_per_engine =
			topo_dev->node_props.simd_arrays_per_engine;
		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
		device_info.capability = topo_dev->node_props.capability;
		device_info.debug_prop = topo_dev->node_props.debug_prop;

		if (exception_clear_mask)
			pdd->exception_status &= ~exception_clear_mask;

		if (copy_to_user(user_info, &device_info, *entry_size)) {
			r = -EFAULT;
			break;
		}

		user_info += tmp_entry_size;
	}

	mutex_unlock(&target->event_mutex);

	return r;
}

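/* kfd_dbg_set_enabled_debug_exception_mask:
 * Update the set of exceptions the debugger is subscribed to and poke the
 * event file if any already-raised exception matches the new mask.
 */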
void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
					uint64_t exception_set_mask)
{
	uint64_t found_mask = 0;
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	static const char write_data = '.';
	loff_t pos = 0;
	int i;

	mutex_lock(&target->event_mutex);

	found_mask |= target->exception_status;

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		found_mask |= pqn->q->properties.exception_status;
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		found_mask |= pdd->exception_status;
	}

	if (exception_set_mask & found_mask)
		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);

	target->exception_enable_mask = exception_set_mask;

	mutex_unlock(&target->event_mutex);
}