GitHub Repository: torvalds/linux
Path: blob/master/drivers/accel/ivpu/ivpu_pm.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020-2024 Intel Corporation
 */

#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/reboot.h>

#include "ivpu_coredump.h"
#include "ivpu_drv.h"
#include "ivpu_fw.h"
#include "ivpu_fw_log.h"
#include "ivpu_hw.h"
#include "ivpu_ipc.h"
#include "ivpu_job.h"
#include "ivpu_jsm_msg.h"
#include "ivpu_mmu.h"
#include "ivpu_ms.h"
#include "ivpu_pm.h"
#include "ivpu_trace.h"
#include "vpu_boot_api.h"

static bool ivpu_disable_recovery;
#if IS_ENABLED(CONFIG_DRM_ACCEL_IVPU_DEBUG)
module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644);
MODULE_PARM_DESC(disable_recovery, "Disables recovery when NPU hang is detected");
#endif

static unsigned long ivpu_tdr_timeout_ms;
module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");

static unsigned long ivpu_inference_timeout_ms;
module_param_named(inference_timeout_ms, ivpu_inference_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(inference_timeout_ms, "Inference maximum duration, in milliseconds, 0 - default");

#define PM_RESCHEDULE_LIMIT 5

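/*
 * Cold boot preparation: drop all command queue contexts and IPC state,
 * reset the firmware log and reload the firmware image, so the next
 * ivpu_boot() starts the NPU from the cold boot entry point.
 */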
static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;

	ivpu_cmdq_reset_all_contexts(vdev);
	ivpu_ipc_reset(vdev);
	ivpu_fw_log_reset(vdev);
	ivpu_fw_load(vdev);
	fw->last_heartbeat = 0;

	ivpu_dbg(vdev, FW_BOOT, "Cold boot entry point 0x%llx", vdev->fw->cold_boot_entry_point);
	fw->next_boot_mode = VPU_BOOT_TYPE_COLDBOOT;
}

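/*
 * Warm boot preparation: resume from the save/restore return address that
 * the firmware published in the boot params. Falls back to a cold boot
 * when no such entry point is available.
 */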
static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;
	struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem_bp);

	fw->warm_boot_entry_point = bp->save_restore_ret_address;
	if (!fw->warm_boot_entry_point) {
		ivpu_pm_prepare_cold_boot(vdev);
		return;
	}

	ivpu_dbg(vdev, FW_BOOT, "Warm boot entry point 0x%llx", fw->warm_boot_entry_point);
	fw->next_boot_mode = VPU_BOOT_TYPE_WARMBOOT;
}

static int ivpu_suspend(struct ivpu_device *vdev)
{
	int ret;

	ivpu_prepare_for_reset(vdev);

	ret = ivpu_shutdown(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to shutdown NPU: %d\n", ret);

	return ret;
}

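/*
 * Bring the device back to D0, re-enable the MMU and boot the firmware.
 * If a warm boot attempt fails, the error path switches to a cold boot
 * and retries once via the retry label (a cold boot failure is final).
 */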
static int ivpu_resume(struct ivpu_device *vdev)
{
	int ret;

retry:
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0);
	pci_restore_state(to_pci_dev(vdev->drm.dev));

	ret = ivpu_hw_power_up(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to power up HW: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_mmu_enable(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume MMU: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_boot(vdev);
	if (ret)
		goto err_mmu_disable;

	return 0;

err_mmu_disable:
	ivpu_mmu_disable(vdev);
err_power_down:
	ivpu_hw_power_down(vdev);
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);

	if (ivpu_fw_is_warm_boot(vdev)) {
		ivpu_pm_prepare_cold_boot(vdev);
		goto retry;
	} else {
		ivpu_err(vdev, "Failed to resume the FW: %d\n", ret);
	}

	return ret;
}

static void ivpu_pm_reset_begin(struct ivpu_device *vdev)
{
	pm_runtime_disable(vdev->drm.dev);

	atomic_inc(&vdev->pm->reset_counter);
	atomic_set(&vdev->pm->reset_pending, 1);
	down_write(&vdev->pm->reset_lock);
}

static void ivpu_pm_reset_complete(struct ivpu_device *vdev)
{
	int ret;

	ivpu_pm_prepare_cold_boot(vdev);
	ivpu_jobs_abort_all(vdev);
	ivpu_ms_cleanup_all(vdev);

	ret = ivpu_resume(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
		pm_runtime_set_suspended(vdev->drm.dev);
	} else {
		pm_runtime_set_active(vdev->drm.dev);
	}

	up_write(&vdev->pm->reset_lock);
	atomic_set(&vdev->pm->reset_pending, 0);

	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_enable(vdev->drm.dev);
}

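/*
 * Recovery worker, queued from ivpu_pm_trigger_recovery(): collects a
 * firmware state dump and coredump while the device is still powered,
 * then performs a full reset (cold boot, abort of all jobs) and notifies
 * user space with an IVPU_PM_EVENT=IVPU_RECOVER uevent.
 */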
static void ivpu_pm_recovery_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work);
	struct ivpu_device *vdev = pm->vdev;
	char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL};

	ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter));

	ivpu_pm_reset_begin(vdev);

	if (!pm_runtime_status_suspended(vdev->drm.dev)) {
		ivpu_jsm_state_dump(vdev);
		ivpu_dev_coredump(vdev);
		ivpu_suspend(vdev);
	}

	ivpu_pm_reset_complete(vdev);

	kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
}

void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason)
{
	ivpu_err(vdev, "Recovery triggered by %s\n", reason);

	if (ivpu_disable_recovery) {
		ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n");
		return;
	}

	/* Trigger recovery if it's not in progress */
	if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) {
		ivpu_hw_diagnose_failure(vdev);
		ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */
		queue_work(system_dfl_wq, &vdev->pm->recovery_work);
	}
}

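/*
 * Hang detection (TDR): when the delayed work fires, the job is considered
 * hung if the firmware heartbeat has not advanced since the last check or
 * if the number of checks exceeds inference_timeout_ms / tdr_timeout_ms.
 * Otherwise the heartbeat is stored and detection is re-armed.
 */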
static void ivpu_job_timeout_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
	struct ivpu_device *vdev = pm->vdev;
	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
	unsigned long inference_timeout_ms = ivpu_inference_timeout_ms ? ivpu_inference_timeout_ms :
					     vdev->timeout.inference;
	u64 inference_max_retries;
	u64 heartbeat;

	if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
		ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n");
		goto recovery;
	}

	inference_max_retries = DIV_ROUND_UP(inference_timeout_ms, timeout_ms);
	if (atomic_fetch_inc(&vdev->job_timeout_counter) >= inference_max_retries) {
		ivpu_err(vdev, "Job timeout detected, heartbeat limit (%lld) exceeded\n",
			 inference_max_retries);
		goto recovery;
	}

	vdev->fw->last_heartbeat = heartbeat;
	ivpu_start_job_timeout_detection(vdev);
	return;

recovery:
	atomic_set(&vdev->job_timeout_counter, 0);
	ivpu_pm_trigger_recovery(vdev, "TDR");
}

void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
{
	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;

	/* No-op if already queued */
	queue_delayed_work(system_percpu_wq, &vdev->pm->job_timeout_work,
			   msecs_to_jiffies(timeout_ms));
}

void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
{
	cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
	atomic_set(&vdev->job_timeout_counter, 0);
}

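/*
 * System suspend callback: waits up to the TDR timeout for the NPU to go
 * idle (returning -EBUSY otherwise), enters D0i3 and prepares a warm boot
 * so state can be restored in ivpu_pm_resume_cb().
 */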
int ivpu_pm_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	unsigned long timeout;

	trace_pm("suspend");
	ivpu_dbg(vdev, PM, "Suspend..\n");

	timeout = jiffies + msecs_to_jiffies(vdev->timeout.tdr);
	while (!ivpu_hw_is_idle(vdev)) {
		cond_resched();
		if (time_after_eq(jiffies, timeout)) {
			ivpu_err(vdev, "Failed to enter idle on system suspend\n");
			return -EBUSY;
		}
	}

	ivpu_jsm_pwr_d0i3_enter(vdev);

	ivpu_suspend(vdev);
	ivpu_pm_prepare_warm_boot(vdev);

	ivpu_dbg(vdev, PM, "Suspend done.\n");
	trace_pm("suspend done");

	return 0;
}

int ivpu_pm_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	trace_pm("resume");
	ivpu_dbg(vdev, PM, "Resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to resume: %d\n", ret);

	ivpu_dbg(vdev, PM, "Resume done.\n");
	trace_pm("resume done");

	return ret;
}

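/*
 * Runtime PM callbacks. Runtime suspend prefers a warm boot on the next
 * resume; if the NPU was not idle or the D0i3 entry request failed, it
 * captures a coredump and forces a cold boot instead.
 */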
int ivpu_pm_runtime_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret, ret_d0i3;
	bool is_idle;

	drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
	drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work));

	trace_pm("runtime suspend");
	ivpu_dbg(vdev, PM, "Runtime suspend..\n");

	ivpu_mmu_disable(vdev);

	is_idle = ivpu_hw_is_idle(vdev) || vdev->pm->dct_active_percent;
	if (!is_idle)
		ivpu_err(vdev, "NPU is not idle before autosuspend\n");

	ret_d0i3 = ivpu_jsm_pwr_d0i3_enter(vdev);
	if (ret_d0i3)
		ivpu_err(vdev, "Failed to prepare for d0i3: %d\n", ret_d0i3);

	ret = ivpu_suspend(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to suspend NPU: %d\n", ret);

	if (!is_idle || ret_d0i3) {
		ivpu_err(vdev, "Forcing cold boot due to previous errors\n");
		atomic_inc(&vdev->pm->reset_counter);
		ivpu_dev_coredump(vdev);
		ivpu_pm_prepare_cold_boot(vdev);
	} else {
		ivpu_pm_prepare_warm_boot(vdev);
	}

	ivpu_dbg(vdev, PM, "Runtime suspend done.\n");
	trace_pm("runtime suspend done");

	return 0;
}

int ivpu_pm_runtime_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	trace_pm("runtime resume");
	ivpu_dbg(vdev, PM, "Runtime resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);

	ivpu_dbg(vdev, PM, "Runtime resume done.\n");
	trace_pm("runtime resume done");

	return ret;
}

int ivpu_rpm_get(struct ivpu_device *vdev)
{
	int ret;

	ret = pm_runtime_resume_and_get(vdev->drm.dev);
	if (ret < 0) {
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
		pm_runtime_set_suspended(vdev->drm.dev);
	}

	return ret;
}

void ivpu_rpm_put(struct ivpu_device *vdev)
{
	pm_runtime_put_autosuspend(vdev->drm.dev);
}

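/*
 * PCI reset callbacks: quiesce and reset the NPU before the device is
 * reset, then bring it back up via the common ivpu_pm_reset_complete()
 * path once the reset is done.
 */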
void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Pre-reset..\n");

	ivpu_pm_reset_begin(vdev);

	if (!pm_runtime_status_suspended(vdev->drm.dev)) {
		ivpu_prepare_for_reset(vdev);
		ivpu_hw_reset(vdev);
	}

	ivpu_dbg(vdev, PM, "Pre-reset done.\n");
}

void ivpu_pm_reset_done_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Post-reset..\n");

	ivpu_pm_reset_complete(vdev);

	ivpu_dbg(vdev, PM, "Post-reset done.\n");
}

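/*
 * Runtime PM setup: an autosuspend delay of -1 keeps the device from being
 * autosuspended, which is used when recovery is disabled via the
 * disable_recovery module parameter.
 */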
void ivpu_pm_init(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;
	struct ivpu_pm_info *pm = vdev->pm;
	int delay;

	pm->vdev = vdev;

	init_rwsem(&pm->reset_lock);
	atomic_set(&pm->reset_pending, 0);
	atomic_set(&pm->reset_counter, 0);

	INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
	INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);

	if (ivpu_disable_recovery)
		delay = -1;
	else
		delay = vdev->timeout.autosuspend;

	pm_runtime_use_autosuspend(dev);
	pm_runtime_set_autosuspend_delay(dev, delay);
	pm_runtime_set_active(dev);

	ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
}

void ivpu_pm_disable_recovery(struct ivpu_device *vdev)
{
	drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work));
	disable_work_sync(&vdev->pm->recovery_work);
}

void ivpu_pm_enable(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;

	pm_runtime_allow(dev);
	pm_runtime_put_autosuspend(dev);
}

void ivpu_pm_disable(struct ivpu_device *vdev)
{
	pm_runtime_get_noresume(vdev->drm.dev);
	pm_runtime_forbid(vdev->drm.dev);
}

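/*
 * DCT (duty-cycle throttling): the requested active percentage of
 * DCT_PERIOD_US is translated into active (D0) and inactive (D0i2)
 * time slices and sent to the firmware over JSM.
 */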
int ivpu_pm_dct_init(struct ivpu_device *vdev)
{
	if (vdev->pm->dct_active_percent)
		return ivpu_pm_dct_enable(vdev, vdev->pm->dct_active_percent);

	return 0;
}

int ivpu_pm_dct_enable(struct ivpu_device *vdev, u8 active_percent)
{
	u32 active_us, inactive_us;
	int ret;

	if (active_percent == 0 || active_percent > 100)
		return -EINVAL;

	active_us = (DCT_PERIOD_US * active_percent) / 100;
	inactive_us = DCT_PERIOD_US - active_us;

	vdev->pm->dct_active_percent = active_percent;

	ivpu_dbg(vdev, PM, "DCT requested %u%% (D0: %uus, D0i2: %uus)\n",
		 active_percent, active_us, inactive_us);

	ret = ivpu_jsm_dct_enable(vdev, active_us, inactive_us);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to enable DCT: %d\n", ret);
		return ret;
	}

	return 0;
}

int ivpu_pm_dct_disable(struct ivpu_device *vdev)
{
	int ret;

	vdev->pm->dct_active_percent = 0;

	ivpu_dbg(vdev, PM, "DCT requested to be disabled\n");

	ret = ivpu_jsm_dct_disable(vdev);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to disable DCT: %d\n", ret);
		return ret;
	}

	return 0;
}

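/*
 * IRQ-triggered DCT work: reads the pending DCT request via
 * ivpu_hw_btrs_dct_get_request(), enables or disables throttling
 * accordingly and acknowledges the new active level back to the HW.
 */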
void ivpu_pm_irq_dct_work_fn(struct work_struct *work)
{
	struct ivpu_device *vdev = container_of(work, struct ivpu_device, irq_dct_work);
	bool enable;
	int ret;

	if (ivpu_hw_btrs_dct_get_request(vdev, &enable))
		return;

	if (enable)
		ret = ivpu_pm_dct_enable(vdev, DCT_DEFAULT_ACTIVE_PERCENT);
	else
		ret = ivpu_pm_dct_disable(vdev);

	if (!ret) {
		/* Convert percent to U1.7 format */
		u8 val = DIV_ROUND_CLOSEST(vdev->pm->dct_active_percent * 128, 100);

		ivpu_hw_btrs_dct_set_status(vdev, enable, val);
	}
}