GitHub Repository: torvalds/linux
Path: blob/master/drivers/accel/ivpu/ivpu_pm.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020-2024 Intel Corporation
 */

#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/reboot.h>

#include "ivpu_coredump.h"
#include "ivpu_drv.h"
#include "ivpu_fw.h"
#include "ivpu_fw_log.h"
#include "ivpu_hw.h"
#include "ivpu_ipc.h"
#include "ivpu_job.h"
#include "ivpu_jsm_msg.h"
#include "ivpu_mmu.h"
#include "ivpu_ms.h"
#include "ivpu_pm.h"
#include "ivpu_trace.h"
#include "vpu_boot_api.h"

static bool ivpu_disable_recovery;
#if IS_ENABLED(CONFIG_DRM_ACCEL_IVPU_DEBUG)
module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644);
MODULE_PARM_DESC(disable_recovery, "Disables recovery when NPU hang is detected");
#endif

static unsigned long ivpu_tdr_timeout_ms;
module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");

static unsigned long ivpu_inference_timeout_ms;
module_param_named(inference_timeout_ms, ivpu_inference_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(inference_timeout_ms, "Inference maximum duration, in milliseconds, 0 - default");

#define PM_RESCHEDULE_LIMIT 5

static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
{
        struct ivpu_fw_info *fw = vdev->fw;

        ivpu_cmdq_reset_all_contexts(vdev);
        ivpu_ipc_reset(vdev);
        ivpu_fw_log_reset(vdev);
        ivpu_fw_load(vdev);
        fw->entry_point = fw->cold_boot_entry_point;
        fw->last_heartbeat = 0;
}

static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
{
        struct ivpu_fw_info *fw = vdev->fw;
        struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem);

        if (!bp->save_restore_ret_address) {
                ivpu_pm_prepare_cold_boot(vdev);
                return;
        }

        ivpu_dbg(vdev, FW_BOOT, "Save/restore entry point %llx", bp->save_restore_ret_address);
        fw->entry_point = bp->save_restore_ret_address;
}

static int ivpu_suspend(struct ivpu_device *vdev)
{
        int ret;

        ivpu_prepare_for_reset(vdev);

        ret = ivpu_shutdown(vdev);
        if (ret)
                ivpu_err(vdev, "Failed to shutdown NPU: %d\n", ret);

        return ret;
}

static int ivpu_resume(struct ivpu_device *vdev)
{
        int ret;

retry:
        pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0);
        pci_restore_state(to_pci_dev(vdev->drm.dev));

        ret = ivpu_hw_power_up(vdev);
        if (ret) {
                ivpu_err(vdev, "Failed to power up HW: %d\n", ret);
                goto err_power_down;
        }

        ret = ivpu_mmu_enable(vdev);
        if (ret) {
                ivpu_err(vdev, "Failed to resume MMU: %d\n", ret);
                goto err_power_down;
        }

        ret = ivpu_boot(vdev);
        if (ret)
                goto err_mmu_disable;

        return 0;

err_mmu_disable:
        ivpu_mmu_disable(vdev);
err_power_down:
        ivpu_hw_power_down(vdev);
        pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);

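        /* If a warm resume failed, fall back to a cold boot and retry once */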
        if (!ivpu_fw_is_cold_boot(vdev)) {
                ivpu_pm_prepare_cold_boot(vdev);
                goto retry;
        } else {
                ivpu_err(vdev, "Failed to resume the FW: %d\n", ret);
        }

        return ret;
}

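/*
 * Reset bracket shared by NPU recovery and the PCI reset callbacks: runtime
 * PM is disabled and pm->reset_lock is held for writing until the device has
 * been booted again in ivpu_pm_reset_complete().
 */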
static void ivpu_pm_reset_begin(struct ivpu_device *vdev)
{
        pm_runtime_disable(vdev->drm.dev);

        atomic_inc(&vdev->pm->reset_counter);
        atomic_set(&vdev->pm->reset_pending, 1);
        down_write(&vdev->pm->reset_lock);
}

static void ivpu_pm_reset_complete(struct ivpu_device *vdev)
{
        int ret;

        ivpu_pm_prepare_cold_boot(vdev);
        ivpu_jobs_abort_all(vdev);
        ivpu_ms_cleanup_all(vdev);

        ret = ivpu_resume(vdev);
        if (ret) {
                ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
                pm_runtime_set_suspended(vdev->drm.dev);
        } else {
                pm_runtime_set_active(vdev->drm.dev);
        }

        up_write(&vdev->pm->reset_lock);
        atomic_set(&vdev->pm->reset_pending, 0);

        pm_runtime_mark_last_busy(vdev->drm.dev);
        pm_runtime_enable(vdev->drm.dev);
}

static void ivpu_pm_recovery_work(struct work_struct *work)
{
        struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work);
        struct ivpu_device *vdev = pm->vdev;
        char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL};

        ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter));

        ivpu_pm_reset_begin(vdev);

        if (!pm_runtime_status_suspended(vdev->drm.dev)) {
                ivpu_jsm_state_dump(vdev);
                ivpu_dev_coredump(vdev);
                ivpu_suspend(vdev);
        }

        ivpu_pm_reset_complete(vdev);

        kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
}

void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason)
{
        ivpu_err(vdev, "Recovery triggered by %s\n", reason);

        if (ivpu_disable_recovery) {
                ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n");
                return;
        }

        /* Trigger recovery if it's not in progress */
        if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) {
                ivpu_hw_diagnose_failure(vdev);
                ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */
                queue_work(system_unbound_wq, &vdev->pm->recovery_work);
        }
}

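/*
 * Job hang detection (TDR): recover immediately if the FW heartbeat stopped
 * advancing; while it still advances, re-arm the timer and only give up once
 * the inference timeout budget (inference_timeout_ms / tdr_timeout_ms
 * re-arms) is exhausted.
 */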
static void ivpu_job_timeout_work(struct work_struct *work)
{
        struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
        struct ivpu_device *vdev = pm->vdev;
        unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
        unsigned long inference_timeout_ms = ivpu_inference_timeout_ms ? ivpu_inference_timeout_ms :
                                             vdev->timeout.inference;
        u64 inference_max_retries;
        u64 heartbeat;

        if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
                ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n");
                goto recovery;
        }

        inference_max_retries = DIV_ROUND_UP(inference_timeout_ms, timeout_ms);
        if (atomic_fetch_inc(&vdev->job_timeout_counter) >= inference_max_retries) {
                ivpu_err(vdev, "Job timeout detected, heartbeat limit (%lld) exceeded\n",
                         inference_max_retries);
                goto recovery;
        }

        vdev->fw->last_heartbeat = heartbeat;
        ivpu_start_job_timeout_detection(vdev);
        return;

recovery:
        atomic_set(&vdev->job_timeout_counter, 0);
        ivpu_pm_trigger_recovery(vdev, "TDR");
}

void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
{
        unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;

        /* No-op if already queued */
        queue_delayed_work(system_wq, &vdev->pm->job_timeout_work, msecs_to_jiffies(timeout_ms));
}

void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
{
        cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
        atomic_set(&vdev->job_timeout_counter, 0);
}

int ivpu_pm_suspend_cb(struct device *dev)
{
        struct drm_device *drm = dev_get_drvdata(dev);
        struct ivpu_device *vdev = to_ivpu_device(drm);
        unsigned long timeout;

        trace_pm("suspend");
        ivpu_dbg(vdev, PM, "Suspend..\n");

        timeout = jiffies + msecs_to_jiffies(vdev->timeout.tdr);
        while (!ivpu_hw_is_idle(vdev)) {
                cond_resched();
                if (time_after_eq(jiffies, timeout)) {
                        ivpu_err(vdev, "Failed to enter idle on system suspend\n");
                        return -EBUSY;
                }
        }

        ivpu_jsm_pwr_d0i3_enter(vdev);

        ivpu_suspend(vdev);
        ivpu_pm_prepare_warm_boot(vdev);

        ivpu_dbg(vdev, PM, "Suspend done.\n");
        trace_pm("suspend done");

        return 0;
}

int ivpu_pm_resume_cb(struct device *dev)
{
        struct drm_device *drm = dev_get_drvdata(dev);
        struct ivpu_device *vdev = to_ivpu_device(drm);
        int ret;

        trace_pm("resume");
        ivpu_dbg(vdev, PM, "Resume..\n");

        ret = ivpu_resume(vdev);
        if (ret)
                ivpu_err(vdev, "Failed to resume: %d\n", ret);

        ivpu_dbg(vdev, PM, "Resume done.\n");
        trace_pm("resume done");

        return ret;
}

int ivpu_pm_runtime_suspend_cb(struct device *dev)
{
        struct drm_device *drm = dev_get_drvdata(dev);
        struct ivpu_device *vdev = to_ivpu_device(drm);
        int ret, ret_d0i3;
        bool is_idle;

        drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
        drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work));

        trace_pm("runtime suspend");
        ivpu_dbg(vdev, PM, "Runtime suspend..\n");

        ivpu_mmu_disable(vdev);

        is_idle = ivpu_hw_is_idle(vdev) || vdev->pm->dct_active_percent;
        if (!is_idle)
                ivpu_err(vdev, "NPU is not idle before autosuspend\n");

        ret_d0i3 = ivpu_jsm_pwr_d0i3_enter(vdev);
        if (ret_d0i3)
                ivpu_err(vdev, "Failed to prepare for d0i3: %d\n", ret_d0i3);

        ret = ivpu_suspend(vdev);
        if (ret)
                ivpu_err(vdev, "Failed to suspend NPU: %d\n", ret);

        if (!is_idle || ret_d0i3) {
                ivpu_err(vdev, "Forcing cold boot due to previous errors\n");
                atomic_inc(&vdev->pm->reset_counter);
                ivpu_dev_coredump(vdev);
                ivpu_pm_prepare_cold_boot(vdev);
        } else {
                ivpu_pm_prepare_warm_boot(vdev);
        }

        ivpu_dbg(vdev, PM, "Runtime suspend done.\n");
        trace_pm("runtime suspend done");

        return 0;
}

int ivpu_pm_runtime_resume_cb(struct device *dev)
{
        struct drm_device *drm = dev_get_drvdata(dev);
        struct ivpu_device *vdev = to_ivpu_device(drm);
        int ret;

        trace_pm("runtime resume");
        ivpu_dbg(vdev, PM, "Runtime resume..\n");

        ret = ivpu_resume(vdev);
        if (ret)
                ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);

        ivpu_dbg(vdev, PM, "Runtime resume done.\n");
        trace_pm("runtime resume done");

        return ret;
}

int ivpu_rpm_get(struct ivpu_device *vdev)
{
        int ret;

        ret = pm_runtime_resume_and_get(vdev->drm.dev);
        if (ret < 0) {
                ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
                pm_runtime_set_suspended(vdev->drm.dev);
        }

        return ret;
}

void ivpu_rpm_put(struct ivpu_device *vdev)
{
        pm_runtime_mark_last_busy(vdev->drm.dev);
        pm_runtime_put_autosuspend(vdev->drm.dev);
}

void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev)
{
        struct ivpu_device *vdev = pci_get_drvdata(pdev);

        ivpu_dbg(vdev, PM, "Pre-reset..\n");

        ivpu_pm_reset_begin(vdev);

        if (!pm_runtime_status_suspended(vdev->drm.dev)) {
                ivpu_prepare_for_reset(vdev);
                ivpu_hw_reset(vdev);
        }

        ivpu_dbg(vdev, PM, "Pre-reset done.\n");
}

void ivpu_pm_reset_done_cb(struct pci_dev *pdev)
{
        struct ivpu_device *vdev = pci_get_drvdata(pdev);

        ivpu_dbg(vdev, PM, "Post-reset..\n");

        ivpu_pm_reset_complete(vdev);

        ivpu_dbg(vdev, PM, "Post-reset done.\n");
}

void ivpu_pm_init(struct ivpu_device *vdev)
{
        struct device *dev = vdev->drm.dev;
        struct ivpu_pm_info *pm = vdev->pm;
        int delay;

        pm->vdev = vdev;

        init_rwsem(&pm->reset_lock);
        atomic_set(&pm->reset_pending, 0);
        atomic_set(&pm->reset_counter, 0);

        INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
        INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);

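        /* A negative autosuspend delay keeps runtime PM from suspending the device */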
        if (ivpu_disable_recovery)
                delay = -1;
        else
                delay = vdev->timeout.autosuspend;

        pm_runtime_use_autosuspend(dev);
        pm_runtime_set_autosuspend_delay(dev, delay);
        pm_runtime_set_active(dev);

        ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
}

void ivpu_pm_disable_recovery(struct ivpu_device *vdev)
{
        drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work));
        disable_work_sync(&vdev->pm->recovery_work);
}

void ivpu_pm_enable(struct ivpu_device *vdev)
{
        struct device *dev = vdev->drm.dev;

        pm_runtime_allow(dev);
        pm_runtime_mark_last_busy(dev);
        pm_runtime_put_autosuspend(dev);
}

void ivpu_pm_disable(struct ivpu_device *vdev)
{
        pm_runtime_get_noresume(vdev->drm.dev);
        pm_runtime_forbid(vdev->drm.dev);
}

int ivpu_pm_dct_init(struct ivpu_device *vdev)
{
        if (vdev->pm->dct_active_percent)
                return ivpu_pm_dct_enable(vdev, vdev->pm->dct_active_percent);

        return 0;
}

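/*
 * Duty cycle throttling (DCT): each DCT_PERIOD_US window is split into an
 * active (D0) slice and an inactive (D0i2) slice according to active_percent.
 */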
int ivpu_pm_dct_enable(struct ivpu_device *vdev, u8 active_percent)
{
        u32 active_us, inactive_us;
        int ret;

        if (active_percent == 0 || active_percent > 100)
                return -EINVAL;

        active_us = (DCT_PERIOD_US * active_percent) / 100;
        inactive_us = DCT_PERIOD_US - active_us;

        vdev->pm->dct_active_percent = active_percent;

        ivpu_dbg(vdev, PM, "DCT requested %u%% (D0: %uus, D0i2: %uus)\n",
                 active_percent, active_us, inactive_us);

        ret = ivpu_jsm_dct_enable(vdev, active_us, inactive_us);
        if (ret) {
                ivpu_err_ratelimited(vdev, "Failed to enable DCT: %d\n", ret);
                return ret;
        }

        return 0;
}

int ivpu_pm_dct_disable(struct ivpu_device *vdev)
{
        int ret;

        vdev->pm->dct_active_percent = 0;

        ivpu_dbg(vdev, PM, "DCT requested to be disabled\n");

        ret = ivpu_jsm_dct_disable(vdev);
        if (ret) {
                ivpu_err_ratelimited(vdev, "Failed to disable DCT: %d\n", ret);
                return ret;
        }

        return 0;
}

void ivpu_pm_irq_dct_work_fn(struct work_struct *work)
{
        struct ivpu_device *vdev = container_of(work, struct ivpu_device, irq_dct_work);
        bool enable;
        int ret;

        if (ivpu_hw_btrs_dct_get_request(vdev, &enable))
                return;

        if (enable)
                ret = ivpu_pm_dct_enable(vdev, DCT_DEFAULT_ACTIVE_PERCENT);
        else
                ret = ivpu_pm_dct_disable(vdev);

        if (!ret)
                ivpu_hw_btrs_dct_set_status(vdev, enable, vdev->pm->dct_active_percent);
}