// SPDX-License-Identifier: GPL-2.0
/*
 * cpuidle-pseries - idle state cpuidle driver.
 * Adapted from drivers/idle/intel_idle.c and
 * drivers/acpi/processor_idle.c
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/moduleparam.h>
#include <linux/cpuidle.h>
#include <linux/cpu.h>
#include <linux/notifier.h>

#include <asm/paca.h>
#include <asm/reg.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/runlatch.h>
#include <asm/idle.h>
#include <asm/plpar_wrappers.h>
#include <asm/rtas.h>
#include <asm/time.h>

static struct cpuidle_driver pseries_idle_driver = {
	.name = "pseries_idle",
	.owner = THIS_MODULE,
};

static int max_idle_state __read_mostly;
static struct cpuidle_state *cpuidle_state_table __read_mostly;
static u64 snooze_timeout __read_mostly;
static bool snooze_timeout_en __read_mostly;

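/*
 * Snooze: busy-poll at low SMT thread priority until a task needs the
 * CPU or the snooze timeout (set up in pseries_idle_probe()) expires,
 * then return to the cpuidle core.
 */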
static __cpuidle
int snooze_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv,
		int index)
{
	u64 snooze_exit_time;

	set_thread_flag(TIF_POLLING_NRFLAG);

	pseries_idle_prolog();
	raw_local_irq_enable();
	snooze_exit_time = get_tb() + snooze_timeout;
	dev->poll_time_limit = false;

	while (!need_resched()) {
		HMT_low();
		HMT_very_low();
		if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) {
			/*
			 * Task has not woken up but we are exiting the polling
			 * loop anyway. Require a barrier after polling is
			 * cleared to order subsequent test of need_resched().
			 */
			dev->poll_time_limit = true;
			clear_thread_flag(TIF_POLLING_NRFLAG);
			smp_mb();
			break;
		}
	}

	HMT_medium();
	clear_thread_flag(TIF_POLLING_NRFLAG);

	raw_local_irq_disable();

	pseries_idle_epilog();

	return index;
}

static __cpuidle void check_and_cede_processor(void)
{
	/*
	 * Ensure our interrupt state is properly tracked. This also
	 * checks whether an interrupt occurred while we were
	 * soft-disabled; if so, we skip the cede.
	 */
	if (prep_irq_for_idle()) {
		cede_processor();
#ifdef CONFIG_TRACE_IRQFLAGS
		/* Ensure that H_CEDE returns with IRQs on */
		if (WARN_ON(!(mfmsr() & MSR_EE)))
			__hard_irq_enable();
#endif
	}
}

/*
 * XCEDE: Extended CEDE states discovered through the
 * "ibm,get-system-parameter" RTAS call with the token
 * CEDE_LATENCY_TOKEN
 */

/*
 * Section 7.3.16 System Parameters Option of PAPR version 2.8.1 has a
 * table with all the parameters to ibm,get-system-parameter.
 * CEDE_LATENCY_TOKEN corresponds to the token value for Cede Latency
 * Settings Information.
 */
#define CEDE_LATENCY_TOKEN	45

/*
 * If the platform supports the cede latency settings information system
 * parameter it must provide the following information in the NULL-terminated
 * parameter string:
 *
 * a. The first byte is the length “N” of each cede latency setting record
 *    minus one (zero indicates a length of 1 byte).
 *
 * b. For each supported cede latency setting, a cede latency setting record
 *    consisting of the first “N” bytes as per the following table.
 *
 *	-----------------------------
 *	| Field           | Field   |
 *	| Name            | Length  |
 *	-----------------------------
 *	| Cede Latency    | 1 Byte  |
 *	| Specifier Value |         |
 *	-----------------------------
 *	| Maximum wakeup  |         |
 *	| latency in      | 8 Bytes |
 *	| tb-ticks        |         |
 *	-----------------------------
 *	| Responsive to   |         |
 *	| external        | 1 Byte  |
 *	| interrupts      |         |
 *	-----------------------------
 *
 * This version has cede latency record size = 10.
 *
 * The structure xcede_latency_payload represents a) and b) with
 * xcede_latency_record representing the table in b).
 *
 * xcede_latency_parameter is what gets returned by the
 * ibm,get-system-parameter RTAS call when made with
 * CEDE_LATENCY_TOKEN.
 *
 * These structures are only used to represent the data obtained by the RTAS
 * call. The data is in big-endian.
 */
struct xcede_latency_record {
	u8	hint;
	__be64	latency_ticks;
	u8	wake_on_irqs;
} __packed;

// Make space for 16 records, which "should be enough".
struct xcede_latency_payload {
	u8 record_size;
	struct xcede_latency_record records[16];
} __packed;

struct xcede_latency_parameter {
	__be16 payload_size;
	struct xcede_latency_payload payload;
	u8 null_char;
} __packed;

static unsigned int nr_xcede_records;
static struct xcede_latency_parameter xcede_latency_parameter __initdata;

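/*
 * Worked example (hypothetical values): a payload advertising two
 * 10-byte records has record_size = 9 (i.e. N - 1) and
 * payload_size = 1 + 2 * 10 + 1 = 22, so the record array spans
 * 22 - 2 = 20 bytes, giving nr_xcede_records = 20 / 10 = 2.
 */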
static int __init parse_cede_parameters(void)
{
	struct xcede_latency_payload *payload;
	u32 total_xcede_records_size;
	u8 xcede_record_size;
	u16 payload_size;
	int ret, i;

	ret = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
			NULL, CEDE_LATENCY_TOKEN, __pa(&xcede_latency_parameter),
			sizeof(xcede_latency_parameter));
	if (ret) {
		pr_err("xcede: Error parsing CEDE_LATENCY_TOKEN\n");
		return ret;
	}

	payload_size = be16_to_cpu(xcede_latency_parameter.payload_size);
	payload = &xcede_latency_parameter.payload;

	xcede_record_size = payload->record_size + 1;

	if (xcede_record_size != sizeof(struct xcede_latency_record)) {
		pr_err("xcede: Expected record-size %lu. Observed size %u.\n",
		       sizeof(struct xcede_latency_record), xcede_record_size);
		return -EINVAL;
	}

	pr_info("xcede: xcede_record_size = %d\n", xcede_record_size);

	/*
	 * Since payload_size includes the record_size byte and the
	 * trailing NULL byte, the remaining bytes correspond to the
	 * array of all cede latency setting records.
	 */
	total_xcede_records_size = payload_size - 2;
	nr_xcede_records = total_xcede_records_size / xcede_record_size;

	for (i = 0; i < nr_xcede_records; i++) {
		struct xcede_latency_record *record = &payload->records[i];
		u64 latency_ticks = be64_to_cpu(record->latency_ticks);
		u8 wake_on_irqs = record->wake_on_irqs;
		u8 hint = record->hint;

		pr_info("xcede: Record %d : hint = %u, latency = 0x%llx tb ticks, Wake-on-irq = %u\n",
			i, hint, latency_ticks, wake_on_irqs);
	}

	return 0;
}

#define NR_DEDICATED_STATES	2 /* snooze, CEDE */
static u8 cede_latency_hint[NR_DEDICATED_STATES];

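/*
 * Dedicated-partition CEDE: mark this vCPU as donating its idle cycles,
 * publish the per-state cede latency hint via the lppaca, then cede to
 * the hypervisor until an interrupt or a prod from another CPU wakes us.
 */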
static __cpuidle
int dedicated_cede_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv,
			int index)
{
	u8 old_latency_hint;

	pseries_idle_prolog();
	get_lppaca()->donate_dedicated_cpu = 1;
	old_latency_hint = get_lppaca()->cede_latency_hint;
	get_lppaca()->cede_latency_hint = cede_latency_hint[index];

	HMT_medium();
	check_and_cede_processor();

	raw_local_irq_disable();
	get_lppaca()->donate_dedicated_cpu = 0;
	get_lppaca()->cede_latency_hint = old_latency_hint;

	pseries_idle_epilog();

	return index;
}

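/*
 * Shared-partition variant: no cycle donation or latency hint is
 * needed; simply cede the virtual processor to the hypervisor.
 */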
static __cpuidle
int shared_cede_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv,
		     int index)
{

	pseries_idle_prolog();

	/*
	 * Yield the processor to the hypervisor. We return if an
	 * external interrupt occurs (interrupts are driven before we
	 * return here) or if another processor prods us. External
	 * interrupts are enabled when we return.
	 */
	check_and_cede_processor();

	raw_local_irq_disable();
	pseries_idle_epilog();

	return index;
}

/*
 * States for dedicated partition case.
 */
static struct cpuidle_state dedicated_states[NR_DEDICATED_STATES] = {
	{ /* Snooze */
		.name = "snooze",
		.desc = "snooze",
		.exit_latency = 0,
		.target_residency = 0,
		.enter = &snooze_loop,
		.flags = CPUIDLE_FLAG_POLLING },
	{ /* CEDE */
		.name = "CEDE",
		.desc = "CEDE",
		.exit_latency = 10,
		.target_residency = 100,
		.enter = &dedicated_cede_loop },
};

/*
 * States for shared partition case.
 */
static struct cpuidle_state shared_states[] = {
	{ /* Snooze */
		.name = "snooze",
		.desc = "snooze",
		.exit_latency = 0,
		.target_residency = 0,
		.enter = &snooze_loop,
		.flags = CPUIDLE_FLAG_POLLING },
	{ /* Shared Cede */
		.name = "Shared Cede",
		.desc = "Shared Cede",
		.exit_latency = 10,
		.target_residency = 100,
		.enter = &shared_cede_loop },
};

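/* CPU hotplug: enable the per-CPU cpuidle device when the CPU comes online. */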
static int pseries_cpuidle_cpu_online(unsigned int cpu)
{
	struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);

	if (dev && cpuidle_get_driver()) {
		cpuidle_pause_and_lock();
		cpuidle_enable_device(dev);
		cpuidle_resume_and_unlock();
	}
	return 0;
}

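/* CPU hotplug: disable the per-CPU cpuidle device once the CPU is dead. */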
static int pseries_cpuidle_cpu_dead(unsigned int cpu)
{
	struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);

	if (dev && cpuidle_get_driver()) {
		cpuidle_pause_and_lock();
		cpuidle_disable_device(dev);
		cpuidle_resume_and_unlock();
	}
	return 0;
}

/*
 * pseries_cpuidle_driver_init(): copy the enabled idle states from
 * cpuidle_state_table into the driver's state array.
 */
static int pseries_cpuidle_driver_init(void)
{
	int idle_state;
	struct cpuidle_driver *drv = &pseries_idle_driver;

	drv->state_count = 0;

	for (idle_state = 0; idle_state < max_idle_state; ++idle_state) {
		/* Is the state not enabled? */
		if (cpuidle_state_table[idle_state].enter == NULL)
			continue;

		drv->states[drv->state_count] = /* structure copy */
			cpuidle_state_table[idle_state];

		drv->state_count += 1;
	}

	return 0;
}

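/*
 * Conversion example (assuming the usual 512 MHz POWER timebase): an
 * advertised latency of 0x3000 tb ticks is 12288 / 512 = 24 us.
 */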
static void __init fixup_cede0_latency(void)
{
	struct xcede_latency_payload *payload;
	u64 min_xcede_latency_us = UINT_MAX;
	int i;

	if (parse_cede_parameters())
		return;

	pr_info("cpuidle: Skipping the %d Extended CEDE idle states\n",
		nr_xcede_records);

	payload = &xcede_latency_parameter.payload;

	/*
	 * The CEDE idle state maps to CEDE(0). While the hypervisor
	 * does not advertise CEDE(0) exit latency values, it does
	 * advertise the latency values of the extended CEDE states.
	 * We use the lowest advertised exit latency value as a proxy
	 * for the exit latency of CEDE(0).
	 */
	for (i = 0; i < nr_xcede_records; i++) {
		struct xcede_latency_record *record = &payload->records[i];
		u8 hint = record->hint;
		u64 latency_tb = be64_to_cpu(record->latency_ticks);
		u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC);

		/*
		 * We expect the exit latency of an extended CEDE
		 * state to be non-zero, since it takes at least a few
		 * nanoseconds to wake up the idle CPU and dispatch
		 * the virtual processor into the Linux guest.
		 *
		 * So we consider only non-zero values when performing
		 * the fixup of the CEDE(0) latency.
		 */
		if (latency_us == 0) {
			pr_warn("cpuidle: Skipping xcede record %d [hint=%d]. Exit latency = 0us\n",
				i, hint);
			continue;
		}

		if (latency_us < min_xcede_latency_us)
			min_xcede_latency_us = latency_us;
	}

	if (min_xcede_latency_us != UINT_MAX) {
		dedicated_states[1].exit_latency = min_xcede_latency_us;
		dedicated_states[1].target_residency = 10 * (min_xcede_latency_us);
		pr_info("cpuidle: Fixed up CEDE exit latency to %llu us\n",
			min_xcede_latency_us);
	}
}

/*
 * pseries_idle_probe()
 * Choose state table for shared versus dedicated partition
 */
static int __init pseries_idle_probe(void)
{

	if (cpuidle_disable != IDLE_NO_OVERRIDE)
		return -ENODEV;

	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
		if (lppaca_shared_proc()) {
			cpuidle_state_table = shared_states;
			max_idle_state = ARRAY_SIZE(shared_states);
		} else {
			/*
			 * Use firmware provided latency values
			 * starting with POWER10 platforms. In the
			 * case that we are running on a POWER10
			 * platform but in an earlier compat mode, we
			 * can still use the firmware provided values.
			 *
			 * However, on platforms prior to POWER10, we
			 * cannot rely on the accuracy of the firmware
			 * provided latency values. On such platforms,
			 * go with the conservative default estimate
			 * of 10us.
			 */
			if (cpu_has_feature(CPU_FTR_ARCH_31) || pvr_version_is(PVR_POWER10))
				fixup_cede0_latency();
			cpuidle_state_table = dedicated_states;
			max_idle_state = NR_DEDICATED_STATES;
		}
	} else
		return -ENODEV;

	if (max_idle_state > 1) {
		snooze_timeout_en = true;
		/* Poll in snooze for as long as the next state's target residency. */
		snooze_timeout = cpuidle_state_table[1].target_residency *
					tb_ticks_per_usec;
	}
	return 0;
}

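/*
 * Module init: probe the partition type, register the cpuidle driver,
 * then wire up the CPU hotplug callbacks so per-CPU devices track
 * online/dead transitions.
 */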
static int __init pseries_processor_idle_init(void)
{
	int retval;

	retval = pseries_idle_probe();
	if (retval)
		return retval;

	pseries_cpuidle_driver_init();
	retval = cpuidle_register(&pseries_idle_driver, NULL);
	if (retval) {
		printk(KERN_DEBUG "Registration of pseries driver failed.\n");
		return retval;
	}

	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					   "cpuidle/pseries:online",
					   pseries_cpuidle_cpu_online, NULL);
	WARN_ON(retval < 0);
	retval = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_DEAD,
					   "cpuidle/pseries:DEAD", NULL,
					   pseries_cpuidle_cpu_dead);
	WARN_ON(retval < 0);
	printk(KERN_DEBUG "pseries_idle_driver registered\n");
	return 0;
}

device_initcall(pseries_processor_idle_init);