// SPDX-License-Identifier: GPL-2.0
/*
 * cpuidle-pseries - idle state cpuidle driver.
 * Adapted from drivers/idle/intel_idle.c and
 * drivers/acpi/processor_idle.c
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/moduleparam.h>
#include <linux/cpuidle.h>
#include <linux/cpu.h>
#include <linux/notifier.h>

#include <asm/paca.h>
#include <asm/reg.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/runlatch.h>
#include <asm/idle.h>
#include <asm/plpar_wrappers.h>
#include <asm/rtas.h>
#include <asm/time.h>

static struct cpuidle_driver pseries_idle_driver = {
	.name = "pseries_idle",
	.owner = THIS_MODULE,
};

static int max_idle_state __read_mostly;
static struct cpuidle_state *cpuidle_state_table __read_mostly;
static u64 snooze_timeout __read_mostly;
static bool snooze_timeout_en __read_mostly;

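/*
 * Snooze: busy-poll at low SMT thread priority until a task needs the
 * CPU or the snooze timeout (set up in pseries_idle_probe()) expires,
 * then return to the cpuidle core.
 */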
static __cpuidle
int snooze_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv,
		int index)
{
	u64 snooze_exit_time;

	set_thread_flag(TIF_POLLING_NRFLAG);

	pseries_idle_prolog();
	raw_local_irq_enable();
	snooze_exit_time = get_tb() + snooze_timeout;
	dev->poll_time_limit = false;

	while (!need_resched()) {
		HMT_low();
		HMT_very_low();
		if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) {
			/*
			 * Task has not woken up but we are exiting the polling
			 * loop anyway. Require a barrier after polling is
			 * cleared to order subsequent test of need_resched().
			 */
			dev->poll_time_limit = true;
			clear_thread_flag(TIF_POLLING_NRFLAG);
			smp_mb();
			break;
		}
	}

	HMT_medium();
	clear_thread_flag(TIF_POLLING_NRFLAG);

	raw_local_irq_disable();

	pseries_idle_epilog();

	return index;
}

static __cpuidle void check_and_cede_processor(void)
{
	/*
	 * Ensure our interrupt state is properly tracked. This also
	 * checks whether an interrupt occurred while we were
	 * soft-disabled; if so, we skip the cede.
	 */
	if (prep_irq_for_idle()) {
		cede_processor();
#ifdef CONFIG_TRACE_IRQFLAGS
		/* Ensure that H_CEDE returns with IRQs on */
		if (WARN_ON(!(mfmsr() & MSR_EE)))
			__hard_irq_enable();
#endif
	}
}

/*
 * XCEDE: Extended CEDE states discovered through the
 * "ibm,get-system-parameter" RTAS call with the token
 * CEDE_LATENCY_TOKEN
 */

/*
 * Section 7.3.16 System Parameters Option of PAPR version 2.8.1 has a
 * table with all the parameters to ibm,get-system-parameter.
 * CEDE_LATENCY_TOKEN corresponds to the token value for Cede Latency
 * Settings Information.
 */
#define CEDE_LATENCY_TOKEN	45

/*
 * If the platform supports the cede latency settings information system
 * parameter it must provide the following information in the NULL-terminated
 * parameter string:
 *
 * a. The first byte is the length “N” of each cede latency setting record
 *    minus one (zero indicates a length of 1 byte).
 *
 * b. For each supported cede latency setting, a cede latency setting record
 *    consisting of the first “N” bytes as per the following table.
 *
 *	-----------------------------
 *	| Field           | Field   |
 *	| Name            | Length  |
 *	-----------------------------
 *	| Cede Latency    | 1 Byte  |
 *	| Specifier Value |         |
 *	-----------------------------
 *	| Maximum wakeup  |         |
 *	| latency in      | 8 Bytes |
 *	| tb-ticks        |         |
 *	-----------------------------
 *	| Responsive to   |         |
 *	| external        | 1 Byte  |
 *	| interrupts      |         |
 *	-----------------------------
 *
 * This version has cede latency record size = 10.
 *
 * The structure xcede_latency_payload represents a) and b) with
 * xcede_latency_record representing the table in b).
 *
 * xcede_latency_parameter is what gets returned by the
 * ibm,get-system-parameter RTAS call when made with
 * CEDE_LATENCY_TOKEN.
 *
 * These structures are only used to represent the data obtained by the RTAS
 * call. The data is in big-endian.
 */
struct xcede_latency_record {
	u8	hint;
	__be64	latency_ticks;
	u8	wake_on_irqs;
} __packed;

// Make space for 16 records, which "should be enough".
struct xcede_latency_payload {
	u8 record_size;
	struct xcede_latency_record records[16];
} __packed;

struct xcede_latency_parameter {
	__be16 payload_size;
	struct xcede_latency_payload payload;
	u8 null_char;
} __packed;

static unsigned int nr_xcede_records;
static struct xcede_latency_parameter xcede_latency_parameter __initdata;

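/*
 * Worked example (hypothetical values): a payload advertising two
 * 10-byte records has record_size = 9 (i.e. N - 1) and
 * payload_size = 1 + 2 * 10 + 1 = 22, so the record array spans
 * 22 - 2 = 20 bytes, giving nr_xcede_records = 20 / 10 = 2.
 */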
static int __init parse_cede_parameters(void)
{
	struct xcede_latency_payload *payload;
	u32 total_xcede_records_size;
	u8 xcede_record_size;
	u16 payload_size;
	int ret, i;

	ret = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
			NULL, CEDE_LATENCY_TOKEN, __pa(&xcede_latency_parameter),
			sizeof(xcede_latency_parameter));
	if (ret) {
		pr_err("xcede: Error parsing CEDE_LATENCY_TOKEN\n");
		return ret;
	}

	payload_size = be16_to_cpu(xcede_latency_parameter.payload_size);
	payload = &xcede_latency_parameter.payload;

	xcede_record_size = payload->record_size + 1;

	if (xcede_record_size != sizeof(struct xcede_latency_record)) {
		pr_err("xcede: Expected record-size %lu. Observed size %u.\n",
		       sizeof(struct xcede_latency_record), xcede_record_size);
		return -EINVAL;
	}

	pr_info("xcede: xcede_record_size = %d\n", xcede_record_size);

	/*
	 * Since payload_size includes the record_size byte and the
	 * trailing NULL byte, the remaining bytes correspond to the
	 * array of all cede latency setting records.
	 */
	total_xcede_records_size = payload_size - 2;
	nr_xcede_records = total_xcede_records_size / xcede_record_size;

	for (i = 0; i < nr_xcede_records; i++) {
		struct xcede_latency_record *record = &payload->records[i];
		u64 latency_ticks = be64_to_cpu(record->latency_ticks);
		u8 wake_on_irqs = record->wake_on_irqs;
		u8 hint = record->hint;

		pr_info("xcede: Record %d : hint = %u, latency = 0x%llx tb ticks, Wake-on-irq = %u\n",
			i, hint, latency_ticks, wake_on_irqs);
	}

	return 0;
}

#define NR_DEDICATED_STATES	2 /* snooze, CEDE */
static u8 cede_latency_hint[NR_DEDICATED_STATES];

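/*
 * Dedicated-partition CEDE: mark this vCPU as donating its idle cycles,
 * publish the per-state cede latency hint via the lppaca, then cede to
 * the hypervisor until an interrupt or a prod from another CPU wakes us.
 */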
static __cpuidle
int dedicated_cede_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv,
			int index)
{
	u8 old_latency_hint;

	pseries_idle_prolog();
	get_lppaca()->donate_dedicated_cpu = 1;
	old_latency_hint = get_lppaca()->cede_latency_hint;
	get_lppaca()->cede_latency_hint = cede_latency_hint[index];

	HMT_medium();
	check_and_cede_processor();

	raw_local_irq_disable();
	get_lppaca()->donate_dedicated_cpu = 0;
	get_lppaca()->cede_latency_hint = old_latency_hint;

	pseries_idle_epilog();

	return index;
}

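/*
 * Shared-partition variant: no cycle donation or latency hint is
 * needed; simply cede the virtual processor to the hypervisor.
 */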
static __cpuidle
int shared_cede_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv,
		     int index)
{

	pseries_idle_prolog();

	/*
	 * Yield the processor to the hypervisor. We return if an
	 * external interrupt occurs (interrupts are driven before we
	 * return here) or if another processor prods us. External
	 * interrupts are enabled when we return.
	 */
	check_and_cede_processor();

	raw_local_irq_disable();
	pseries_idle_epilog();

	return index;
}

/*
 * States for dedicated partition case.
 */
static struct cpuidle_state dedicated_states[NR_DEDICATED_STATES] = {
	{ /* Snooze */
		.name = "snooze",
		.desc = "snooze",
		.exit_latency = 0,
		.target_residency = 0,
		.enter = &snooze_loop,
		.flags = CPUIDLE_FLAG_POLLING },
	{ /* CEDE */
		.name = "CEDE",
		.desc = "CEDE",
		.exit_latency = 10,
		.target_residency = 100,
		.enter = &dedicated_cede_loop },
};

/*
 * States for shared partition case.
 */
static struct cpuidle_state shared_states[] = {
	{ /* Snooze */
		.name = "snooze",
		.desc = "snooze",
		.exit_latency = 0,
		.target_residency = 0,
		.enter = &snooze_loop,
		.flags = CPUIDLE_FLAG_POLLING },
	{ /* Shared Cede */
		.name = "Shared Cede",
		.desc = "Shared Cede",
		.exit_latency = 10,
		.target_residency = 100,
		.enter = &shared_cede_loop },
};

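/* CPU hotplug: enable the per-CPU cpuidle device when the CPU comes online. */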
static int pseries_cpuidle_cpu_online(unsigned int cpu)
{
	struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);

	if (dev && cpuidle_get_driver()) {
		cpuidle_pause_and_lock();
		cpuidle_enable_device(dev);
		cpuidle_resume_and_unlock();
	}
	return 0;
}

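/* CPU hotplug: disable the per-CPU cpuidle device once the CPU is dead. */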
static int pseries_cpuidle_cpu_dead(unsigned int cpu)
{
	struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);

	if (dev && cpuidle_get_driver()) {
		cpuidle_pause_and_lock();
		cpuidle_disable_device(dev);
		cpuidle_resume_and_unlock();
	}
	return 0;
}

/*
 * pseries_cpuidle_driver_init(): copy the enabled idle states from
 * cpuidle_state_table into the driver's state array.
 */
static int pseries_cpuidle_driver_init(void)
{
	int idle_state;
	struct cpuidle_driver *drv = &pseries_idle_driver;

	drv->state_count = 0;

	for (idle_state = 0; idle_state < max_idle_state; ++idle_state) {
		/* Is the state not enabled? */
		if (cpuidle_state_table[idle_state].enter == NULL)
			continue;

		drv->states[drv->state_count] = /* structure copy */
			cpuidle_state_table[idle_state];

		drv->state_count += 1;
	}

	return 0;
}

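/*
 * Conversion example (assuming the usual 512 MHz POWER timebase): an
 * advertised latency of 0x3000 tb ticks is 12288 / 512 = 24 us.
 */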
static void __init fixup_cede0_latency(void)
{
	struct xcede_latency_payload *payload;
	u64 min_xcede_latency_us = UINT_MAX;
	int i;

	if (parse_cede_parameters())
		return;

	pr_info("cpuidle: Skipping the %d Extended CEDE idle states\n",
		nr_xcede_records);

	payload = &xcede_latency_parameter.payload;

	/*
	 * The CEDE idle state maps to CEDE(0). While the hypervisor
	 * does not advertise CEDE(0) exit latency values, it does
	 * advertise the latency values of the extended CEDE states.
	 * We use the lowest advertised exit latency value as a proxy
	 * for the exit latency of CEDE(0).
	 */
	for (i = 0; i < nr_xcede_records; i++) {
		struct xcede_latency_record *record = &payload->records[i];
		u8 hint = record->hint;
		u64 latency_tb = be64_to_cpu(record->latency_ticks);
		u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC);

		/*
		 * We expect the exit latency of an extended CEDE
		 * state to be non-zero, since it takes at least a few
		 * nanoseconds to wake up the idle CPU and dispatch
		 * the virtual processor into the Linux guest.
		 *
		 * So we consider only non-zero values when performing
		 * the fixup of the CEDE(0) latency.
		 */
		if (latency_us == 0) {
			pr_warn("cpuidle: Skipping xcede record %d [hint=%d]. Exit latency = 0us\n",
				i, hint);
			continue;
		}

		if (latency_us < min_xcede_latency_us)
			min_xcede_latency_us = latency_us;
	}

	if (min_xcede_latency_us != UINT_MAX) {
		dedicated_states[1].exit_latency = min_xcede_latency_us;
		dedicated_states[1].target_residency = 10 * (min_xcede_latency_us);
		pr_info("cpuidle: Fixed up CEDE exit latency to %llu us\n",
			min_xcede_latency_us);
	}
}

/*
 * pseries_idle_probe()
 * Choose state table for shared versus dedicated partition
 */
static int __init pseries_idle_probe(void)
{

	if (cpuidle_disable != IDLE_NO_OVERRIDE)
		return -ENODEV;

	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
		if (lppaca_shared_proc()) {
			cpuidle_state_table = shared_states;
			max_idle_state = ARRAY_SIZE(shared_states);
		} else {
			/*
			 * Use firmware provided latency values
			 * starting with POWER10 platforms. In the
			 * case that we are running on a POWER10
			 * platform but in an earlier compat mode, we
			 * can still use the firmware provided values.
			 *
			 * However, on platforms prior to POWER10, we
			 * cannot rely on the accuracy of the firmware
			 * provided latency values. On such platforms,
			 * go with the conservative default estimate
			 * of 10us.
			 */
			if (cpu_has_feature(CPU_FTR_ARCH_31) || pvr_version_is(PVR_POWER10))
				fixup_cede0_latency();
			cpuidle_state_table = dedicated_states;
			max_idle_state = NR_DEDICATED_STATES;
		}
	} else
		return -ENODEV;

	if (max_idle_state > 1) {
		snooze_timeout_en = true;
		/* Poll in snooze for as long as the next state's target residency. */
		snooze_timeout = cpuidle_state_table[1].target_residency *
					tb_ticks_per_usec;
	}
	return 0;
}

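/*
 * Module init: probe the partition type, register the cpuidle driver,
 * then wire up the CPU hotplug callbacks so per-CPU devices track
 * online/dead transitions.
 */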
static int __init pseries_processor_idle_init(void)
{
	int retval;

	retval = pseries_idle_probe();
	if (retval)
		return retval;

	pseries_cpuidle_driver_init();
	retval = cpuidle_register(&pseries_idle_driver, NULL);
	if (retval) {
		printk(KERN_DEBUG "Registration of pseries driver failed.\n");
		return retval;
	}

	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					   "cpuidle/pseries:online",
					   pseries_cpuidle_cpu_online, NULL);
	WARN_ON(retval < 0);
	retval = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_DEAD,
					   "cpuidle/pseries:DEAD", NULL,
					   pseries_cpuidle_cpu_dead);
	WARN_ON(retval < 0);
	printk(KERN_DEBUG "pseries_idle_driver registered\n");
	return 0;
}

device_initcall(pseries_processor_idle_init);