CoCalc -- teo.c

GitHub Repository: torvalds/linux
Path: blob/master/drivers/cpuidle/governors/teo.c
²⁶²⁸² views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
 * Timer events oriented CPU idle governor
4
 *
5
 * Copyright (C) 2018 - 2021 Intel Corporation
6
 * Author: Rafael J. Wysocki <[email protected]>
7
 */
8

9
/**
10
 * DOC: teo-description
11
 *
12
 * The idea of this governor is based on the observation that on many systems
13
 * timer interrupts are two or more orders of magnitude more frequent than any
14
 * other interrupt types, so they are likely to dominate CPU wakeup patterns.
15
 * Moreover, in principle, the time when the next timer event is going to occur
16
 * can be determined at the idle state selection time, although doing that may
17
 * be costly, so it can be regarded as the most reliable source of information
18
 * for idle state selection.
19
 *
20
 * Of course, non-timer wakeup sources are more important in some use cases,
21
 * but even then it is generally unnecessary to consider idle duration values
22
 * greater than the time till the next timer event, referred to as the sleep
23
 * length in what follows, because the closest timer will ultimately wake up the
24
 * CPU anyway unless it is woken up earlier.
25
 *
26
 * However, since obtaining the sleep length may be costly, the governor first
27
 * checks if it can select a shallow idle state using wakeup pattern information
28
 * from recent times, in which case it can do without knowing the sleep length
29
 * at all.  For this purpose, it counts CPU wakeup events and looks for an idle
30
 * state whose target residency has not exceeded the idle duration (measured
31
 * after wakeup) in the majority of relevant recent cases.  If the target
32
 * residency of that state is small enough, it may be used right away and the
33
 * sleep length need not be determined.
34
 *
35
 * The computations carried out by this governor are based on using bins whose
36
 * boundaries are aligned with the target residency parameter values of the CPU
37
 * idle states provided by the %CPUIdle driver in the ascending order.  That is,
38
 * the first bin spans from 0 up to, but not including, the target residency of
39
 * the second idle state (idle state 1), the second bin spans from the target
40
 * residency of idle state 1 up to, but not including, the target residency of
41
 * idle state 2, the third bin spans from the target residency of idle state 2
42
 * up to, but not including, the target residency of idle state 3 and so on.
43
 * The last bin spans from the target residency of the deepest idle state
44
 * supplied by the driver to infinity.
45
 *
46
 * Two metrics called "hits" and "intercepts" are associated with each bin.
47
 * They are updated every time before selecting an idle state for the given CPU
48
 * in accordance with what happened last time.
49
 *
50
 * The "hits" metric reflects the relative frequency of situations in which the
51
 * sleep length and the idle duration measured after CPU wakeup fall into the
52
 * same bin (that is, the CPU appears to wake up "on time" relative to the sleep
53
 * length).  In turn, the "intercepts" metric reflects the relative frequency of
54
 * non-timer wakeup events for which the measured idle duration falls into a bin
55
 * that corresponds to an idle state shallower than the one whose bin is fallen
56
 * into by the sleep length (these events are also referred to as "intercepts"
57
 * below).
58
 *
59
 * The governor also counts "intercepts" with the measured idle duration below
60
 * the tick period length and uses this information when deciding whether or not
61
 * to stop the scheduler tick.
62
 *
63
 * In order to select an idle state for a CPU, the governor takes the following
64
 * steps (modulo the possible latency constraint that must be taken into account
65
 * too):
66
 *
67
 * 1. Find the deepest enabled CPU idle state (the candidate idle state) and
68
 *    compute 2 sums as follows:
69
 *
70
 *    - The sum of the "hits" metric for all of the idle states shallower than
71
 *      the candidate one (it represents the cases in which the CPU was likely
72
 *      woken up by a timer).
73
 *
74
 *    - The sum of the "intercepts" metric for all of the idle states shallower
75
 *      than the candidate one (it represents the cases in which the CPU was
76
 *      likely woken up by a non-timer wakeup source).
77
 *
78
 * 2. If the second sum computed in step 1 is greater than the sum of
79
 *    both metrics for the candidate state bin and all subsequent bins (if any),
80
 *    a shallower idle state is likely to be more suitable, so look for it.
81
 *
82
 *    - Traverse the enabled idle states shallower than the candidate one in the
83
 *      descending order.
84
 *
85
 *    - For each of them compute the sum of the "intercepts" metrics over all
86
 *      of the idle states between it and the candidate one (including the
87
 *      former and excluding the latter).
88
 *
89
 *    - If this sum is greater than a half of the second sum computed in step 1,
90
 *      use the given idle state as the new candidate one.
91
 *
92
 * 3. If the current candidate state is state 0 or its target residency is short
93
 *    enough, return it and prevent the scheduler tick from being stopped.
94
 *
95
 * 4. Obtain the sleep length value and check if it is below the target
96
 *    residency of the current candidate state, in which case a new shallower
97
 *    candidate state needs to be found, so look for it.
98
 */
99

100
#include <linux/cpuidle.h>
101
#include <linux/jiffies.h>
102
#include <linux/kernel.h>
103
#include <linux/sched/clock.h>
104
#include <linux/tick.h>
105

106
#include "gov.h"
107

108
/*
109
 * Idle state exit latency threshold used for deciding whether or not to check
110
 * the time till the closest expected timer event.
111
 */
112
#define LATENCY_THRESHOLD_NS	(RESIDENCY_THRESHOLD_NS / 2)
113

114
/*
115
 * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
116
 * is used for decreasing metrics on a regular basis.
117
 */
118
/* Amount added to a metric (bin counters, short_idles, tick_intercepts, total) when its event is recorded. */
#define PULSE		1024
119
#define DECAY_SHIFT	3
120

121
/**
 * struct teo_bin - Per-bin wakeup statistics kept by the TEO governor.
 * @intercepts: Decaying count of wakeups whose measured idle duration landed
 *		in this bin while the sleep length landed in a different one.
 * @hits: Decaying count of wakeups for which both the sleep length and the
 *	  measured idle duration landed in this bin.
 *
 * One bin exists per idle state; both counters grow by PULSE and are aged
 * with DECAY_SHIFT.
 */
struct teo_bin {
	unsigned int intercepts;	/* early (non-timer) wakeups */
	unsigned int hits;		/* "on time" wakeups */
};
130

131
/**
132
 * struct teo_cpu - CPU data used by the TEO cpuidle governor.
133
 * @sleep_length_ns: Time till the closest timer event (at the selection time).
134
 * @state_bins: Idle state data bins for this CPU.
135
 * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
136
 * @tick_intercepts: "Intercepts" before TICK_NSEC.
137
 * @short_idles: Wakeups after short idle periods.
138
 * @artificial_wakeup: Set if the wakeup has been triggered by a safety net.
139
 */
140
struct teo_cpu {
141
	s64 sleep_length_ns;
142
	struct teo_bin state_bins[CPUIDLE_STATE_MAX];
143
	unsigned int total;
144
	unsigned int tick_intercepts;
145
	unsigned int short_idles;
146
	bool artificial_wakeup;
147
};
148

149
/* Governor data for every CPU; looked up with per_cpu_ptr(&teo_cpus, dev->cpu). */
static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
150

151
/**
152
 * teo_update - Update CPU metrics after wakeup.
153
 * @drv: cpuidle driver containing state data.
154
 * @dev: Target CPU.
155
 */
156
static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
157
{
158
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
159
	int i, idx_timer = 0, idx_duration = 0;
160
	s64 target_residency_ns;
161
	u64 measured_ns;
162

163
	cpu_data->short_idles -= cpu_data->short_idles >> DECAY_SHIFT;
164

165
	if (cpu_data->artificial_wakeup) {
166
		/*
167
		 * If one of the safety nets has triggered, assume that this
168
		 * might have been a long sleep.
169
		 */
170
		measured_ns = U64_MAX;
171
	} else {
172
		u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;
173

174
		measured_ns = dev->last_residency_ns;
175
		/*
176
		 * The delay between the wakeup and the first instruction
177
		 * executed by the CPU is not likely to be worst-case every
178
		 * time, so take 1/2 of the exit latency as a very rough
179
		 * approximation of the average of it.
180
		 */
181
		if (measured_ns >= lat_ns) {
182
			measured_ns -= lat_ns / 2;
183
			if (measured_ns < RESIDENCY_THRESHOLD_NS)
184
				cpu_data->short_idles += PULSE;
185
		} else {
186
			measured_ns /= 2;
187
			cpu_data->short_idles += PULSE;
188
		}
189
	}
190

191
	/*
192
	 * Decay the "hits" and "intercepts" metrics for all of the bins and
193
	 * find the bins that the sleep length and the measured idle duration
194
	 * fall into.
195
	 */
196
	for (i = 0; i < drv->state_count; i++) {
197
		struct teo_bin *bin = &cpu_data->state_bins[i];
198

199
		bin->hits -= bin->hits >> DECAY_SHIFT;
200
		bin->intercepts -= bin->intercepts >> DECAY_SHIFT;
201

202
		target_residency_ns = drv->states[i].target_residency_ns;
203

204
		if (target_residency_ns <= cpu_data->sleep_length_ns) {
205
			idx_timer = i;
206
			if (target_residency_ns <= measured_ns)
207
				idx_duration = i;
208
		}
209
	}
210

211
	cpu_data->tick_intercepts -= cpu_data->tick_intercepts >> DECAY_SHIFT;
212
	/*
213
	 * If the measured idle duration falls into the same bin as the sleep
214
	 * length, this is a "hit", so update the "hits" metric for that bin.
215
	 * Otherwise, update the "intercepts" metric for the bin fallen into by
216
	 * the measured idle duration.
217
	 */
218
	if (idx_timer == idx_duration) {
219
		cpu_data->state_bins[idx_timer].hits += PULSE;
220
	} else {
221
		cpu_data->state_bins[idx_duration].intercepts += PULSE;
222
		if (TICK_NSEC <= measured_ns)
223
			cpu_data->tick_intercepts += PULSE;
224
	}
225

226
	cpu_data->total -= cpu_data->total >> DECAY_SHIFT;
227
	cpu_data->total += PULSE;
228
}
229

230
static bool teo_state_ok(int i, struct cpuidle_driver *drv)
231
{
232
	return !tick_nohz_tick_stopped() ||
233
		drv->states[i].target_residency_ns >= TICK_NSEC;
234
}
235

236
/**
237
 * teo_find_shallower_state - Find shallower idle state matching given duration.
238
 * @drv: cpuidle driver containing state data.
239
 * @dev: Target CPU.
240
 * @state_idx: Index of the capping idle state.
241
 * @duration_ns: Idle duration value to match.
242
 * @no_poll: Don't consider polling states.
243
 */
244
static int teo_find_shallower_state(struct cpuidle_driver *drv,
245
				    struct cpuidle_device *dev, int state_idx,
246
				    s64 duration_ns, bool no_poll)
247
{
248
	int i;
249

250
	for (i = state_idx - 1; i >= 0; i--) {
251
		if (dev->states_usage[i].disable ||
252
				(no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING))
253
			continue;
254

255
		state_idx = i;
256
		if (drv->states[i].target_residency_ns <= duration_ns)
257
			break;
258
	}
259
	return state_idx;
260
}
261

262
/**
263
 * teo_select - Selects the next idle state to enter.
264
 * @drv: cpuidle driver containing state data.
265
 * @dev: Target CPU.
266
 * @stop_tick: Indication on whether or not to stop the scheduler tick.
267
 */
268
static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
269
		      bool *stop_tick)
270
{
271
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
272
	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
273
	ktime_t delta_tick = TICK_NSEC / 2;
274
	unsigned int idx_intercept_sum = 0;
275
	unsigned int intercept_sum = 0;
276
	unsigned int idx_hit_sum = 0;
277
	unsigned int hit_sum = 0;
278
	int constraint_idx = 0;
279
	int idx0 = 0, idx = -1;
280
	s64 duration_ns;
281
	int i;
282

283
	if (dev->last_state_idx >= 0) {
284
		teo_update(drv, dev);
285
		dev->last_state_idx = -1;
286
	}
287

288
	/*
289
	 * Set the sleep length to infinity in case the invocation of
290
	 * tick_nohz_get_sleep_length() below is skipped, in which case it won't
291
	 * be known whether or not the subsequent wakeup is caused by a timer.
292
	 * It is generally fine to count the wakeup as an intercept then, except
293
	 * for the cases when the CPU is mostly woken up by timers and there may
294
	 * be opportunities to ask for a deeper idle state when no imminent
295
	 * timers are scheduled which may be missed.
296
	 */
297
	cpu_data->sleep_length_ns = KTIME_MAX;
298

299
	/* Check if there is any choice in the first place. */
300
	if (drv->state_count < 2) {
301
		idx = 0;
302
		goto out_tick;
303
	}
304

305
	if (!dev->states_usage[0].disable)
306
		idx = 0;
307

308
	/* Compute the sums of metrics for early wakeup pattern detection. */
309
	for (i = 1; i < drv->state_count; i++) {
310
		struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
311
		struct cpuidle_state *s = &drv->states[i];
312

313
		/*
314
		 * Update the sums of idle state metrics for all of the states
315
		 * shallower than the current one.
316
		 */
317
		intercept_sum += prev_bin->intercepts;
318
		hit_sum += prev_bin->hits;
319

320
		if (dev->states_usage[i].disable)
321
			continue;
322

323
		if (idx < 0)
324
			idx0 = i; /* first enabled state */
325

326
		idx = i;
327

328
		if (s->exit_latency_ns <= latency_req)
329
			constraint_idx = i;
330

331
		/* Save the sums for the current state. */
332
		idx_intercept_sum = intercept_sum;
333
		idx_hit_sum = hit_sum;
334
	}
335

336
	/* Avoid unnecessary overhead. */
337
	if (idx < 0) {
338
		idx = 0; /* No states enabled, must use 0. */
339
		goto out_tick;
340
	}
341

342
	if (idx == idx0) {
343
		/*
344
		 * Only one idle state is enabled, so use it, but do not
345
		 * allow the tick to be stopped it is shallow enough.
346
		 */
347
		duration_ns = drv->states[idx].target_residency_ns;
348
		goto end;
349
	}
350

351
	/*
352
	 * If the sum of the intercepts metric for all of the idle states
353
	 * shallower than the current candidate one (idx) is greater than the
354
	 * sum of the intercepts and hits metrics for the candidate state and
355
	 * all of the deeper states, a shallower idle state is likely to be a
356
	 * better choice.
357
	 */
358
	if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) {
359
		int first_suitable_idx = idx;
360

361
		/*
362
		 * Look for the deepest idle state whose target residency had
363
		 * not exceeded the idle duration in over a half of the relevant
364
		 * cases in the past.
365
		 *
366
		 * Take the possible duration limitation present if the tick
367
		 * has been stopped already into account.
368
		 */
369
		intercept_sum = 0;
370

371
		for (i = idx - 1; i >= 0; i--) {
372
			struct teo_bin *bin = &cpu_data->state_bins[i];
373

374
			intercept_sum += bin->intercepts;
375

376
			if (2 * intercept_sum > idx_intercept_sum) {
377
				/*
378
				 * Use the current state unless it is too
379
				 * shallow or disabled, in which case take the
380
				 * first enabled state that is deep enough.
381
				 */
382
				if (teo_state_ok(i, drv) &&
383
				    !dev->states_usage[i].disable) {
384
					idx = i;
385
					break;
386
				}
387
				idx = first_suitable_idx;
388
				break;
389
			}
390

391
			if (dev->states_usage[i].disable)
392
				continue;
393

394
			if (teo_state_ok(i, drv)) {
395
				/*
396
				 * The current state is deep enough, but still
397
				 * there may be a better one.
398
				 */
399
				first_suitable_idx = i;
400
				continue;
401
			}
402

403
			/*
404
			 * The current state is too shallow, so if no suitable
405
			 * states other than the initial candidate have been
406
			 * found, give up (the remaining states to check are
407
			 * shallower still), but otherwise the first suitable
408
			 * state other than the initial candidate may turn out
409
			 * to be preferable.
410
			 */
411
			if (first_suitable_idx == idx)
412
				break;
413
		}
414
	}
415

416
	/*
417
	 * If there is a latency constraint, it may be necessary to select an
418
	 * idle state shallower than the current candidate one.
419
	 */
420
	if (idx > constraint_idx)
421
		idx = constraint_idx;
422

423
	/*
424
	 * If either the candidate state is state 0 or its target residency is
425
	 * low enough, there is basically nothing more to do, but if the sleep
426
	 * length is not updated, the subsequent wakeup will be counted as an
427
	 * "intercept" which may be problematic in the cases when timer wakeups
428
	 * are dominant.  Namely, it may effectively prevent deeper idle states
429
	 * from being selected at one point even if no imminent timers are
430
	 * scheduled.
431
	 *
432
	 * However, frequent timers in the RESIDENCY_THRESHOLD_NS range on one
433
	 * CPU are unlikely (user space has a default 50 us slack value for
434
	 * hrtimers and there are relatively few timers with a lower deadline
435
	 * value in the kernel), and even if they did happen, the potential
436
	 * benefit from using a deep idle state in that case would be
437
	 * questionable anyway for latency reasons.  Thus if the measured idle
438
	 * duration falls into that range in the majority of cases, assume
439
	 * non-timer wakeups to be dominant and skip updating the sleep length
440
	 * to reduce latency.
441
	 *
442
	 * Also, if the latency constraint is sufficiently low, it will force
443
	 * shallow idle states regardless of the wakeup type, so the sleep
444
	 * length need not be known in that case.
445
	 */
446
	if ((!idx || drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) &&
447
	    (2 * cpu_data->short_idles >= cpu_data->total ||
448
	     latency_req < LATENCY_THRESHOLD_NS))
449
		goto out_tick;
450

451
	duration_ns = tick_nohz_get_sleep_length(&delta_tick);
452
	cpu_data->sleep_length_ns = duration_ns;
453

454
	if (!idx)
455
		goto out_tick;
456

457
	/*
458
	 * If the closest expected timer is before the target residency of the
459
	 * candidate state, a shallower one needs to be found.
460
	 */
461
	if (drv->states[idx].target_residency_ns > duration_ns) {
462
		i = teo_find_shallower_state(drv, dev, idx, duration_ns, false);
463
		if (teo_state_ok(i, drv))
464
			idx = i;
465
	}
466

467
	/*
468
	 * If the selected state's target residency is below the tick length
469
	 * and intercepts occurring before the tick length are the majority of
470
	 * total wakeup events, do not stop the tick.
471
	 */
472
	if (drv->states[idx].target_residency_ns < TICK_NSEC &&
473
	    cpu_data->tick_intercepts > cpu_data->total / 2 + cpu_data->total / 8)
474
		duration_ns = TICK_NSEC / 2;
475

476
end:
477
	/*
478
	 * Allow the tick to be stopped unless the selected state is a polling
479
	 * one or the expected idle duration is shorter than the tick period
480
	 * length.
481
	 */
482
	if ((!(drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
483
	    duration_ns >= TICK_NSEC) || tick_nohz_tick_stopped())
484
		return idx;
485

486
	/*
487
	 * The tick is not going to be stopped, so if the target residency of
488
	 * the state to be returned is not within the time till the closest
489
	 * timer including the tick, try to correct that.
490
	 */
491
	if (idx > idx0 &&
492
	    drv->states[idx].target_residency_ns > delta_tick)
493
		idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false);
494

495
out_tick:
496
	*stop_tick = false;
497
	return idx;
498
}
499

500
/**
501
 * teo_reflect - Note that governor data for the CPU need to be updated.
502
 * @dev: Target CPU.
503
 * @state: Entered state.
504
 */
505
static void teo_reflect(struct cpuidle_device *dev, int state)
506
{
507
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
508

509
	dev->last_state_idx = state;
510
	if (dev->poll_time_limit ||
511
	    (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
512
		/*
513
		 * The wakeup was not "genuine", but triggered by one of the
514
		 * safety nets.
515
		 */
516
		dev->poll_time_limit = false;
517
		cpu_data->artificial_wakeup = true;
518
	} else {
519
		cpu_data->artificial_wakeup = false;
520
	}
521
}
522

523
/**
524
 * teo_enable_device - Initialize the governor's data for the target CPU.
525
 * @drv: cpuidle driver (not used).
526
 * @dev: Target CPU.
527
 */
528
static int teo_enable_device(struct cpuidle_driver *drv,
529
			     struct cpuidle_device *dev)
530
{
531
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
532

533
	memset(cpu_data, 0, sizeof(*cpu_data));
534

535
	return 0;
536
}
537

538
static struct cpuidle_governor teo_governor = {
539
	.name =		"teo",
540
	.rating =	19,
541
	.enable =	teo_enable_device,
542
	.select =	teo_select,
543
	.reflect =	teo_reflect,
544
};
545

546
/*
 * teo_governor_init - Register the TEO governor with the cpuidle core.
 *
 * Return: The value returned by cpuidle_register_governor().
 */
static int __init teo_governor_init(void)
{
	int ret = cpuidle_register_governor(&teo_governor);

	return ret;
}
550

551
/* Run teo_governor_init() at the postcore initcall level. */
postcore_initcall(teo_governor_init);
552

553
Product

Resources

Company