Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/drivers/cpuidle/governors/teo.c
51381 views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Timer events oriented CPU idle governor
4
*
5
* Copyright (C) 2018 - 2021 Intel Corporation
6
* Author: Rafael J. Wysocki <[email protected]>
7
*/
8
9
/**
10
* DOC: teo-description
11
*
12
* The idea of this governor is based on the observation that on many systems
13
* timer interrupts are two or more orders of magnitude more frequent than any
14
* other interrupt types, so they are likely to dominate CPU wakeup patterns.
15
* Moreover, in principle, the time when the next timer event is going to occur
16
* can be determined at the idle state selection time, although doing that may
17
* be costly, so it can be regarded as the most reliable source of information
18
* for idle state selection.
19
*
20
* Of course, non-timer wakeup sources are more important in some use cases,
21
* but even then it is generally unnecessary to consider idle duration values
22
* greater than the time till the next timer event, referred as the sleep
23
* length in what follows, because the closest timer will ultimately wake up the
24
* CPU anyway unless it is woken up earlier.
25
*
26
* However, since obtaining the sleep length may be costly, the governor first
27
* checks if it can select a shallow idle state using wakeup pattern information
28
* from recent times, in which case it can do without knowing the sleep length
29
* at all. For this purpose, it counts CPU wakeup events and looks for an idle
30
* state whose target residency has not exceeded the idle duration (measured
31
* after wakeup) in the majority of relevant recent cases. If the target
32
* residency of that state is small enough, it may be used right away and the
33
* sleep length need not be determined.
34
*
35
* The computations carried out by this governor are based on using bins whose
36
* boundaries are aligned with the target residency parameter values of the CPU
37
* idle states provided by the %CPUIdle driver in the ascending order. That is,
38
* the first bin spans from 0 up to, but not including, the target residency of
39
* the second idle state (idle state 1), the second bin spans from the target
40
* residency of idle state 1 up to, but not including, the target residency of
41
* idle state 2, the third bin spans from the target residency of idle state 2
42
* up to, but not including, the target residency of idle state 3 and so on.
43
* The last bin spans from the target residency of the deepest idle state
44
* supplied by the driver to infinity.
45
*
46
* Two metrics called "hits" and "intercepts" are associated with each bin.
47
* They are updated every time before selecting an idle state for the given CPU
48
* in accordance with what happened last time.
49
*
50
* The "hits" metric reflects the relative frequency of situations in which the
51
* sleep length and the idle duration measured after CPU wakeup are close enough
52
* (that is, the CPU appears to wake up "on time" relative to the sleep length).
53
* In turn, the "intercepts" metric reflects the relative frequency of non-timer
54
* wakeup events for which the measured idle duration is significantly different
55
* from the sleep length (these events are also referred to as "intercepts"
56
* below).
57
*
58
* The governor also counts "intercepts" with the measured idle duration below
59
* the tick period length and uses this information when deciding whether or not
60
* to stop the scheduler tick.
61
*
62
* In order to select an idle state for a CPU, the governor takes the following
63
* steps (modulo the possible latency constraint that must be taken into account
64
* too):
65
*
66
* 1. Find the deepest enabled CPU idle state (the candidate idle state) and
67
* compute 2 sums as follows:
68
*
69
* - The sum of the "hits" metric for all of the idle states shallower than
70
* the candidate one (it represents the cases in which the CPU was likely
71
* woken up by a timer).
72
*
73
* - The sum of the "intercepts" metric for all of the idle states shallower
74
* than the candidate one (it represents the cases in which the CPU was
75
* likely woken up by a non-timer wakeup source).
76
*
77
* Also find the idle state with the maximum intercepts metric (if there are
78
* multiple states with the maximum intercepts metric, choose the one with
79
* the highest index).
80
*
81
* 2. If the second sum computed in step 1 is greater than a half of the sum of
82
* both metrics for the candidate state bin and all subsequent bins (if any),
83
* a shallower idle state is likely to be more suitable, so look for it.
84
*
85
* - Traverse the enabled idle states shallower than the candidate one in the
86
* descending order, starting at the state with the maximum intercepts
87
* metric found in step 1.
88
*
89
* - For each of them compute the sum of the "intercepts" metrics over all
90
* of the idle states between it and the candidate one (including the
91
* former and excluding the latter).
92
*
93
* - If this sum is greater than a half of the second sum computed in step 1,
94
* use the given idle state as the new candidate one.
95
*
96
* 3. If the current candidate state is state 0 or its target residency is short
97
* enough, return it and prevent the scheduler tick from being stopped.
98
*
99
* 4. Obtain the sleep length value and check if it is below the target
100
* residency of the current candidate state, in which case a new shallower
101
* candidate state needs to be found, so look for it.
102
*/
103
104
#include <linux/cpuidle.h>
105
#include <linux/jiffies.h>
106
#include <linux/kernel.h>
107
#include <linux/sched/clock.h>
108
#include <linux/tick.h>
109
110
#include "gov.h"
111
112
/*
 * Idle state exit latency threshold used for deciding whether or not to check
 * the time till the closest expected timer event (states with a lower exit
 * latency may be selected without querying the sleep length at all).
 */
#define LATENCY_THRESHOLD_NS	(RESIDENCY_THRESHOLD_NS / 2)

/*
 * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
 * is used for decreasing metrics on a regular basis (each metric loses
 * 1/2^DECAY_SHIFT of its value per decay step, see teo_decay()).
 */
#define PULSE		1024
#define DECAY_SHIFT	3
124
125
/**
 * struct teo_bin - Metrics used by the TEO cpuidle governor.
 * @intercepts: The "intercepts" metric (wakeups significantly earlier than
 *              the anticipated timer event).
 * @hits: The "hits" metric (wakeups close to the anticipated timer event).
 */
struct teo_bin {
	unsigned int intercepts;
	unsigned int hits;
};
134
135
/**
 * struct teo_cpu - CPU data used by the TEO cpuidle governor.
 * @sleep_length_ns: Time till the closest timer event (at the selection time),
 *                   KTIME_MAX when the sleep length query was skipped.
 * @state_bins: Idle state data bins for this CPU (one per idle state).
 * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
 * @total_tick: Wakeups by the scheduler tick (decaying metric).
 * @tick_intercepts: "Intercepts" with measured idle duration below TICK_NSEC.
 * @short_idles: Wakeups after idle periods below RESIDENCY_THRESHOLD_NS.
 * @tick_wakeup: Set if the last wakeup was by the scheduler tick.
 */
struct teo_cpu {
	s64 sleep_length_ns;
	struct teo_bin state_bins[CPUIDLE_STATE_MAX];
	unsigned int total;
	unsigned int total_tick;
	unsigned int tick_intercepts;
	unsigned int short_idles;
	bool tick_wakeup;
};
154
155
static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
156
157
static void teo_decay(unsigned int *metric)
158
{
159
unsigned int delta = *metric >> DECAY_SHIFT;
160
161
if (delta)
162
*metric -= delta;
163
else
164
*metric = 0;
165
}
166
167
/**
 * teo_update - Update CPU metrics after wakeup.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 *
 * Fold the most recent sleep/wakeup cycle into the per-CPU "hits" and
 * "intercepts" statistics: decay all metrics, classify the wakeup as a
 * "hit" (close to the anticipated timer) or an "intercept" (significantly
 * earlier), and maintain the tick-wakeup and short-idle counters used by
 * the selection logic.
 *
 * NOTE(review): uses this_cpu_ptr(), so it assumes it executes on @dev's
 * CPU — confirm against the cpuidle core's calling convention (elsewhere
 * in this file teo_enable_device() uses per_cpu_ptr(..., dev->cpu)).
 */
static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
	s64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;
	struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus);
	int i, idx_timer = 0, idx_duration = 0;
	s64 target_residency_ns, measured_ns;
	unsigned int total = 0;

	teo_decay(&cpu_data->short_idles);

	if (dev->poll_time_limit) {
		dev->poll_time_limit = false;
		/*
		 * Polling state timeout has triggered, so assume that this
		 * might have been a long sleep.
		 */
		measured_ns = S64_MAX;
	} else {
		measured_ns = dev->last_residency_ns;
		/*
		 * The delay between the wakeup and the first instruction
		 * executed by the CPU is not likely to be worst-case every
		 * time, so take 1/2 of the exit latency as a very rough
		 * approximation of the average of it.
		 */
		if (measured_ns >= lat_ns) {
			measured_ns -= lat_ns / 2;
			if (measured_ns < RESIDENCY_THRESHOLD_NS)
				cpu_data->short_idles += PULSE;
		} else {
			/* Wakeup faster than the exit latency itself. */
			measured_ns /= 2;
			cpu_data->short_idles += PULSE;
		}
	}

	/*
	 * Decay the "hits" and "intercepts" metrics for all of the bins and
	 * find the bins that the sleep length and the measured idle duration
	 * fall into.
	 */
	for (i = 0; i < drv->state_count; i++) {
		struct teo_bin *bin = &cpu_data->state_bins[i];

		teo_decay(&bin->hits);
		total += bin->hits;
		teo_decay(&bin->intercepts);
		total += bin->intercepts;

		target_residency_ns = drv->states[i].target_residency_ns;

		if (target_residency_ns <= cpu_data->sleep_length_ns) {
			idx_timer = i;
			if (target_residency_ns <= measured_ns)
				idx_duration = i;
		}
	}

	/* Keep the grand total nonzero: the PULSE added below counts too. */
	cpu_data->total = total + PULSE;

	teo_decay(&cpu_data->tick_intercepts);

	teo_decay(&cpu_data->total_tick);
	if (cpu_data->tick_wakeup) {
		cpu_data->total_tick += PULSE;
		/*
		 * If tick wakeups dominate the wakeup pattern, count this one
		 * as a hit on the deepest available idle state to increase the
		 * likelihood of stopping the tick.
		 */
		if (3 * cpu_data->total_tick > 2 * cpu_data->total) {
			cpu_data->state_bins[drv->state_count-1].hits += PULSE;
			return;
		}
		/*
		 * If intercepts within the tick period range are not frequent
		 * enough, count this wakeup as a hit, since it is likely that
		 * the tick has woken up the CPU because an expected intercept
		 * was not there.  Otherwise, one of the intercepts may have
		 * been incidentally preceded by the tick wakeup.
		 */
		if (3 * cpu_data->tick_intercepts < 2 * total) {
			cpu_data->state_bins[idx_timer].hits += PULSE;
			return;
		}
	}

	/*
	 * If the measured idle duration (adjusted for the entered state exit
	 * latency) falls into the same bin as the sleep length and the latter
	 * is less than the "raw" measured idle duration (so the wakeup appears
	 * to have occurred after the anticipated timer event), this is a "hit",
	 * so update the "hits" metric for that bin.
	 *
	 * Otherwise, update the "intercepts" metric for the bin fallen into by
	 * the measured idle duration.
	 */
	if (idx_timer == idx_duration &&
	    cpu_data->sleep_length_ns - measured_ns < lat_ns / 2) {
		cpu_data->state_bins[idx_timer].hits += PULSE;
	} else {
		cpu_data->state_bins[idx_duration].intercepts += PULSE;
		if (measured_ns <= TICK_NSEC)
			cpu_data->tick_intercepts += PULSE;
	}
}
277
278
/**
279
* teo_find_shallower_state - Find shallower idle state matching given duration.
280
* @drv: cpuidle driver containing state data.
281
* @dev: Target CPU.
282
* @state_idx: Index of the capping idle state.
283
* @duration_ns: Idle duration value to match.
284
*/
285
static int teo_find_shallower_state(struct cpuidle_driver *drv,
286
struct cpuidle_device *dev, int state_idx,
287
s64 duration_ns)
288
{
289
int i;
290
291
for (i = state_idx - 1; i >= 0; i--) {
292
if (dev->states_usage[i].disable)
293
continue;
294
295
state_idx = i;
296
if (drv->states[i].target_residency_ns <= duration_ns)
297
break;
298
}
299
return state_idx;
300
}
301
302
/**
 * teo_select - Selects the next idle state to enter.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 * @stop_tick: Indication on whether or not to stop the scheduler tick.
 *
 * Returns the index of the idle state to enter.  *@stop_tick is cleared
 * (tick kept running) on the out_tick path: when there is no real choice
 * of states, when the chosen state is shallow, or when the expected idle
 * duration is shorter than the tick period.
 */
static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
		      bool *stop_tick)
{
	struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus);
	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
	/* Fallback time till the tick, used in case the call to
	 * tick_nohz_get_sleep_length() below is skipped. */
	ktime_t delta_tick = TICK_NSEC / 2;
	unsigned int idx_intercept_sum = 0;
	unsigned int intercept_sum = 0;
	unsigned int intercept_max = 0;
	unsigned int idx_hit_sum = 0;
	unsigned int hit_sum = 0;
	int intercept_max_idx = -1;
	int constraint_idx = 0;
	int idx0 = 0, idx = -1;
	s64 duration_ns;
	int i;

	/* Fold the previous sleep/wakeup cycle into the metrics first. */
	if (dev->last_state_idx >= 0) {
		teo_update(drv, dev);
		dev->last_state_idx = -1;
	}

	/*
	 * Set the sleep length to infinity in case the invocation of
	 * tick_nohz_get_sleep_length() below is skipped, in which case it won't
	 * be known whether or not the subsequent wakeup is caused by a timer.
	 * It is generally fine to count the wakeup as an intercept then, except
	 * for the cases when the CPU is mostly woken up by timers and there may
	 * be opportunities to ask for a deeper idle state when no imminent
	 * timers are scheduled which may be missed.
	 */
	cpu_data->sleep_length_ns = KTIME_MAX;

	/* Check if there is any choice in the first place. */
	if (drv->state_count < 2) {
		idx = 0;
		goto out_tick;
	}

	if (!dev->states_usage[0].disable)
		idx = 0;

	/*
	 * Compute the sums of metrics for early wakeup pattern detection and
	 * look for the state bin with the maximum intercepts metric below the
	 * deepest enabled one (if there are multiple states with the maximum
	 * intercepts metric, choose the one with the highest index).
	 */
	for (i = 1; i < drv->state_count; i++) {
		struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
		unsigned int prev_intercepts = prev_bin->intercepts;
		struct cpuidle_state *s = &drv->states[i];

		/*
		 * Update the sums of idle state metrics for all of the states
		 * shallower than the current one.
		 */
		hit_sum += prev_bin->hits;
		intercept_sum += prev_intercepts;
		/*
		 * Check if this is the bin with the maximum number of
		 * intercepts so far and in that case update the index of
		 * the state with the maximum intercepts metric (">=" keeps
		 * the highest index on ties).
		 */
		if (prev_intercepts >= intercept_max) {
			intercept_max = prev_intercepts;
			intercept_max_idx = i - 1;
		}

		if (dev->states_usage[i].disable)
			continue;

		if (idx < 0)
			idx0 = i; /* first enabled state */

		idx = i;

		if (s->exit_latency_ns <= latency_req)
			constraint_idx = i;

		/* Save the sums for the current state. */
		idx_intercept_sum = intercept_sum;
		idx_hit_sum = hit_sum;
	}

	/* Avoid unnecessary overhead. */
	if (idx < 0) {
		idx = 0; /* No states enabled, must use 0. */
		goto out_tick;
	}

	if (idx == idx0) {
		/*
		 * Only one idle state is enabled, so use it, but do not allow
		 * the tick to be stopped if that state is too shallow (the
		 * "end" path below makes that decision from duration_ns).
		 */
		duration_ns = drv->states[idx].target_residency_ns;
		goto end;
	}

	/*
	 * If the sum of the intercepts metric for all of the idle states
	 * shallower than the current candidate one (idx) is greater than the
	 * sum of the intercepts and hits metrics for the candidate state and
	 * all of the deeper states, a shallower idle state is likely to be a
	 * better choice.
	 */
	if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) {
		int min_idx = idx0;

		if (tick_nohz_tick_stopped()) {
			/*
			 * Look for the shallowest idle state below the current
			 * candidate one whose target residency is at least
			 * equal to the tick period length.
			 */
			while (min_idx < idx &&
			       drv->states[min_idx].target_residency_ns < TICK_NSEC)
				min_idx++;

			/*
			 * Avoid selecting a state with a lower index, but with
			 * the same target residency as the current candidate
			 * one.
			 */
			if (drv->states[min_idx].target_residency_ns ==
			    drv->states[idx].target_residency_ns)
				goto constraint;
		}

		/*
		 * If the minimum state index is greater than or equal to the
		 * index of the state with the maximum intercepts metric and
		 * the corresponding state is enabled, there is no need to look
		 * at the deeper states.
		 */
		if (min_idx >= intercept_max_idx &&
		    !dev->states_usage[min_idx].disable) {
			idx = min_idx;
			goto constraint;
		}

		/*
		 * Look for the deepest enabled idle state, at most as deep as
		 * the one with the maximum intercepts metric, whose target
		 * residency had not been greater than the idle duration in over
		 * a half of the relevant cases in the past.
		 *
		 * Take the possible duration limitation present if the tick
		 * has been stopped already into account.
		 */
		for (i = idx - 1, intercept_sum = 0; i >= min_idx; i--) {
			intercept_sum += cpu_data->state_bins[i].intercepts;

			if (dev->states_usage[i].disable)
				continue;

			idx = i;
			if (2 * intercept_sum > idx_intercept_sum &&
			    i <= intercept_max_idx)
				break;
		}
	}

constraint:
	/*
	 * If there is a latency constraint, it may be necessary to select an
	 * idle state shallower than the current candidate one.
	 */
	if (idx > constraint_idx)
		idx = constraint_idx;

	/*
	 * If either the candidate state is state 0 or its target residency is
	 * low enough, there is basically nothing more to do, but if the sleep
	 * length is not updated, the subsequent wakeup will be counted as an
	 * "intercept" which may be problematic in the cases when timer wakeups
	 * are dominant.  Namely, it may effectively prevent deeper idle states
	 * from being selected at one point even if no imminent timers are
	 * scheduled.
	 *
	 * However, frequent timers in the RESIDENCY_THRESHOLD_NS range on one
	 * CPU are unlikely (user space has a default 50 us slack value for
	 * hrtimers and there are relatively few timers with a lower deadline
	 * value in the kernel), and even if they did happen, the potential
	 * benefit from using a deep idle state in that case would be
	 * questionable anyway for latency reasons.  Thus if the measured idle
	 * duration falls into that range in the majority of cases, assume
	 * non-timer wakeups to be dominant and skip updating the sleep length
	 * to reduce latency.
	 *
	 * Also, if the latency constraint is sufficiently low, it will force
	 * shallow idle states regardless of the wakeup type, so the sleep
	 * length need not be known in that case.
	 */
	if ((!idx || drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) &&
	    (2 * cpu_data->short_idles >= cpu_data->total ||
	     latency_req < LATENCY_THRESHOLD_NS))
		goto out_tick;

	duration_ns = tick_nohz_get_sleep_length(&delta_tick);
	cpu_data->sleep_length_ns = duration_ns;

	if (!idx)
		goto out_tick;

	/*
	 * If the closest expected timer is before the target residency of the
	 * candidate state, a shallower one needs to be found.
	 */
	if (drv->states[idx].target_residency_ns > duration_ns)
		idx = teo_find_shallower_state(drv, dev, idx, duration_ns);

	/*
	 * If the selected state's target residency is below the tick length
	 * and intercepts occurring before the tick length are the majority of
	 * total wakeup events, do not stop the tick.
	 */
	if (drv->states[idx].target_residency_ns < TICK_NSEC &&
	    3 * cpu_data->tick_intercepts >= 2 * cpu_data->total)
		duration_ns = TICK_NSEC / 2;

end:
	/*
	 * Allow the tick to be stopped unless the selected state is a polling
	 * one or the expected idle duration is shorter than the tick period
	 * length.
	 */
	if ((!(drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
	    duration_ns >= TICK_NSEC) || tick_nohz_tick_stopped())
		return idx;

	/*
	 * The tick is not going to be stopped, so if the target residency of
	 * the state to be returned is not within the time till the closest
	 * timer including the tick, try to correct that.
	 */
	if (idx > idx0 &&
	    drv->states[idx].target_residency_ns > delta_tick)
		idx = teo_find_shallower_state(drv, dev, idx, delta_tick);

out_tick:
	*stop_tick = false;
	return idx;
}
553
554
/**
555
* teo_reflect - Note that governor data for the CPU need to be updated.
556
* @dev: Target CPU.
557
* @state: Entered state.
558
*/
559
static void teo_reflect(struct cpuidle_device *dev, int state)
560
{
561
struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus);
562
563
cpu_data->tick_wakeup = tick_nohz_idle_got_tick();
564
565
dev->last_state_idx = state;
566
}
567
568
/**
569
* teo_enable_device - Initialize the governor's data for the target CPU.
570
* @drv: cpuidle driver (not used).
571
* @dev: Target CPU.
572
*/
573
static int teo_enable_device(struct cpuidle_driver *drv,
574
struct cpuidle_device *dev)
575
{
576
struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
577
578
memset(cpu_data, 0, sizeof(*cpu_data));
579
580
return 0;
581
}
582
583
/* Governor descriptor registered with the cpuidle core. */
static struct cpuidle_governor teo_governor = {
	.name =		"teo",
	/* Selection priority; the cpuidle core prefers higher ratings. */
	.rating =	19,
	.enable =	teo_enable_device,
	.select =	teo_select,
	.reflect =	teo_reflect,
};

/* Register the governor early so it is available when drivers load. */
static int __init teo_governor_init(void)
{
	return cpuidle_register_governor(&teo_governor);
}

postcore_initcall(teo_governor_init);