Path: blob/master/drivers/cpuidle/governors/menu.c
/*
 * menu.c - the menu idle governor
 *
 * Copyright (C) 2006-2007 Adam Belay <[email protected]>
 * Copyright (C) 2009 Intel Corporation
 * Author:
 *	Arjan van de Ven <[email protected]>
 *
 * This code is licensed under the GPL version 2 as described
 * in the COPYING file that accompanies the Linux Kernel.
 */

#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/pm_qos_params.h>
#include <linux/time.h>
#include <linux/ktime.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/sched.h>
#include <linux/math64.h>

#define BUCKETS 12
#define INTERVALS 8
#define RESOLUTION 1024
#define DECAY 8
#define MAX_INTERESTING 50000
#define STDDEV_THRESH 400


/*
 * Concepts and ideas behind the menu governor
 *
 * For the menu governor, there are 3 decision factors for picking a C
 * state:
 * 1) Energy break even point
 * 2) Performance impact
 * 3) Latency tolerance (from pmqos infrastructure)
 * These three factors are treated independently.
 *
 * Energy break even point
 * -----------------------
 * C state entry and exit have an energy cost, and a certain amount of time in
 * the C state is required to actually break even on this cost. CPUIDLE
 * provides us this duration in the "target_residency" field. So all that we
 * need is a good prediction of how long we'll be idle. Like the traditional
 * menu governor, we start with the actual known "next timer event" time.
 *
 * Since there are other sources of wakeups (interrupts for example) than
 * the next timer event, this estimation is rather optimistic. To get a
 * more realistic estimate, a correction factor is applied to the estimate,
 * based on historic behavior. For example, if in the past the actual
 * duration always was 50% of the next timer tick, the correction factor will
 * be 0.5.
 *
 * menu uses a running average for this correction factor, but it uses a
 * set of factors, not just a single factor. This stems from the realization
 * that the ratio is dependent on the order of magnitude of the expected
 * duration; if we expect 500 milliseconds of idle time the likelihood of
 * getting an interrupt very early is much higher than if we expect 50
 * microseconds of idle time.
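 *
 * (A worked example with illustrative numbers, not taken from the original
 * source: RESOLUTION * DECAY = 8192 encodes a correction ratio of 1.0, so
 * a stored factor of 4096 means past sleeps averaged half of the timer
 * estimate, and a "next timer event" estimate of 1000 usec is corrected
 * down to a predicted 500 usec of idle time.)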
 *
 * A second independent factor that has a big impact on the actual factor
 * is whether there is (disk) IO outstanding or not.
 * (As a special twist, we consider every sleep longer than 50 milliseconds
 * as perfect; there are no power gains for sleeping longer than this.)
 *
 * For these two reasons we keep an array of 12 independent factors, which
 * gets indexed based on the magnitude of the expected duration as well as
 * the "is IO outstanding" property.
 *
 * Repeatable-interval-detector
 * ----------------------------
 * There are some cases where "next timer" is a completely unusable
 * predictor: those cases where the interval is fixed, for example due to
 * hardware interrupt mitigation, but also due to fixed transfer rate
 * devices such as mice.
 * For this, we use a different predictor: we track the duration of the
 * last 8 intervals and if the standard deviation of these 8 intervals is
 * below a threshold value, we use the average of these intervals as the
 * prediction.
 *
 * Limiting Performance Impact
 * ---------------------------
 * C states, especially those with large exit latencies, can have a real
 * noticeable impact on workloads, which is not acceptable for most
 * sysadmins, and in addition, reduced performance has a power cost of
 * its own.
 *
 * As a general rule of thumb, menu assumes that the following heuristic
 * holds:
 *     The busier the system, the less impact of C states is acceptable
 *
 * This rule-of-thumb is implemented using a performance multiplier:
 * if the exit latency times the performance multiplier is longer than
 * the predicted duration, the C state is not considered a candidate
 * for selection due to too high a performance impact. So the higher
 * this multiplier is, the longer we need to be idle to pick a deep C
 * state, and thus the less likely a busy CPU will hit such a deep
 * C state.
 *
 * Two factors are used in determining this multiplier:
 * a value of 20 is added for each point of "per cpu load average" we have,
 * and a value of 10 is added for each process that is waiting for IO on
 * this CPU.
 * (These values are experimentally determined.)
 *
 * The load average factor gives a longer term (few seconds) input to the
 * decision, while the iowait value gives a cpu-local instantaneous input.
 * The iowait factor may look low, but realize that this is also already
 * represented in the system load average.
 *
 */

struct menu_device {
	int		last_state_idx;
	int		needs_update;

	unsigned int	expected_us;
	u64		predicted_us;
	unsigned int	exit_us;
	unsigned int	bucket;
	u64		correction_factor[BUCKETS];
	u32		intervals[INTERVALS];
	int		interval_ptr;
};


#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)

static int get_loadavg(void)
{
	unsigned long this = this_cpu_load();

	return LOAD_INT(this) * 10 + LOAD_FRAC(this) / 10;
}

static inline int which_bucket(unsigned int duration)
{
	int bucket = 0;

	/*
	 * We keep two groups of stats; one with IO pending, one without.
	 * This allows us to calculate
	 * E(duration)|iowait
	 */
	if (nr_iowait_cpu(smp_processor_id()))
		bucket = BUCKETS/2;

	if (duration < 10)
		return bucket;
	if (duration < 100)
		return bucket + 1;
	if (duration < 1000)
		return bucket + 2;
	if (duration < 10000)
		return bucket + 3;
	if (duration < 100000)
		return bucket + 4;
	return bucket + 5;
}
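/*
 * Worked example for which_bucket() above (illustrative numbers, not from
 * the original source): an expected duration of 250 usec falls in the
 * "< 1000" band and yields bucket 2 with no IO pending, or
 * BUCKETS/2 + 2 = 8 when nr_iowait_cpu() is non-zero, so each of the
 * twelve buckets accumulates its own correction-factor history.
 */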
/*
 * Return a multiplier for the exit latency that is intended
 * to take performance requirements into account.
 * The more performance critical we estimate the system
 * to be, the higher this multiplier, and thus the higher
 * the barrier to go to an expensive C state.
 */
static inline int performance_multiplier(void)
{
	int mult = 1;

	/* for higher loadavg, we are more reluctant */
	mult += 2 * get_loadavg();

	/* for IO wait tasks (per cpu!) we add 10x each */
	mult += 10 * nr_iowait_cpu(smp_processor_id());

	return mult;
}

static DEFINE_PER_CPU(struct menu_device, menu_devices);

static void menu_update(struct cpuidle_device *dev);

/* This implements DIV_ROUND_CLOSEST but avoids 64 bit division */
static u64 div_round64(u64 dividend, u32 divisor)
{
	return div_u64(dividend + (divisor / 2), divisor);
}

/*
 * Try detecting repeating patterns by keeping track of the last 8
 * intervals, and checking if the standard deviation of that set
 * of points is below a threshold. If it is, then use the
 * average of these 8 points as the estimated value.
 */
static void detect_repeating_patterns(struct menu_device *data)
{
	int i;
	uint64_t avg = 0;
	uint64_t stddev = 0; /* contains the square of the std deviation */

	/* first calculate average and standard deviation of the past */
	for (i = 0; i < INTERVALS; i++)
		avg += data->intervals[i];
	avg = avg / INTERVALS;

	/* if the avg is beyond the known next tick, it's worthless */
	if (avg > data->expected_us)
		return;

	for (i = 0; i < INTERVALS; i++)
		stddev += (data->intervals[i] - avg) *
			  (data->intervals[i] - avg);

	stddev = stddev / INTERVALS;

	/*
	 * Now, if stddev is small, assume we have a repeating pattern
	 * and predict we keep doing this.
	 */
	if (avg && stddev < STDDEV_THRESH)
		data->predicted_us = avg;
}
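/*
 * Worked example for detect_repeating_patterns() above (illustrative
 * numbers, not from the original source): eight recorded intervals of
 * 100, 102, 98, 101, 99, 100, 103 and 97 usec average to exactly 100 usec,
 * and the summed squared deviations are 0+4+4+1+1+0+9+9 = 28, so the
 * "stddev" variable (really the variance, in integer math) becomes
 * 28 / 8 = 3. That is well below STDDEV_THRESH (400), so, assuming
 * expected_us is at least 100 usec, predicted_us is set to 100.
 */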
/**
 * menu_select - selects the next idle state to enter
 * @dev: the CPU
 */
static int menu_select(struct cpuidle_device *dev)
{
	struct menu_device *data = &__get_cpu_var(menu_devices);
	int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
	unsigned int power_usage = -1;
	int i;
	int multiplier;
	struct timespec t;

	if (data->needs_update) {
		menu_update(dev);
		data->needs_update = 0;
	}

	data->last_state_idx = 0;
	data->exit_us = 0;

	/* Special case when user has set very strict latency requirement */
	if (unlikely(latency_req == 0))
		return 0;

	/* determine the expected residency time, round up */
	t = ktime_to_timespec(tick_nohz_get_sleep_length());
	data->expected_us =
		t.tv_sec * USEC_PER_SEC + t.tv_nsec / NSEC_PER_USEC;

	data->bucket = which_bucket(data->expected_us);

	multiplier = performance_multiplier();

	/*
	 * if the correction factor is 0 (e.g. first-time init or cpu hotplug
	 * etc), we actually want to start out with a unity factor.
	 */
	if (data->correction_factor[data->bucket] == 0)
		data->correction_factor[data->bucket] = RESOLUTION * DECAY;

	/* Make sure to round up for half microseconds */
	data->predicted_us = div_round64(data->expected_us *
					 data->correction_factor[data->bucket],
					 RESOLUTION * DECAY);

	detect_repeating_patterns(data);

	/*
	 * We want to default to C1 (hlt), not to busy polling
	 * unless the timer is happening really, really soon.
	 */
	if (data->expected_us > 5)
		data->last_state_idx = CPUIDLE_DRIVER_STATE_START;

	/*
	 * Find the idle state with the lowest power while satisfying
	 * our constraints.
	 */
	for (i = CPUIDLE_DRIVER_STATE_START; i < dev->state_count; i++) {
		struct cpuidle_state *s = &dev->states[i];

		if (s->flags & CPUIDLE_FLAG_IGNORE)
			continue;
		if (s->target_residency > data->predicted_us)
			continue;
		if (s->exit_latency > latency_req)
			continue;
		if (s->exit_latency * multiplier > data->predicted_us)
			continue;

		if (s->power_usage < power_usage) {
			power_usage = s->power_usage;
			data->last_state_idx = i;
			data->exit_us = s->exit_latency;
		}
	}

	return data->last_state_idx;
}

/**
 * menu_reflect - records that data structures need update
 * @dev: the CPU
 *
 * NOTE: it's important to be fast here because this operation will add to
 * the overall exit latency.
 */
static void menu_reflect(struct cpuidle_device *dev)
{
	struct menu_device *data = &__get_cpu_var(menu_devices);
	data->needs_update = 1;
}

/**
 * menu_update - attempts to guess what happened after entry
 * @dev: the CPU
 */
static void menu_update(struct cpuidle_device *dev)
{
	struct menu_device *data = &__get_cpu_var(menu_devices);
	int last_idx = data->last_state_idx;
	unsigned int last_idle_us = cpuidle_get_last_residency(dev);
	struct cpuidle_state *target = &dev->states[last_idx];
	unsigned int measured_us;
	u64 new_factor;

	/*
	 * Ugh, this idle state doesn't support residency measurements, so we
	 * are basically lost in the dark. As a compromise, assume we slept
	 * for the whole expected time.
	 */
	if (unlikely(!(target->flags & CPUIDLE_FLAG_TIME_VALID)))
		last_idle_us = data->expected_us;

	measured_us = last_idle_us;

	/*
	 * We correct for the exit latency; we are assuming here that the
	 * exit latency happens after the event that we're interested in.
	 */
	if (measured_us > data->exit_us)
		measured_us -= data->exit_us;

	/* update our correction ratio */

	new_factor = data->correction_factor[data->bucket]
			* (DECAY - 1) / DECAY;

	if (data->expected_us > 0 && measured_us < MAX_INTERESTING)
		new_factor += RESOLUTION * measured_us / data->expected_us;
	else
		/*
		 * we were idle so long that we count it as a perfect
		 * prediction
		 */
		new_factor += RESOLUTION;

	/*
	 * We don't want 0 as factor; we always want at least
	 * a tiny bit of estimated time.
	 */
	if (new_factor == 0)
		new_factor = 1;

	data->correction_factor[data->bucket] = new_factor;

	/* update the repeating-pattern data */
	data->intervals[data->interval_ptr++] = last_idle_us;
	if (data->interval_ptr >= INTERVALS)
		data->interval_ptr = 0;
}

/**
 * menu_enable_device - scans a CPU's states and does setup
 * @dev: the CPU
 */
static int menu_enable_device(struct cpuidle_device *dev)
{
	struct menu_device *data = &per_cpu(menu_devices, dev->cpu);

	memset(data, 0, sizeof(struct menu_device));

	return 0;
}

static struct cpuidle_governor menu_governor = {
	.name =		"menu",
	.rating =	20,
	.enable =	menu_enable_device,
	.select =	menu_select,
	.reflect =	menu_reflect,
	.owner =	THIS_MODULE,
};

/**
 * init_menu - initializes the governor
 */
static int __init init_menu(void)
{
	return cpuidle_register_governor(&menu_governor);
}

/**
 * exit_menu - exits the governor
 */
static void __exit exit_menu(void)
{
	cpuidle_unregister_governor(&menu_governor);
}

MODULE_LICENSE("GPL");
module_init(init_menu);
module_exit(exit_menu);
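/*
 * A worked example of the correction-factor update in menu_update() above
 * (illustrative numbers, not from the original source): starting from the
 * unity factor RESOLUTION * DECAY = 8192, an expected_us of 1000 and a
 * measured_us of 500 give
 *
 *	new_factor = 8192 * 7 / 8 + 1024 * 500 / 1000 = 7168 + 512 = 7680
 *
 * so the stored ratio decays from 1.0 towards the observed 0.5; after one
 * sleep it is 7680 / 8192 ~= 0.94, and with repeated 50% wakeups it
 * converges on 4096, i.e. the 0.5 ratio used in the example near the top
 * of this file.
 */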