/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Header file for the BFQ I/O scheduler: data structures and
 * prototypes of interface functions among BFQ components.
 */
#ifndef _BFQ_H
#define _BFQ_H

#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>

#include "blk-cgroup-rwstat.h"

#define BFQ_IOPRIO_CLASSES	3
#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)

#define BFQ_MIN_WEIGHT			1
#define BFQ_MAX_WEIGHT			1000
#define BFQ_WEIGHT_CONVERSION_COEFF	10

#define BFQ_DEFAULT_QUEUE_IOPRIO	4

#define BFQ_DEFAULT_GRP_IOPRIO	0
#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE

#define MAX_BFQQ_NAME_LENGTH 16

/*
 * Soft real-time applications are far more latency-sensitive than
 * interactive ones. Over-raise the weight of the former to privilege
 * them against the latter.
 */
#define BFQ_SOFTRT_WEIGHT_FACTOR	100

/*
 * Maximum number of actuators supported. This constant is used simply
 * to define the size of the static array that will contain
 * per-actuator data. The current value is hopefully a good upper
 * bound to the possible number of actuators of any actual drive.
 */
#define BFQ_MAX_ACTUATORS 8

struct bfq_entity;

/**
 * struct bfq_service_tree - per ioprio_class service tree.
 *
 * Each service tree represents a B-WF2Q+ scheduler on its own. Each
 * ioprio_class has its own independent scheduler, and so its own
 * bfq_service_tree. All the fields are protected by the queue lock
 * of the containing bfqd.
 */
struct bfq_service_tree {
	/* tree for active entities (i.e., those backlogged) */
	struct rb_root active;
	/* tree for idle entities (i.e., not backlogged, with V < F_i) */
	struct rb_root idle;

	/* idle entity with minimum F_i */
	struct bfq_entity *first_idle;
	/* idle entity with maximum F_i */
	struct bfq_entity *last_idle;

	/* scheduler virtual time */
	u64 vtime;
	/* scheduler weight sum; active and idle entities contribute to it */
	unsigned long wsum;
};

/**
 * struct bfq_sched_data - multi-class scheduler.
 *
 * bfq_sched_data is the basic scheduler queue. It supports three
 * ioprio_classes, and can be used either as a toplevel queue or as an
 * intermediate queue in a hierarchical setup.
 *
 * The supported ioprio_classes are the same as in CFQ, in descending
 * priority order: IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
 * Requests from higher-priority queues are served before all the
 * requests from lower-priority queues; among requests of the same
 * queue, requests are served according to B-WF2Q+.
 *
 * The schedule is implemented by the service trees, plus the field
 * @next_in_service, which points to the entity on the active trees
 * that will be served next, if 1) no changes in the schedule occur
 * before the current in-service entity is expired, 2) the in-service
 * queue becomes idle when it expires, and 3) if the entity pointed to
 * by in_service_entity is not a queue, then the in-service child
 * entity of the entity pointed to by in_service_entity becomes idle
 * on expiration. This peculiar definition allows for the following
 * optimization, not yet exploited: while a given entity is still in
 * service, we already know which is the best candidate for next
 * service among the other active entities in the same parent
 * entity. We can then quickly compare the timestamps of the
 * in-service entity with those of such best candidate.
 *
 * All fields are protected by the lock of the containing bfqd.
 */
struct bfq_sched_data {
	/* entity in service */
	struct bfq_entity *in_service_entity;
	/* head-of-line entity (see comments above) */
	struct bfq_entity *next_in_service;
	/* array of service trees, one per ioprio_class */
	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
	/* last time CLASS_IDLE was served */
	unsigned long bfq_class_idle_last_service;
};
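
/*
 * The optimization mentioned above would reduce to a plain timestamp
 * comparison, with no tree lookup, since both pointers are at hand.
 * A sketch (not actual BFQ code; real comparisons must also cope with
 * wrapping timestamps):
 *
 *	if (sd->next_in_service &&
 *	    sd->next_in_service->finish < sd->in_service_entity->finish)
 *		... // the cached candidate would be served earlier
 */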

/**
 * struct bfq_weight_counter - counter of the number of all active queues
 *                             with a given weight.
 */
struct bfq_weight_counter {
	unsigned int weight; /* weight of the queues this counter refers to */
	unsigned int num_active; /* nr of active queues with this weight */
	/*
	 * Weights tree member (see bfq_data's @queue_weights_tree)
	 */
	struct rb_node weights_node;
};

/**
 * struct bfq_entity - schedulable entity.
 *
 * A bfq_entity is used to represent either a bfq_queue (leaf node in the
 * cgroup hierarchy) or a bfq_group in the upper-level scheduler. Each
 * entity belongs to the sched_data of the parent group in the cgroup
 * hierarchy. Non-leaf entities also have their own sched_data, stored
 * in @my_sched_data.
 *
 * Each entity stores independently its priority values; this would
 * allow different weights on different devices, but this
 * functionality is not exported to userspace for now. Priorities and
 * weights are updated lazily, first storing the new values into the
 * new_* fields, then setting the @prio_changed flag. As soon as
 * there is a transition in the entity state that allows the priority
 * update to take place, the effective and the requested priority
 * values are synchronized.
 *
 * Unless cgroups are used, the weight value is calculated from the
 * ioprio to export the same interface as CFQ. When dealing with
 * "well-behaved" queues (i.e., queues that do not spend too much
 * time to consume their budget and have true sequential behavior, and
 * when there are no external factors breaking anticipation) the
 * relative weights at each level of the cgroups hierarchy should be
 * guaranteed. All the fields are protected by the queue lock of the
 * containing bfqd.
 */
struct bfq_entity {
	/* service_tree member */
	struct rb_node rb_node;

	/*
	 * Flag, true if the entity is on a tree (either the active or
	 * the idle one of its service_tree) or is in service.
	 */
	bool on_st_or_in_serv;

	/* B-WF2Q+ start and finish timestamps [sectors/weight] */
	u64 start, finish;

	/* tree the entity is enqueued into; %NULL if not on a tree */
	struct rb_root *tree;

	/*
	 * minimum start time of the (active) subtree rooted at this
	 * entity; used for O(log N) lookups into active trees
	 */
	u64 min_start;

	/* amount of service received during the last service slot */
	int service;

	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
	int budget;

	/* Number of requests allocated in the subtree of this entity */
	int allocated;

	/* device weight; if non-zero, it overrides the default weight of
	 * bfq_group_data */
	int dev_weight;
	/* weight of the queue */
	int weight;
	/* next weight if a change is in progress */
	int new_weight;

	/* original weight, used to implement weight boosting */
	int orig_weight;

	/* parent entity, for hierarchical scheduling */
	struct bfq_entity *parent;

	/*
	 * For non-leaf nodes in the hierarchy, the associated
	 * scheduler queue, %NULL on leaf nodes.
	 */
	struct bfq_sched_data *my_sched_data;
	/* the scheduler queue this entity belongs to */
	struct bfq_sched_data *sched_data;

	/* flag, set to request a weight, ioprio or ioprio_class change */
	int prio_changed;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	/* flag, set if the entity is counted in groups_with_pending_reqs */
	bool in_groups_with_pending_reqs;
#endif

	/* last child queue of entity created (for non-leaf entities) */
	struct bfq_queue *last_bfqq_created;
};
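
/*
 * Minimal sketch of the lazy-update protocol described above (assumed
 * code, for illustration only): writers store into the new_* fields
 * and raise @prio_changed; at the next safe transition the effective
 * values catch up:
 *
 *	entity->new_weight = new_weight;
 *	entity->prio_changed = 1;
 *	...
 *	if (entity->prio_changed) {
 *		entity->weight = entity->new_weight;
 *		entity->prio_changed = 0;
 *	}
 */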

struct bfq_group;

/**
 * struct bfq_ttime - per process thinktime stats.
 */
struct bfq_ttime {
	/* completion time of the last request */
	u64 last_end_request;

	/* total process thinktime */
	u64 ttime_total;
	/* number of thinktime samples */
	unsigned long ttime_samples;
	/* average process thinktime */
	u64 ttime_mean;
};
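
/*
 * Sketch of how a think-time sample could be folded into the fields
 * above (illustrative; BFQ's real update, in bfq-iosched.c, uses a
 * decayed, fixed-point variant of this):
 *
 *	u64 elapsed = now_ns - ttime->last_end_request;
 *
 *	ttime->ttime_samples++;
 *	ttime->ttime_total += elapsed;
 *	ttime->ttime_mean = div64_ul(ttime->ttime_total,
 *				     ttime->ttime_samples);
 */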

/**
 * struct bfq_queue - leaf schedulable entity.
 *
 * A bfq_queue is a leaf request queue; it can be associated with an
 * io_context or more, if it is async or shared between cooperating
 * processes. Besides, it contains I/O requests for only one actuator
 * (an io_context is associated with a different bfq_queue for each
 * actuator it generates I/O for). @cgroup holds a reference to the
 * cgroup, to be sure that it does not disappear while a bfqq still
 * references it (mostly to avoid races between request issuing and
 * task migration followed by cgroup destruction). All the fields are
 * protected by the queue lock of the containing bfqd.
 */
struct bfq_queue {
	/* reference counter */
	int ref;
	/* counter of references from other queues for delayed stable merge */
	int stable_ref;
	/* parent bfq_data */
	struct bfq_data *bfqd;

	/* current ioprio and ioprio class */
	unsigned short ioprio, ioprio_class;
	/* next ioprio and ioprio class if a change is in progress */
	unsigned short new_ioprio, new_ioprio_class;

	/* last total-service-time sample, see bfq_update_inject_limit() */
	u64 last_serv_time_ns;
	/* limit for request injection */
	unsigned int inject_limit;
	/* last time the inject limit has been decreased, in jiffies */
	unsigned long decrease_time_jif;

	/*
	 * Shared bfq_queue if queue is cooperating with one or more
	 * other queues.
	 */
	struct bfq_queue *new_bfqq;
	/* request-position tree member (see bfq_group's @rq_pos_tree) */
	struct rb_node pos_node;
	/* request-position tree root (see bfq_group's @rq_pos_tree) */
	struct rb_root *pos_root;

	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* number of sync and async requests queued */
	int queued[2];
	/* number of pending metadata requests */
	int meta_pending;
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* entity representing this queue in the scheduler */
	struct bfq_entity entity;

	/* pointer to the weight counter associated with this entity */
	struct bfq_weight_counter *weight_counter;

	/* maximum budget allowed from the feedback mechanism */
	int max_budget;
	/* budget expiration (in jiffies) */
	unsigned long budget_timeout;

	/* number of requests on the dispatch list or inside the driver */
	int dispatched;

	/* status flags */
	unsigned long flags;

	/* node for active/idle bfqq list inside parent bfqd */
	struct list_head bfqq_list;

	/* associated @bfq_ttime struct */
	struct bfq_ttime ttime;

	/* when bfqq started to do I/O within the last observation window */
	u64 io_start_time;
	/* how long bfqq has remained empty during the last observ. window */
	u64 tot_idle_time;

	/* bit vector: a 1 for each seeky request in history */
	u32 seek_history;
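
	/*
	 * Illustration of how this history register evolves (a sketch
	 * with an assumed helper name; the real update lives in
	 * bfq-iosched.c): each new request shifts one bit in,
	 *
	 *	bfqq->seek_history <<= 1;
	 *	bfqq->seek_history |= rq_is_seeky(bfqd, bfqq, rq);
	 *
	 * so checks on recent seekiness reduce to counting bits.
	 */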

	/* node for the device's burst list */
	struct hlist_node burst_list_node;

	/* position of the last request enqueued */
	sector_t last_request_pos;

	/* Number of consecutive pairs of request completion and
	 * arrival, such that the queue becomes idle after the
	 * completion, but the next request arrives within an idle
	 * time slice; used only if the queue's IO_bound flag has been
	 * cleared.
	 */
	unsigned int requests_within_timer;

	/* pid of the process owning the queue, used for logging purposes */
	pid_t pid;

	/*
	 * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
	 * if the queue is shared.
	 */
	struct bfq_io_cq *bic;

	/* current maximum weight-raising time for this queue */
	unsigned long wr_cur_max_time;
	/*
	 * Minimum time instant such that, only if a new request is
	 * enqueued after this time instant in an idle @bfq_queue with
	 * no outstanding requests, then the task associated with the
	 * queue is deemed soft real-time (see the comments on the
	 * function bfq_bfqq_softrt_next_start()).
	 */
	unsigned long soft_rt_next_start;
	/*
	 * Start time of the current weight-raising period if
	 * the @bfq_queue is being weight-raised, otherwise
	 * finish time of the last weight-raising period.
	 */
	unsigned long last_wr_start_finish;
	/* factor by which the weight of this queue is multiplied */
	unsigned int wr_coeff;
	/*
	 * Time of the last transition of the @bfq_queue from idle to
	 * backlogged.
	 */
	unsigned long last_idle_bklogged;
	/*
	 * Cumulative service received from the @bfq_queue since the
	 * last transition from idle to backlogged.
	 */
	unsigned long service_from_backlogged;
	/*
	 * Cumulative service received from the @bfq_queue since its
	 * last transition to weight-raised state.
	 */
	unsigned long service_from_wr;

	/*
	 * Value of wr start time when switching to soft rt
	 */
	unsigned long wr_start_at_switch_to_srt;

	unsigned long split_time; /* time of last split */

	unsigned long first_IO_time; /* time of first I/O for this queue */
	unsigned long creation_time; /* when this queue is created */

	/*
	 * Pointer to the waker queue for this queue, i.e., to the
	 * queue Q such that this queue happens to get new I/O right
	 * after some I/O request of Q is completed. For details, see
	 * the comments on the choice of the queue for injection in
	 * bfq_select_queue().
	 */
	struct bfq_queue *waker_bfqq;
	/* pointer to the curr. tentative waker queue, see bfq_check_waker() */
	struct bfq_queue *tentative_waker_bfqq;
	/* number of times the same tentative waker has been detected */
	unsigned int num_waker_detections;
	/* time when we started considering this waker */
	u64 waker_detection_started;

	/* node for woken_list, see below */
	struct hlist_node woken_list_node;
	/*
	 * Head of the list of the woken queues for this queue, i.e.,
	 * of the list of the queues for which this queue is a waker
	 * queue. This list is used to reset the waker_bfqq pointer in
	 * the woken queues when this queue exits.
	 */
	struct hlist_head woken_list;

	/* index of the actuator this queue is associated with */
	unsigned int actuator_idx;
};
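
/*
 * Sketch of the reset operation described for @woken_list above
 * (illustrative; the real cleanup runs on queue exit in
 * bfq-iosched.c):
 *
 *	struct bfq_queue *item;
 *	struct hlist_node *n;
 *
 *	hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
 *				  woken_list_node) {
 *		item->waker_bfqq = NULL;
 *		hlist_del_init(&item->woken_list_node);
 *	}
 */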

/**
 * struct bfq_iocq_bfqq_data - per-queue data unique and persistent for
 *                             the associated bfq_io_cq.
 */
struct bfq_iocq_bfqq_data {
	/*
	 * Snapshot of the has_short_ttime flag before merging; taken
	 * to remember its value while the queue is merged, so as to
	 * be able to restore it in case of split.
	 */
	bool saved_has_short_ttime;
	/*
	 * Same purpose as the previous field for the I/O-bound
	 * classification of a queue.
	 */
	bool saved_IO_bound;

	/*
	 * Same purpose as the previous fields for the value of the
	 * field keeping the queue's belonging to a large burst.
	 */
	bool saved_in_large_burst;
	/*
	 * True if the queue belonged to a burst list before its merge
	 * with another cooperating queue.
	 */
	bool was_in_burst_list;

	/*
	 * Save the weight when a merge occurs, to be able
	 * to restore it in case of split. If the weight is not
	 * correctly resumed when the queue is recycled,
	 * then the weight of the recycled queue could differ
	 * from the weight of the original queue.
	 */
	unsigned int saved_weight;

	u64 saved_io_start_time;
	u64 saved_tot_idle_time;

	/*
	 * Similar to previous fields: save wr information.
	 */
	unsigned long saved_wr_coeff;
	unsigned long saved_last_wr_start_finish;
	unsigned long saved_service_from_wr;
	unsigned long saved_wr_start_at_switch_to_srt;
	struct bfq_ttime saved_ttime;
	unsigned int saved_wr_cur_max_time;

	/* Save also injection state */
	unsigned int saved_inject_limit;
	unsigned long saved_decrease_time_jif;
	u64 saved_last_serv_time_ns;

	/* candidate queue for a stable merge (due to close creation time) */
	struct bfq_queue *stable_merge_bfqq;

	bool stably_merged;	/* non splittable if true */
};

/**
 * struct bfq_io_cq - per (request_queue, io_context) structure.
 */
struct bfq_io_cq {
	/* associated io_cq structure */
	struct io_cq icq; /* must be the first member */
	/*
	 * Matrix of associated process queues: first row for async
	 * queues, second row for sync queues. Each row contains one
	 * column for each actuator. An I/O request generated by the
	 * process is inserted into the queue pointed to by bfqq[i][j]
	 * if the request is to be served by the j-th actuator of the
	 * drive, where i==0 or i==1, depending on whether the request
	 * is async or sync. So there is a distinct queue for each
	 * actuator.
	 */
	struct bfq_queue *bfqq[2][BFQ_MAX_ACTUATORS];
	/* per (request_queue, blkcg) ioprio */
	int ioprio;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	uint64_t blkcg_serial_nr; /* the current blkcg serial */
#endif

	/*
	 * Persistent data for associated synchronous process queues
	 * (one queue per actuator, see field bfqq above). In
	 * particular, each of these queues may undergo a merge.
	 */
	struct bfq_iocq_bfqq_data bfqq_data[BFQ_MAX_ACTUATORS];

	unsigned int requests; /* Number of requests this process has in flight */
};
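
/*
 * Queue lookup in the matrix above is thus a plain double index, in
 * the spirit of bic_to_bfqq(), prototyped later in this header:
 *
 *	struct bfq_queue *bfqq = bic->bfqq[is_sync][actuator_idx];
 *
 * with is_sync selecting the row (0 async, 1 sync) and actuator_idx
 * the column.
 */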

/**
 * struct bfq_data - per-device data structure.
 *
 * All the fields are protected by @lock.
 */
struct bfq_data {
	/* device request queue */
	struct request_queue *queue;
	/* dispatch queue */
	struct list_head dispatch;

	/* root bfq_group for the device */
	struct bfq_group *root_group;

	/*
	 * rbtree of weight counters of @bfq_queues, sorted by
	 * weight. Used to keep track of whether all @bfq_queues have
	 * the same weight. The tree contains one counter for each
	 * distinct weight associated to some active and not
	 * weight-raised @bfq_queue (see the comments on the functions
	 * bfq_weights_tree_[add|remove] for further details).
	 */
	struct rb_root_cached queue_weights_tree;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	/*
	 * Number of groups with at least one process that
	 * has at least one request waiting for completion. Note that
	 * this accounts also for requests already dispatched, but not
	 * yet completed. Therefore this number of groups may differ
	 * from (be larger than) the number of active groups, as a
	 * group is considered active only if its corresponding entity
	 * has queues with at least one request queued. This
	 * number is used to decide whether a scenario is symmetric.
	 * For a detailed explanation see comments on the computation
	 * of the variable asymmetric_scenario in the function
	 * bfq_better_to_idle().
	 *
	 * However, it is hard to compute this number exactly, for
	 * groups with multiple processes. Consider a group
	 * that is inactive, i.e., that has no process with
	 * pending I/O inside BFQ queues. Then suppose that
	 * num_groups_with_pending_reqs is still accounting for this
	 * group, because the group has processes with some
	 * I/O request still in flight. num_groups_with_pending_reqs
	 * should be decremented when the in-flight request of the
	 * last process is finally completed (assuming that
	 * nothing else has changed for the group in the meantime, in
	 * terms of composition of the group and active/inactive state
	 * of child groups and processes). To accomplish this, an
	 * additional pending-request counter must be added to
	 * entities, and must be updated correctly. To avoid this
	 * additional field and operations, we resort to the following
	 * tradeoff between simplicity and accuracy: for an inactive
	 * group that is still counted in num_groups_with_pending_reqs,
	 * we decrement num_groups_with_pending_reqs when the first
	 * process of the group remains with no request waiting for
	 * completion.
	 *
	 * Even this simpler decrement strategy requires a little
	 * carefulness: to avoid multiple decrements, we flag a group,
	 * more precisely an entity representing a group, as still
	 * counted in num_groups_with_pending_reqs when it becomes
	 * inactive. Then, when the first queue of the
	 * entity remains with no request waiting for completion,
	 * num_groups_with_pending_reqs is decremented, and this flag
	 * is reset. After this flag is reset for the entity,
	 * num_groups_with_pending_reqs won't be decremented any
	 * longer in case a new queue of the entity remains
	 * with no request waiting for completion.
	 */
	unsigned int num_groups_with_pending_reqs;
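
	/*
	 * A sketch of the decrement strategy just described (assumed
	 * code, for illustration only):
	 *
	 *	// first queue of the group remains with no request
	 *	// waiting for completion:
	 *	if (entity->in_groups_with_pending_reqs) {
	 *		entity->in_groups_with_pending_reqs = false;
	 *		bfqd->num_groups_with_pending_reqs--;
	 *	}
	 *
	 * The flag makes the decrement happen at most once per
	 * activation period of the group.
	 */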
#endif

	/*
	 * Per-class (RT, BE, IDLE) number of bfq_queues containing
	 * requests (including the queue in service, even if it is
	 * idling).
	 */
	unsigned int busy_queues[3];
	/* number of weight-raised busy @bfq_queues */
	int wr_busy_queues;
	/* number of queued requests */
	int queued;
	/* number of requests dispatched and waiting for completion */
	int tot_rq_in_driver;
	/*
	 * number of requests dispatched and waiting for completion
	 * for each actuator
	 */
	int rq_in_driver[BFQ_MAX_ACTUATORS];

	/* true if the device is non-rotational and performs queueing */
	bool nonrot_with_queueing;

	/*
	 * Maximum number of requests in driver in the last
	 * @hw_tag_samples completed requests.
	 */
	int max_rq_in_driver;
	/* number of samples used to calculate hw_tag */
	int hw_tag_samples;
	/* flag set to one if the driver is showing queueing behavior */
	int hw_tag;

	/* number of budgets assigned */
	int budgets_assigned;

	/*
	 * Timer set when idling (waiting) for the next request from
	 * the queue in service.
	 */
	struct hrtimer idle_slice_timer;

	/* bfq_queue in service */
	struct bfq_queue *in_service_queue;

	/* on-disk position of the last served request */
	sector_t last_position;

	/* position of the last served request for the in-service queue */
	sector_t in_serv_last_pos;

	/* time of last request completion (ns) */
	u64 last_completion;

	/* bfqq owning the last completed rq */
	struct bfq_queue *last_completed_rq_bfqq;

	/* last bfqq created, among those in the root group */
	struct bfq_queue *last_bfqq_created;

	/* time of last transition from empty to non-empty (ns) */
	u64 last_empty_occupied_ns;

	/*
	 * Flag set to activate the sampling of the total service time
	 * of a just-arrived first I/O request (see
	 * bfq_update_inject_limit()). This will cause the setting of
	 * waited_rq when the request is finally dispatched.
	 */
	bool wait_dispatch;
	/*
	 * If set, then bfq_update_inject_limit() is invoked when
	 * waited_rq is eventually completed.
	 */
	struct request *waited_rq;
	/*
	 * True if some request has been injected during the last service hole.
	 */
	bool rqs_injected;

	/* time of first rq dispatch in current observation interval (ns) */
	u64 first_dispatch;
	/* time of last rq dispatch in current observation interval (ns) */
	u64 last_dispatch;

	/* beginning of the last budget */
	ktime_t last_budget_start;
	/* beginning of the last idle slice */
	ktime_t last_idling_start;
	unsigned long last_idling_start_jiffies;

	/* number of samples in current observation interval */
	int peak_rate_samples;
	/* num of samples of seq dispatches in current observation interval */
	u32 sequential_samples;
	/* total num of sectors transferred in current observation interval */
	u64 tot_sectors_dispatched;
	/* max rq size seen during current observation interval (sectors) */
	u32 last_rq_max_size;
	/* time elapsed from first dispatch in current observ. interval (us) */
	u64 delta_from_first;
	/*
	 * Current estimate of the device peak rate, measured in
	 * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by
	 * BFQ_RATE_SHIFT is performed to increase precision in
	 * fixed-point calculations.
	 */
	u32 peak_rate;
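
	/*
	 * E.g., converting the estimate back to sectors/usec is a
	 * plain right shift (BFQ_RATE_SHIFT is defined in
	 * bfq-iosched.c):
	 *
	 *	u64 rate = bfqd->peak_rate >> BFQ_RATE_SHIFT;
	 */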

	/* maximum budget allotted to a bfq_queue before rescheduling */
	int bfq_max_budget;

	/*
	 * List of all the bfq_queues active for a specific actuator
	 * on the device. Keeping active queues separate on a
	 * per-actuator basis helps implementing per-actuator
	 * injection more efficiently.
	 */
	struct list_head active_list[BFQ_MAX_ACTUATORS];
	/* list of all the bfq_queues idle on the device */
	struct list_head idle_list;

	/*
	 * Timeout for async/sync requests; when it fires, requests
	 * are served in fifo order.
	 */
	u64 bfq_fifo_expire[2];
	/* weight of backward seeks wrt forward ones */
	unsigned int bfq_back_penalty;
	/* maximum allowed backward seek */
	unsigned int bfq_back_max;
	/* maximum idling time */
	u32 bfq_slice_idle;

	/* user-configured max budget value (0 for auto-tuning) */
	int bfq_user_max_budget;
	/*
	 * Timeout for bfq_queues to consume their budget; used to
	 * prevent seeky queues from imposing long latencies on
	 * sequential or quasi-sequential ones (this also implies that
	 * seeky queues cannot receive guarantees in the service
	 * domain; after a timeout they are charged for the time they
	 * have been in service, to preserve fairness among them, but
	 * without service-domain guarantees).
	 */
	unsigned int bfq_timeout;

	/*
	 * Force device idling whenever needed to provide accurate
	 * service guarantees, without caring about throughput
	 * issues. CAVEAT: this may even increase latencies, in case
	 * of useless idling for processes that did stop doing I/O.
	 */
	bool strict_guarantees;

	/*
	 * Last time at which a queue entered the current burst of
	 * queues being activated shortly after each other; for more
	 * details about this and the following parameters related to
	 * a burst of activations, see the comments on the function
	 * bfq_handle_burst.
	 */
	unsigned long last_ins_in_burst;
	/*
	 * Reference time interval used to decide whether a queue has
	 * been activated shortly after @last_ins_in_burst.
	 */
	unsigned long bfq_burst_interval;
	/* number of queues in the current burst of queue activations */
	int burst_size;

	/* common parent entity for the queues in the burst */
	struct bfq_entity *burst_parent_entity;
	/* Maximum burst size above which the current queue-activation
	 * burst is deemed 'large'.
	 */
	unsigned long bfq_large_burst_thresh;
	/* true if a large queue-activation burst is in progress */
	bool large_burst;
	/*
	 * Head of the burst list (as for the above fields, more
	 * details in the comments on the function bfq_handle_burst).
	 */
	struct hlist_head burst_list;

	/* if set to true, low-latency heuristics are enabled */
	bool low_latency;
	/*
	 * Maximum factor by which the weight of a weight-raised queue
	 * is multiplied.
	 */
	unsigned int bfq_wr_coeff;

	/* Maximum weight-raising duration for soft real-time processes */
	unsigned int bfq_wr_rt_max_time;
	/*
	 * Minimum idle period after which weight-raising may be
	 * reactivated for a queue (in jiffies).
	 */
	unsigned int bfq_wr_min_idle_time;
	/*
	 * Minimum period between request arrivals after which
	 * weight-raising may be reactivated for an already busy async
	 * queue (in jiffies).
	 */
	unsigned long bfq_wr_min_inter_arr_async;

	/* Max service-rate for a soft real-time queue, in sectors/sec */
	unsigned int bfq_wr_max_softrt_rate;
	/*
	 * Cached value of the product ref_rate*ref_wr_duration, used
	 * for computing the maximum duration of weight raising
	 * automatically.
	 */
	u64 rate_dur_prod;
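
	/*
	 * Thanks to this cached product, deriving the maximum
	 * weight-raising duration reduces, roughly, to one division
	 * (a sketch; see bfq_wr_duration() in bfq-iosched.c for the
	 * exact, clamped computation):
	 *
	 *	u64 dur = bfqd->rate_dur_prod;
	 *
	 *	do_div(dur, bfqd->peak_rate);
	 */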

	/* fallback dummy bfqq for extreme OOM conditions */
	struct bfq_queue oom_bfqq;

	spinlock_t lock;

	/*
	 * bic associated with the task issuing current bio for
	 * merging. This and the next field are used as a support to
	 * be able to perform the bic lookup, needed by bio-merge
	 * functions, before the scheduler lock is taken, and thus
	 * avoid taking the request-queue lock while the scheduler
	 * lock is being held.
	 */
	struct bfq_io_cq *bio_bic;
	/* bfqq associated with the task issuing current bio for merging */
	struct bfq_queue *bio_bfqq;

	/*
	 * Depth limits used in bfq_limit_depth (see comments on the
	 * function)
	 */
	unsigned int async_depths[2][2];

	/*
	 * Number of independent actuators. This is equal to 1 in
	 * case of single-actuator drives.
	 */
	unsigned int num_actuators;
	/*
	 * Disk independent access ranges for each actuator
	 * in this device.
	 */
	sector_t sector[BFQ_MAX_ACTUATORS];
	sector_t nr_sectors[BFQ_MAX_ACTUATORS];
	struct blk_independent_access_range ia_ranges[BFQ_MAX_ACTUATORS];

	/*
	 * If the number of I/O requests queued in the device for a
	 * given actuator is below the next threshold, then the
	 * actuator is deemed underutilized. If this condition is
	 * found to hold for some actuator upon a dispatch, but (i)
	 * the in-service queue does not contain I/O for that
	 * actuator, while (ii) some other queue does contain I/O for
	 * that actuator, then the head I/O request of the latter
	 * queue is returned (injected), instead of the head request
	 * of the currently in-service queue.
	 *
	 * We set the threshold, empirically, to the minimum possible
	 * value for which an actuator is fully utilized, or close to
	 * being fully utilized. By doing so, injected I/O 'steals' as
	 * few drive-queue slots as possible from the in-service
	 * queue. This reduces as much as possible the probability
	 * that the service of I/O from the in-service bfq_queue gets
	 * delayed because of slot exhaustion, i.e., because all the
	 * slots of the drive queue are filled with I/O injected from
	 * other queues (NCQ provides for 32 slots).
	 */
	unsigned int actuator_load_threshold;
};
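
/*
 * Illustrative mapping of a request to an actuator via the ranges
 * stored above (a sketch; the real lookup in bfq-iosched.c works on
 * the position of the incoming bio):
 *
 *	unsigned int i;
 *
 *	for (i = 0; i < bfqd->num_actuators; i++)
 *		if (sector >= bfqd->sector[i] &&
 *		    sector < bfqd->sector[i] + bfqd->nr_sectors[i])
 *			return i;
 */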

enum bfqq_state_flags {
	BFQQF_just_created = 0,	/* queue just allocated */
	BFQQF_busy,		/* has requests or is in service */
	BFQQF_wait_request,	/* waiting for a request */
	BFQQF_non_blocking_wait_rq, /*
				     * waiting for a request
				     * without idling the device
				     */
	BFQQF_fifo_expire,	/* FIFO checked in this slice */
	BFQQF_has_short_ttime,	/* queue has a short think time */
	BFQQF_sync,		/* synchronous queue */
	BFQQF_IO_bound,		/*
				 * bfqq has timed-out at least once
				 * having consumed at most 2/10 of
				 * its budget
				 */
	BFQQF_in_large_burst,	/*
				 * bfqq activated in a large burst,
				 * see comments to bfq_handle_burst.
				 */
	BFQQF_softrt_update,	/*
				 * may need softrt-next-start
				 * update
				 */
	BFQQF_coop,		/* bfqq is shared */
	BFQQF_split_coop,	/* shared bfqq will be split */
};

#define BFQ_BFQQ_FNS(name)						\
void bfq_mark_bfqq_##name(struct bfq_queue *bfqq);			\
void bfq_clear_bfqq_##name(struct bfq_queue *bfqq);			\
int bfq_bfqq_##name(const struct bfq_queue *bfqq);

BFQ_BFQQ_FNS(just_created);
BFQ_BFQQ_FNS(busy);
BFQ_BFQQ_FNS(wait_request);
BFQ_BFQQ_FNS(non_blocking_wait_rq);
BFQ_BFQQ_FNS(fifo_expire);
BFQ_BFQQ_FNS(has_short_ttime);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(IO_bound);
BFQ_BFQQ_FNS(in_large_burst);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(softrt_update);
#undef BFQ_BFQQ_FNS
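
/*
 * For instance, BFQ_BFQQ_FNS(busy) above declares:
 *
 *	void bfq_mark_bfqq_busy(struct bfq_queue *bfqq);
 *	void bfq_clear_bfqq_busy(struct bfq_queue *bfqq);
 *	int bfq_bfqq_busy(const struct bfq_queue *bfqq);
 *
 * the corresponding definitions, which set, clear and test the
 * matching bit in a queue's @flags, live in bfq-iosched.c.
 */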

/* Expiration reasons. */
enum bfqq_expiration {
	BFQQE_TOO_IDLE = 0,	/*
				 * queue has been idling for
				 * too long
				 */
	BFQQE_BUDGET_TIMEOUT,	/* budget took too long to be used */
	BFQQE_BUDGET_EXHAUSTED,	/* budget consumed */
	BFQQE_NO_MORE_REQUESTS,	/* the queue has no more requests */
	BFQQE_PREEMPTED		/* preemption in progress */
};

struct bfq_stat {
	struct percpu_counter		cpu_cnt;
	atomic64_t			aux_cnt;
};

struct bfqg_stats {
	/* basic stats */
	struct blkg_rwstat		bytes;
	struct blkg_rwstat		ios;
#ifdef CONFIG_BFQ_CGROUP_DEBUG
	/* number of ios merged */
	struct blkg_rwstat		merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat		service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat		wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat		queued;
	/* total disk time and nr sectors dispatched by this group */
	struct bfq_stat			time;
	/* sum of number of ios queued across all samples */
	struct bfq_stat			avg_queue_size_sum;
	/* count of samples taken for average */
	struct bfq_stat			avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct bfq_stat			dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct bfq_stat			group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct bfq_stat			idle_time;
	/* total time with empty current active q with other requests queued */
	struct bfq_stat			empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	u64				start_group_wait_time;
	u64				start_idle_time;
	u64				start_empty_time;
	uint16_t			flags;
#endif /* CONFIG_BFQ_CGROUP_DEBUG */
};

#ifdef CONFIG_BFQ_GROUP_IOSCHED

/*
 * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
 *
 * @pd: the blkcg_policy_data that this structure inherits
 * @weight: weight of the bfq_group
 */
struct bfq_group_data {
	/* must be the first member */
	struct blkcg_policy_data pd;

	unsigned int weight;
};

/**
 * struct bfq_group - per (device, cgroup) data structure.
 * @entity: schedulable entity to insert into the parent group sched_data.
 * @sched_data: own sched_data, to contain child entities (they may be
 *              both bfq_queues and bfq_groups).
 * @bfqd: the bfq_data for the device this group acts upon.
 * @async_bfqq: array of async queues for all the tasks belonging to
 *              the group, one queue per ioprio value per ioprio_class,
 *              except for the idle class that has only one queue.
 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
 *             to avoid too many special cases during group creation/
 *             migration.
 * @stats: stats for this bfqg.
 * @active_entities: number of active entities belonging to the group;
 *                   unused for the root group. Used to know whether
 *                   there are groups with more than one active
 *                   @bfq_entity (see the comments to the function
 *                   bfq_bfqq_may_idle()).
 * @rq_pos_tree: rbtree sorted by next_request position, used when
 *               determining if two or more queues have interleaving
 *               requests (see bfq_find_close_cooperator()).
 *
 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
 * there is a set of bfq_groups, each one collecting the lower-level
 * entities belonging to the group that are acting on the same device.
 *
 * Locking works as follows:
 * o @bfqd is protected by the queue lock, RCU is used to access it
 *   from the readers.
 * o All the other fields are protected by the @bfqd queue lock.
 */
struct bfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	/* reference counter (see comments in bfq_bic_update_cgroup) */
	refcount_t ref;

	struct bfq_entity entity;
	struct bfq_sched_data sched_data;

	struct bfq_data *bfqd;

	struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS];
	struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS];

	struct bfq_entity *my_entity;

	int active_entities;
	int num_queues_with_pending_reqs;

	struct rb_root rq_pos_tree;

	struct bfqg_stats stats;
};

#else
struct bfq_group {
	struct bfq_entity entity;
	struct bfq_sched_data sched_data;

	struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS];
	struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS];

	struct rb_root rq_pos_tree;
};
#endif
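
/*
 * Async-queue selection sketch, in the spirit of what bfq-iosched.c
 * does with the arrays above: the first index of @async_bfqq picks the
 * class (0 for RT, 1 for BE), while the idle class collapses to one
 * queue per actuator:
 *
 *	case IOPRIO_CLASS_RT:
 *		return &bfqg->async_bfqq[0][ioprio][act_idx];
 *	case IOPRIO_CLASS_BE:
 *		return &bfqg->async_bfqq[1][ioprio][act_idx];
 *	case IOPRIO_CLASS_IDLE:
 *		return &bfqg->async_idle_bfqq[act_idx];
 */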

/* --------------- main algorithm interface ----------------- */

#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })

extern const int bfq_timeout;

struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync,
			      unsigned int actuator_idx);
void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync,
		  unsigned int actuator_idx);
struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_weights_tree_add(struct bfq_queue *bfqq);
void bfq_weights_tree_remove(struct bfq_queue *bfqq);
void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		     bool compensate, enum bfqq_expiration reason);
void bfq_put_queue(struct bfq_queue *bfqq);
void bfq_put_cooperator(struct bfq_queue *bfqq);
void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_schedule_dispatch(struct bfq_data *bfqd);
void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);

/* ------------ end of main algorithm interface -------------- */

/* ---------------- cgroups-support interface ---------------- */

void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq);
void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf);
void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf);
void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
				  u64 io_start_time_ns, blk_opf_t opf);
void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg);
void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		   struct bfq_group *bfqg);

#ifdef CONFIG_BFQ_CGROUP_DEBUG
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
			      blk_opf_t opf);
void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg);
#endif

void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg);
void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio);
void bfq_end_wr_async(struct bfq_data *bfqd);
struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio);
struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node);
void bfqg_and_blkg_put(struct bfq_group *bfqg);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
extern struct cftype bfq_blkcg_legacy_files[];
extern struct cftype bfq_blkg_files[];
extern struct blkcg_policy blkcg_policy_bfq;
#endif

/* ------------- end of cgroups-support interface ------------- */

/* - interface of the internal hierarchical B-WF2Q+ scheduler - */

#ifdef CONFIG_BFQ_GROUP_IOSCHED
/* both next loops stop at one of the child entities of the root group */
#define for_each_entity(entity)	\
	for (; entity ; entity = entity->parent)

/*
 * For each iteration, compute parent in advance, so as to be safe if
 * entity is deallocated during the iteration. Such a deallocation may
 * happen as a consequence of a bfq_put_queue that frees the bfq_queue
 * containing entity.
 */
#define for_each_entity_safe(entity, parent) \
	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)

#else /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * The next two macros are fake loops when cgroups support is not
 * enabled. In fact, in such a case, there is only one level to go up
 * (to reach the root group).
 */
#define for_each_entity(entity)	\
	for (; entity ; entity = NULL)

#define for_each_entity_safe(entity, parent) \
	for (parent = NULL; entity ; entity = parent)
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
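
/*
 * Hypothetical walk using the macros above (illustration only):
 * propagating a flag from a queue up to the root group takes the same
 * loop whether or not cgroups are compiled in:
 *
 *	struct bfq_entity *entity = &bfqq->entity;
 *
 *	for_each_entity(entity)
 *		entity->prio_changed = 1;
 */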

struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd);
struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
struct bfq_entity *bfq_entity_of(struct rb_node *node);
unsigned short bfq_ioprio_to_weight(int ioprio);
void bfq_put_idle_entity(struct bfq_service_tree *st,
			 struct bfq_entity *entity);
struct bfq_service_tree *
__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
				struct bfq_entity *entity,
				bool update_class_too);
void bfq_bfqq_served(struct bfq_queue *bfqq, int served);
void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  unsigned long time_ms);
bool __bfq_deactivate_entity(struct bfq_entity *entity,
			     bool ins_into_idle_tree);
bool next_queue_may_preempt(struct bfq_data *bfqd);
struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd);
bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd);
void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			 bool ins_into_idle_tree, bool expiration);
void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		      bool expiration);
void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration);
void bfq_add_bfqq_busy(struct bfq_queue *bfqq);
void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
void bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq,
			    struct bfq_queue *new_bfqq);

/* --------------- end of interface of B-WF2Q+ ---------------- */

/* Logging facilities. */
static inline void bfq_bfqq_name(struct bfq_queue *bfqq, char *str, int len)
{
	char type = bfq_bfqq_sync(bfqq) ? 'S' : 'A';

	if (bfqq->pid != -1)
		snprintf(str, len, "bfq%d%c", bfqq->pid, type);
	else
		snprintf(str, len, "bfqSHARED-%c", type);
}

#ifdef CONFIG_BFQ_GROUP_IOSCHED
struct bfq_group *bfqq_group(struct bfq_queue *bfqq);

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
	char pid_str[MAX_BFQQ_NAME_LENGTH];				\
	if (likely(!blk_trace_note_message_enabled((bfqd)->queue)))	\
		break;							\
	bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH);		\
	blk_add_cgroup_trace_msg((bfqd)->queue,				\
			&bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css,	\
			"%s " fmt, pid_str, ##args);			\
} while (0)

#else /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
	char pid_str[MAX_BFQQ_NAME_LENGTH];				\
	if (likely(!blk_trace_note_message_enabled((bfqd)->queue)))	\
		break;							\
	bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH);		\
	blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args);	\
} while (0)

#endif /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log(bfqd, fmt, args...) \
	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)

#endif /* _BFQ_H */