CoCalc -- cfq-iosched.c

GitHub Repository: awilliam/linux-vfio
Path: blob/master/block/cfq-iosched.c
¹⁷⁴⁷⁹ views
1
/*
2
 *  CFQ, or complete fairness queueing, disk scheduler.
3
 *
4
 *  Based on ideas from a previously unfinished io
5
 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6
 *
7
 *  Copyright (C) 2003 Jens Axboe <[email protected]>
8
 */
9
#include <linux/module.h>
10
#include <linux/slab.h>
11
#include <linux/blkdev.h>
12
#include <linux/elevator.h>
13
#include <linux/jiffies.h>
14
#include <linux/rbtree.h>
15
#include <linux/ioprio.h>
16
#include <linux/blktrace_api.h>
17
#include "cfq.h"
18

19
/*
20
 * tunables
21
 */
22
/* max queue in one round of service */
23
static const int cfq_quantum = 8;
24
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
25
/* maximum backwards seek, in KiB */
26
static const int cfq_back_max = 16 * 1024;
27
/* penalty of a backwards seek */
28
static const int cfq_back_penalty = 2;
29
static const int cfq_slice_sync = HZ / 10;
30
static int cfq_slice_async = HZ / 25;
31
static const int cfq_slice_async_rq = 2;
32
static int cfq_slice_idle = HZ / 125;
33
static int cfq_group_idle = HZ / 125;
34
static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
35
static const int cfq_hist_divisor = 4;
36

37
/*
 * offset from end of service tree
 */
#define CFQ_IDLE_DELAY		(HZ / 5)

/*
 * below this threshold, we consider thinktime immediate
 */
#define CFQ_MIN_TT		(2)

#define CFQ_SLICE_SCALE		(5)
#define CFQ_HW_QUEUE_MIN	(5)
#define CFQ_SERVICE_SHIFT       12

/*
 * Sector thresholds. Each expansion is fully parenthesized so the cast
 * cannot bind to a neighbouring operator at the point of use.
 */
#define CFQQ_SEEK_THR		((sector_t)(8 * 100))
#define CFQQ_CLOSE_THR		((sector_t)(8 * 1024))
#define CFQQ_SECT_THR_NONROT	((sector_t)(2 * 32))

/* True when more than 32/8 bits are set in the queue's seek_history. */
#define CFQQ_SEEKY(cfqq)	(hweight32((cfqq)->seek_history) > 32/8)

/*
 * Typed accessors for the three elevator_private[] slots of a request.
 * The whole expansion is parenthesized so that RQ_CFQQ(rq)->member and
 * RQ_CFQG(rq)->member apply the cast before the member access.
 */
#define RQ_CIC(rq)		\
	((struct cfq_io_context *) (rq)->elevator_private[0])
#define RQ_CFQQ(rq)		\
	((struct cfq_queue *) ((rq)->elevator_private[1]))
#define RQ_CFQG(rq)		\
	((struct cfq_group *) ((rq)->elevator_private[2]))
60

61
static struct kmem_cache *cfq_pool;
62
static struct kmem_cache *cfq_ioc_pool;
63

64
static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
65
static struct completion *ioc_gone;
66
static DEFINE_SPINLOCK(ioc_gone_lock);
67

68
static DEFINE_SPINLOCK(cic_index_lock);
69
static DEFINE_IDA(cic_index_ida);
70

71
#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
72
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
73
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
74

75
#define sample_valid(samples)	((samples) > 80)
76
#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)
77

78
/*
79
 * Most of our rbtree usage is for sorting with min extraction, so
80
 * if we cache the leftmost node we don't have to walk down the tree
81
 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
82
 * move this into the elevator for the rq sorting as well.
83
 */
84
struct cfq_rb_root {
85
	struct rb_root rb;
86
	struct rb_node *left;
87
	unsigned count;
88
	unsigned total_weight;
89
	u64 min_vdisktime;
90
};
91
#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
92
			.count = 0, .min_vdisktime = 0, }
93

94
/*
95
 * Per process-grouping structure
96
 */
97
struct cfq_queue {
98
	/* reference count */
99
	int ref;
100
	/* various state flags, see below */
101
	unsigned int flags;
102
	/* parent cfq_data */
103
	struct cfq_data *cfqd;
104
	/* service_tree member */
105
	struct rb_node rb_node;
106
	/* service_tree key */
107
	unsigned long rb_key;
108
	/* prio tree member */
109
	struct rb_node p_node;
110
	/* prio tree root we belong to, if any */
111
	struct rb_root *p_root;
112
	/* sorted list of pending requests */
113
	struct rb_root sort_list;
114
	/* if fifo isn't expired, next request to serve */
115
	struct request *next_rq;
116
	/* requests queued in sort_list */
117
	int queued[2];
118
	/* currently allocated requests */
119
	int allocated[2];
120
	/* fifo list of requests in sort_list */
121
	struct list_head fifo;
122

123
	/* time when queue got scheduled in to dispatch first request. */
124
	unsigned long dispatch_start;
125
	unsigned int allocated_slice;
126
	unsigned int slice_dispatch;
127
	/* time when first request from queue completed and slice started. */
128
	unsigned long slice_start;
129
	unsigned long slice_end;
130
	long slice_resid;
131

132
	/* pending metadata requests */
133
	int meta_pending;
134
	/* number of requests that are on the dispatch list or inside driver */
135
	int dispatched;
136

137
	/* io prio of this group */
138
	unsigned short ioprio, org_ioprio;
139
	unsigned short ioprio_class, org_ioprio_class;
140

141
	pid_t pid;
142

143
	u32 seek_history;
144
	sector_t last_request_pos;
145

146
	struct cfq_rb_root *service_tree;
147
	struct cfq_queue *new_cfqq;
148
	struct cfq_group *cfqg;
149
	/* Number of sectors dispatched from queue in single dispatch round */
150
	unsigned long nr_sectors;
151
};
152

153
/*
 * First index in the service_trees.
 * BE and RT index cfq_group->service_trees[]; IDLE has its own value
 * but is handled separately through cfq_group->service_tree_idle.
 */
enum wl_prio_t {
	BE_WORKLOAD = 0,	/* best-effort class */
	RT_WORKLOAD = 1,	/* real-time class */
	IDLE_WORKLOAD = 2,	/* idle class, kept on service_tree_idle */
	CFQ_PRIO_NR,		/* number of priority classes */
};
163

164
/*
 * Second index in the service_trees: the workload type of a queue,
 * as classified by cfqq_type().
 */
enum wl_type_t {
	ASYNC_WORKLOAD = 0,		/* queue is not sync */
	SYNC_NOIDLE_WORKLOAD = 1,	/* sync queue without idle window */
	SYNC_WORKLOAD = 2		/* sync queue with idle window */
};
172

173
/* This is per cgroup per device grouping structure */
174
struct cfq_group {
175
	/* group service_tree member */
176
	struct rb_node rb_node;
177

178
	/* group service_tree key */
179
	u64 vdisktime;
180
	unsigned int weight;
181
	unsigned int new_weight;
182
	bool needs_update;
183

184
	/* number of cfqq currently on this group */
185
	int nr_cfqq;
186

187
	/*
188
	 * Per group busy queues average. Useful for workload slice calc. We
189
	 * create the array for each prio class but at run time it is used
190
	 * only for RT and BE class and slot for IDLE class remains unused.
191
	 * This is primarily done to avoid confusion and a gcc warning.
192
	 */
193
	unsigned int busy_queues_avg[CFQ_PRIO_NR];
194
	/*
195
	 * rr lists of queues with requests. We maintain service trees for
196
	 * RT and BE classes. These trees are subdivided in subclasses
197
	 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
198
	 * class there is no subclassification and all the cfq queues go on
199
	 * a single tree service_tree_idle.
200
	 * Counts are embedded in the cfq_rb_root
201
	 */
202
	struct cfq_rb_root service_trees[2][3];
203
	struct cfq_rb_root service_tree_idle;
204

205
	unsigned long saved_workload_slice;
206
	enum wl_type_t saved_workload;
207
	enum wl_prio_t saved_serving_prio;
208
	struct blkio_group blkg;
209
#ifdef CONFIG_CFQ_GROUP_IOSCHED
210
	struct hlist_node cfqd_node;
211
	int ref;
212
#endif
213
	/* number of requests that are on the dispatch list or inside driver */
214
	int dispatched;
215
};
216

217
/*
218
 * Per block device queue structure
219
 */
220
struct cfq_data {
221
	struct request_queue *queue;
222
	/* Root service tree for cfq_groups */
223
	struct cfq_rb_root grp_service_tree;
224
	struct cfq_group root_group;
225

226
	/*
227
	 * The priority currently being served
228
	 */
229
	enum wl_prio_t serving_prio;
230
	enum wl_type_t serving_type;
231
	unsigned long workload_expires;
232
	struct cfq_group *serving_group;
233

234
	/*
235
	 * Each priority tree is sorted by next_request position.  These
236
	 * trees are used when determining if two or more queues are
237
	 * interleaving requests (see cfq_close_cooperator).
238
	 */
239
	struct rb_root prio_trees[CFQ_PRIO_LISTS];
240

241
	unsigned int busy_queues;
242
	unsigned int busy_sync_queues;
243

244
	int rq_in_driver;
245
	int rq_in_flight[2];
246

247
	/*
248
	 * queue-depth detection
249
	 */
250
	int rq_queued;
251
	int hw_tag;
252
	/*
253
	 * hw_tag can be
254
	 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
255
	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
256
	 *  0 => no NCQ
257
	 */
258
	int hw_tag_est_depth;
259
	unsigned int hw_tag_samples;
260

261
	/*
262
	 * idle window management
263
	 */
264
	struct timer_list idle_slice_timer;
265
	struct work_struct unplug_work;
266

267
	struct cfq_queue *active_queue;
268
	struct cfq_io_context *active_cic;
269

270
	/*
271
	 * async queue for each priority case
272
	 */
273
	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
274
	struct cfq_queue *async_idle_cfqq;
275

276
	sector_t last_position;
277

278
	/*
279
	 * tunables, see top of file
280
	 */
281
	unsigned int cfq_quantum;
282
	unsigned int cfq_fifo_expire[2];
283
	unsigned int cfq_back_penalty;
284
	unsigned int cfq_back_max;
285
	unsigned int cfq_slice[2];
286
	unsigned int cfq_slice_async_rq;
287
	unsigned int cfq_slice_idle;
288
	unsigned int cfq_group_idle;
289
	unsigned int cfq_latency;
290

291
	unsigned int cic_index;
292
	struct list_head cic_list;
293

294
	/*
295
	 * Fallback dummy cfqq for extreme OOM conditions
296
	 */
297
	struct cfq_queue oom_cfqq;
298

299
	unsigned long last_delayed_sync;
300

301
	/* List of cfq groups being managed on this device*/
302
	struct hlist_head cfqg_list;
303

304
	/* Number of groups which are on blkcg->blkg_list */
305
	unsigned int nr_blkcg_linked_grps;
306
};
307

308
static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
309

310
static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
311
					    enum wl_prio_t prio,
312
					    enum wl_type_t type)
313
{
314
	if (!cfqg)
315
		return NULL;
316

317
	if (prio == IDLE_WORKLOAD)
318
		return &cfqg->service_tree_idle;
319

320
	return &cfqg->service_trees[prio][type];
321
}
322

323
/*
 * Bit positions inside cfq_queue->flags. Accessors for every bit are
 * generated by CFQ_CFQQ_FNS().
 */
enum cfqq_state_flags {
	CFQ_CFQQ_FLAG_on_rr = 0,	/* queue sits on the round-robin busy list */
	CFQ_CFQQ_FLAG_wait_request,	/* a request is being waited for */
	CFQ_CFQQ_FLAG_must_dispatch,	/* a dispatch has to be allowed */
	CFQ_CFQQ_FLAG_must_alloc_slice,	/* must_alloc flag, per slice */
	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO was checked during this slice */
	CFQ_CFQQ_FLAG_idle_window,	/* slice idling is enabled */
	CFQ_CFQQ_FLAG_prio_changed,	/* the task priority changed */
	CFQ_CFQQ_FLAG_slice_new,	/* nothing dispatched in this slice yet */
	CFQ_CFQQ_FLAG_sync,		/* queue is synchronous */
	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq is going to be split */
	CFQ_CFQQ_FLAG_deep,		/* sync cfqq saw a large depth */
	CFQ_CFQQ_FLAG_wait_busy,	/* waiting for the next request */
};
338

339
#define CFQ_CFQQ_FNS(name)						\
340
static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
341
{									\
342
	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
343
}									\
344
static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
345
{									\
346
	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
347
}									\
348
static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
349
{									\
350
	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
351
}
352

353
CFQ_CFQQ_FNS(on_rr);
354
CFQ_CFQQ_FNS(wait_request);
355
CFQ_CFQQ_FNS(must_dispatch);
356
CFQ_CFQQ_FNS(must_alloc_slice);
357
CFQ_CFQQ_FNS(fifo_expire);
358
CFQ_CFQQ_FNS(idle_window);
359
CFQ_CFQQ_FNS(prio_changed);
360
CFQ_CFQQ_FNS(slice_new);
361
CFQ_CFQQ_FNS(sync);
362
CFQ_CFQQ_FNS(coop);
363
CFQ_CFQQ_FNS(split_coop);
364
CFQ_CFQQ_FNS(deep);
365
CFQ_CFQQ_FNS(wait_busy);
366
#undef CFQ_CFQQ_FNS
367

368
/*
 * blktrace message helpers.
 *
 * cfq_log_cfqq() prefixes the message with "cfq<pid>"; with group
 * scheduling enabled it also adds 'S'/'A' for sync/async and the
 * blkio group path. cfq_log_cfqg() logs against a group and is a
 * no-op without group scheduling. cfq_log() is a plain "cfq " message.
 *
 * No macro ends in a line continuation, so a line added directly
 * below one of them can never be absorbed into its expansion.
 */
#ifdef CONFIG_CFQ_GROUP_IOSCHED
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
			cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
			blkg_path(&(cfqq)->cfqg->blkg), ##args)

#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)				\
	blk_add_trace_msg((cfqd)->queue, "%s " fmt,			\
				blkg_path(&(cfqg)->blkg), ##args)

#else
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)
#endif
#define cfq_log(cfqd, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
385

386
/*
 * Traverses through cfq group service trees.
 *
 * Visits service_trees[i][j] for every priority class i below
 * IDLE_WORKLOAD and every workload type j up to SYNC_WORKLOAD, then
 * service_tree_idle once. @i and @j are loop counters supplied by the
 * caller; @st points at the current tree inside the loop body.
 * @cfqg is parenthesized so any pointer expression may be passed.
 */
#define for_each_cfqg_st(cfqg, i, j, st) \
	for (i = 0; i <= IDLE_WORKLOAD; i++) \
		for (j = 0, st = i < IDLE_WORKLOAD ? \
			&(cfqg)->service_trees[i][j] : \
			&(cfqg)->service_tree_idle; \
			(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
			(i == IDLE_WORKLOAD && j == 0); \
			j++, st = i < IDLE_WORKLOAD ? \
			&(cfqg)->service_trees[i][j] : NULL)
395

396

397
static inline bool iops_mode(struct cfq_data *cfqd)
398
{
399
	/*
400
	 * If we are not idling on queues and it is a NCQ drive, parallel
401
	 * execution of requests is on and measuring time is not possible
402
	 * in most of the cases until and unless we drive shallower queue
403
	 * depths and that becomes a performance bottleneck. In such cases
404
	 * switch to start providing fairness in terms of number of IOs.
405
	 */
406
	if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
407
		return true;
408
	else
409
		return false;
410
}
411

412
static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
413
{
414
	if (cfq_class_idle(cfqq))
415
		return IDLE_WORKLOAD;
416
	if (cfq_class_rt(cfqq))
417
		return RT_WORKLOAD;
418
	return BE_WORKLOAD;
419
}
420

421

422
static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
423
{
424
	if (!cfq_cfqq_sync(cfqq))
425
		return ASYNC_WORKLOAD;
426
	if (!cfq_cfqq_idle_window(cfqq))
427
		return SYNC_NOIDLE_WORKLOAD;
428
	return SYNC_WORKLOAD;
429
}
430

431
static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
432
					struct cfq_data *cfqd,
433
					struct cfq_group *cfqg)
434
{
435
	if (wl == IDLE_WORKLOAD)
436
		return cfqg->service_tree_idle.count;
437

438
	return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
439
		+ cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
440
		+ cfqg->service_trees[wl][SYNC_WORKLOAD].count;
441
}
442

443
static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
444
					struct cfq_group *cfqg)
445
{
446
	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
447
		+ cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
448
}
449

450
static void cfq_dispatch_insert(struct request_queue *, struct request *);
451
static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
452
				       struct io_context *, gfp_t);
453
static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
454
						struct io_context *);
455

456
static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
457
					    bool is_sync)
458
{
459
	return cic->cfqq[is_sync];
460
}
461

462
/*
 * Store @cfqq in the cic->cfqq[] slot selected by @is_sync.
 */
static inline void cic_set_cfqq(struct cfq_io_context *cic,
				struct cfq_queue *cfqq, bool is_sync)
{
	struct cfq_queue **slot = &cic->cfqq[is_sync];

	*slot = cfqq;
}
467

468
#define CIC_DEAD_KEY	1ul
469
#define CIC_DEAD_INDEX_SHIFT	1
470

471
static inline void *cfqd_dead_key(struct cfq_data *cfqd)
472
{
473
	return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
474
}
475

476
static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
477
{
478
	struct cfq_data *cfqd = cic->key;
479

480
	if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
481
		return NULL;
482

483
	return cfqd;
484
}
485

486
/*
487
 * We regard a request as SYNC, if it's either a read or has the SYNC bit
488
 * set (in which case it could also be direct WRITE).
489
 */
490
static inline bool cfq_bio_sync(struct bio *bio)
491
{
492
	return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
493
}
494

495
/*
496
 * scheduler run of queue, if there are requests pending and no one in the
497
 * driver that will restart queueing
498
 */
499
static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
500
{
501
	if (cfqd->busy_queues) {
502
		cfq_log(cfqd, "schedule dispatch");
503
		kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
504
	}
505
}
506

507
/*
508
 * Scale schedule slice based on io priority. Use the sync time slice only
509
 * if a queue is marked sync and has sync io queued. A sync queue with async
510
 * io only, should not get full sync slice length.
511
 */
512
static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
513
				 unsigned short prio)
514
{
515
	const int base_slice = cfqd->cfq_slice[sync];
516

517
	WARN_ON(prio >= IOPRIO_BE_NR);
518

519
	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
520
}
521

522
static inline int
523
cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
524
{
525
	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
526
}
527

528
static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
529
{
530
	u64 d = delta << CFQ_SERVICE_SHIFT;
531

532
	d = d * BLKIO_WEIGHT_DEFAULT;
533
	do_div(d, cfqg->weight);
534
	return d;
535
}
536

537
/*
 * Return the later of two vdisktime values. The comparison is done on
 * the signed difference, so it stays correct across u64 wrap-around.
 */
static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
	const s64 ahead = (s64)(vdisktime - min_vdisktime);

	return ahead > 0 ? vdisktime : min_vdisktime;
}
545

546
/*
 * Return the earlier of two vdisktime values. The comparison is done
 * on the signed difference, so it stays correct across u64 wrap-around.
 */
static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
	const s64 ahead = (s64)(vdisktime - min_vdisktime);

	return ahead < 0 ? vdisktime : min_vdisktime;
}
554

555
static void update_min_vdisktime(struct cfq_rb_root *st)
556
{
557
	struct cfq_group *cfqg;
558

559
	if (st->left) {
560
		cfqg = rb_entry_cfqg(st->left);
561
		st->min_vdisktime = max_vdisktime(st->min_vdisktime,
562
						  cfqg->vdisktime);
563
	}
564
}
565

566
/*
567
 * get averaged number of queues of RT/BE priority.
568
 * average is updated, with a formula that gives more weight to higher numbers,
569
 * to quickly follows sudden increases and decrease slowly
570
 */
571

572
static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
573
					struct cfq_group *cfqg, bool rt)
574
{
575
	unsigned min_q, max_q;
576
	unsigned mult  = cfq_hist_divisor - 1;
577
	unsigned round = cfq_hist_divisor / 2;
578
	unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
579

580
	min_q = min(cfqg->busy_queues_avg[rt], busy);
581
	max_q = max(cfqg->busy_queues_avg[rt], busy);
582
	cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
583
		cfq_hist_divisor;
584
	return cfqg->busy_queues_avg[rt];
585
}
586

587
static inline unsigned
588
cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
589
{
590
	struct cfq_rb_root *st = &cfqd->grp_service_tree;
591

592
	return cfq_target_latency * cfqg->weight / st->total_weight;
593
}
594

595
static inline unsigned
596
cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
597
{
598
	unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
599
	if (cfqd->cfq_latency) {
600
		/*
601
		 * interested queues (we consider only the ones with the same
602
		 * priority class in the cfq group)
603
		 */
604
		unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
605
						cfq_class_rt(cfqq));
606
		unsigned sync_slice = cfqd->cfq_slice[1];
607
		unsigned expect_latency = sync_slice * iq;
608
		unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
609

610
		if (expect_latency > group_slice) {
611
			unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
612
			/* scale low_slice according to IO priority
613
			 * and sync vs async */
614
			unsigned low_slice =
615
				min(slice, base_low_slice * slice / sync_slice);
616
			/* the adapted slice value is scaled to fit all iqs
617
			 * into the target latency */
618
			slice = max(slice * group_slice / expect_latency,
619
				    low_slice);
620
		}
621
	}
622
	return slice;
623
}
624

625
static inline void
626
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
627
{
628
	unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
629

630
	cfqq->slice_start = jiffies;
631
	cfqq->slice_end = jiffies + slice;
632
	cfqq->allocated_slice = slice;
633
	cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
634
}
635

636
/*
637
 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
638
 * isn't valid until the first request from the dispatch is activated
639
 * and the slice time set.
640
 */
641
static inline bool cfq_slice_used(struct cfq_queue *cfqq)
642
{
643
	if (cfq_cfqq_slice_new(cfqq))
644
		return false;
645
	if (time_before(jiffies, cfqq->slice_end))
646
		return false;
647

648
	return true;
649
}
650

651
/*
652
 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
653
 * We choose the request that is closest to the head right now. Distance
654
 * behind the head is penalized and only allowed to a certain extent.
655
 */
656
static struct request *
657
cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
658
{
659
	sector_t s1, s2, d1 = 0, d2 = 0;
660
	unsigned long back_max;
661
#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
662
#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
663
	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
664

665
	if (rq1 == NULL || rq1 == rq2)
666
		return rq2;
667
	if (rq2 == NULL)
668
		return rq1;
669

670
	if (rq_is_sync(rq1) != rq_is_sync(rq2))
671
		return rq_is_sync(rq1) ? rq1 : rq2;
672

673
	if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META)
674
		return rq1->cmd_flags & REQ_META ? rq1 : rq2;
675

676
	s1 = blk_rq_pos(rq1);
677
	s2 = blk_rq_pos(rq2);
678

679
	/*
680
	 * by definition, 1KiB is 2 sectors
681
	 */
682
	back_max = cfqd->cfq_back_max * 2;
683

684
	/*
685
	 * Strict one way elevator _except_ in the case where we allow
686
	 * short backward seeks which are biased as twice the cost of a
687
	 * similar forward seek.
688
	 */
689
	if (s1 >= last)
690
		d1 = s1 - last;
691
	else if (s1 + back_max >= last)
692
		d1 = (last - s1) * cfqd->cfq_back_penalty;
693
	else
694
		wrap |= CFQ_RQ1_WRAP;
695

696
	if (s2 >= last)
697
		d2 = s2 - last;
698
	else if (s2 + back_max >= last)
699
		d2 = (last - s2) * cfqd->cfq_back_penalty;
700
	else
701
		wrap |= CFQ_RQ2_WRAP;
702

703
	/* Found required data */
704

705
	/*
706
	 * By doing switch() on the bit mask "wrap" we avoid having to
707
	 * check two variables for all permutations: --> faster!
708
	 */
709
	switch (wrap) {
710
	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
711
		if (d1 < d2)
712
			return rq1;
713
		else if (d2 < d1)
714
			return rq2;
715
		else {
716
			if (s1 >= s2)
717
				return rq1;
718
			else
719
				return rq2;
720
		}
721

722
	case CFQ_RQ2_WRAP:
723
		return rq1;
724
	case CFQ_RQ1_WRAP:
725
		return rq2;
726
	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
727
	default:
728
		/*
729
		 * Since both rqs are wrapped,
730
		 * start with the one that's further behind head
731
		 * (--> only *one* back seek required),
732
		 * since back seek takes more time than forward.
733
		 */
734
		if (s1 <= s2)
735
			return rq1;
736
		else
737
			return rq2;
738
	}
739
}
740

741
/*
742
 * The below is leftmost cache rbtree addon
743
 */
744
static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
745
{
746
	/* Service tree is empty */
747
	if (!root->count)
748
		return NULL;
749

750
	if (!root->left)
751
		root->left = rb_first(&root->rb);
752

753
	if (root->left)
754
		return rb_entry(root->left, struct cfq_queue, rb_node);
755

756
	return NULL;
757
}
758

759
static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
760
{
761
	if (!root->left)
762
		root->left = rb_first(&root->rb);
763

764
	if (root->left)
765
		return rb_entry_cfqg(root->left);
766

767
	return NULL;
768
}
769

770
/*
 * Remove @n from @root and mark the node as cleared, so that a later
 * RB_EMPTY_NODE() test on it succeeds.
 */
static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}
775

776
static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
777
{
778
	if (root->left == n)
779
		root->left = NULL;
780
	rb_erase_init(n, &root->rb);
781
	--root->count;
782
}
783

784
/*
785
 * would be nice to take fifo expire time into account as well
786
 */
787
static struct request *
788
cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
789
		  struct request *last)
790
{
791
	struct rb_node *rbnext = rb_next(&last->rb_node);
792
	struct rb_node *rbprev = rb_prev(&last->rb_node);
793
	struct request *next = NULL, *prev = NULL;
794

795
	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
796

797
	if (rbprev)
798
		prev = rb_entry_rq(rbprev);
799

800
	if (rbnext)
801
		next = rb_entry_rq(rbnext);
802
	else {
803
		rbnext = rb_first(&cfqq->sort_list);
804
		if (rbnext && rbnext != &last->rb_node)
805
			next = rb_entry_rq(rbnext);
806
	}
807

808
	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
809
}
810

811
static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
812
				      struct cfq_queue *cfqq)
813
{
814
	/*
815
	 * just an approximation, should be ok.
816
	 */
817
	return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
818
		       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
819
}
820

821
static inline s64
822
cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
823
{
824
	return cfqg->vdisktime - st->min_vdisktime;
825
}
826

827
static void
828
__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
829
{
830
	struct rb_node **node = &st->rb.rb_node;
831
	struct rb_node *parent = NULL;
832
	struct cfq_group *__cfqg;
833
	s64 key = cfqg_key(st, cfqg);
834
	int left = 1;
835

836
	while (*node != NULL) {
837
		parent = *node;
838
		__cfqg = rb_entry_cfqg(parent);
839

840
		if (key < cfqg_key(st, __cfqg))
841
			node = &parent->rb_left;
842
		else {
843
			node = &parent->rb_right;
844
			left = 0;
845
		}
846
	}
847

848
	if (left)
849
		st->left = &cfqg->rb_node;
850

851
	rb_link_node(&cfqg->rb_node, parent, node);
852
	rb_insert_color(&cfqg->rb_node, &st->rb);
853
}
854

855
static void
856
cfq_update_group_weight(struct cfq_group *cfqg)
857
{
858
	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
859
	if (cfqg->needs_update) {
860
		cfqg->weight = cfqg->new_weight;
861
		cfqg->needs_update = false;
862
	}
863
}
864

865
static void
866
cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
867
{
868
	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
869

870
	cfq_update_group_weight(cfqg);
871
	__cfq_group_service_tree_add(st, cfqg);
872
	st->total_weight += cfqg->weight;
873
}
874

875
static void
876
cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
877
{
878
	struct cfq_rb_root *st = &cfqd->grp_service_tree;
879
	struct cfq_group *__cfqg;
880
	struct rb_node *n;
881

882
	cfqg->nr_cfqq++;
883
	if (!RB_EMPTY_NODE(&cfqg->rb_node))
884
		return;
885

886
	/*
887
	 * Currently put the group at the end. Later implement something
888
	 * so that groups get lesser vtime based on their weights, so that
889
	 * if group does not loose all if it was not continuously backlogged.
890
	 */
891
	n = rb_last(&st->rb);
892
	if (n) {
893
		__cfqg = rb_entry_cfqg(n);
894
		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
895
	} else
896
		cfqg->vdisktime = st->min_vdisktime;
897
	cfq_group_service_tree_add(st, cfqg);
898
}
899

900
static void
901
cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
902
{
903
	st->total_weight -= cfqg->weight;
904
	if (!RB_EMPTY_NODE(&cfqg->rb_node))
905
		cfq_rb_erase(&cfqg->rb_node, st);
906
}
907

908
static void
909
cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
910
{
911
	struct cfq_rb_root *st = &cfqd->grp_service_tree;
912

913
	BUG_ON(cfqg->nr_cfqq < 1);
914
	cfqg->nr_cfqq--;
915

916
	/* If there are other cfq queues under this group, don't delete it */
917
	if (cfqg->nr_cfqq)
918
		return;
919

920
	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
921
	cfq_group_service_tree_del(st, cfqg);
922
	cfqg->saved_workload_slice = 0;
923
	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
924
}
925

926
static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
927
						unsigned int *unaccounted_time)
928
{
929
	unsigned int slice_used;
930

931
	/*
932
	 * Queue got expired before even a single request completed or
933
	 * got expired immediately after first request completion.
934
	 */
935
	if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
936
		/*
937
		 * Also charge the seek time incurred to the group, otherwise
938
		 * if there are mutiple queues in the group, each can dispatch
939
		 * a single request on seeky media and cause lots of seek time
940
		 * and group will never know it.
941
		 */
942
		slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
943
					1);
944
	} else {
945
		slice_used = jiffies - cfqq->slice_start;
946
		if (slice_used > cfqq->allocated_slice) {
947
			*unaccounted_time = slice_used - cfqq->allocated_slice;
948
			slice_used = cfqq->allocated_slice;
949
		}
950
		if (time_after(cfqq->slice_start, cfqq->dispatch_start))
951
			*unaccounted_time += cfqq->slice_start -
952
					cfqq->dispatch_start;
953
	}
954

955
	return slice_used;
956
}
957

958
static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
959
				struct cfq_queue *cfqq)
960
{
961
	struct cfq_rb_root *st = &cfqd->grp_service_tree;
962
	unsigned int used_sl, charge, unaccounted_sl = 0;
963
	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
964
			- cfqg->service_tree_idle.count;
965

966
	BUG_ON(nr_sync < 0);
967
	used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
968

969
	if (iops_mode(cfqd))
970
		charge = cfqq->slice_dispatch;
971
	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
972
		charge = cfqq->allocated_slice;
973

974
	/* Can't update vdisktime while group is on service tree */
975
	cfq_group_service_tree_del(st, cfqg);
976
	cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
977
	/* If a new weight was requested, update now, off tree */
978
	cfq_group_service_tree_add(st, cfqg);
979

980
	/* This group is being expired. Save the context */
981
	if (time_after(cfqd->workload_expires, jiffies)) {
982
		cfqg->saved_workload_slice = cfqd->workload_expires
983
						- jiffies;
984
		cfqg->saved_workload = cfqd->serving_type;
985
		cfqg->saved_serving_prio = cfqd->serving_prio;
986
	} else
987
		cfqg->saved_workload_slice = 0;
988

989
	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
990
					st->min_vdisktime);
991
	cfq_log_cfqq(cfqq->cfqd, cfqq,
992
		     "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
993
		     used_sl, cfqq->slice_dispatch, charge,
994
		     iops_mode(cfqd), cfqq->nr_sectors);
995
	cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
996
					  unaccounted_sl);
997
	cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
998
}
999

1000
#ifdef CONFIG_CFQ_GROUP_IOSCHED
1001
static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
1002
{
1003
	if (blkg)
1004
		return container_of(blkg, struct cfq_group, blkg);
1005
	return NULL;
1006
}
1007

1008
void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
1009
					unsigned int weight)
1010
{
1011
	struct cfq_group *cfqg = cfqg_of_blkg(blkg);
1012
	cfqg->new_weight = weight;
1013
	cfqg->needs_update = true;
1014
}
1015

1016
static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
1017
			struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
1018
{
1019
	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
1020
	unsigned int major, minor;
1021

1022
	/*
1023
	 * Add group onto cgroup list. It might happen that bdi->dev is
1024
	 * not initialized yet. Initialize this new group without major
1025
	 * and minor info and this info will be filled in once a new thread
1026
	 * comes for IO.
1027
	 */
1028
	if (bdi->dev) {
1029
		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1030
		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1031
					(void *)cfqd, MKDEV(major, minor));
1032
	} else
1033
		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1034
					(void *)cfqd, 0);
1035

1036
	cfqd->nr_blkcg_linked_grps++;
1037
	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
1038

1039
	/* Add group on cfqd list */
1040
	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
1041
}
1042

1043
/*
1044
 * Should be called from sleepable context. No request queue lock as per
1045
 * cpu stats are allocated dynamically and alloc_percpu needs to be called
1046
 * from sleepable context.
1047
 */
1048
static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
1049
{
1050
	struct cfq_group *cfqg = NULL;
1051
	int i, j, ret;
1052
	struct cfq_rb_root *st;
1053

1054
	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
1055
	if (!cfqg)
1056
		return NULL;
1057

1058
	for_each_cfqg_st(cfqg, i, j, st)
1059
		*st = CFQ_RB_ROOT;
1060
	RB_CLEAR_NODE(&cfqg->rb_node);
1061

1062
	/*
1063
	 * Take the initial reference that will be released on destroy
1064
	 * This can be thought of a joint reference by cgroup and
1065
	 * elevator which will be dropped by either elevator exit
1066
	 * or cgroup deletion path depending on who is exiting first.
1067
	 */
1068
	cfqg->ref = 1;
1069

1070
	ret = blkio_alloc_blkg_stats(&cfqg->blkg);
1071
	if (ret) {
1072
		kfree(cfqg);
1073
		return NULL;
1074
	}
1075

1076
	return cfqg;
1077
}
1078

1079
static struct cfq_group *
1080
cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
1081
{
1082
	struct cfq_group *cfqg = NULL;
1083
	void *key = cfqd;
1084
	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
1085
	unsigned int major, minor;
1086

1087
	/*
1088
	 * This is the common case when there are no blkio cgroups.
1089
	 * Avoid lookup in this case
1090
	 */
1091
	if (blkcg == &blkio_root_cgroup)
1092
		cfqg = &cfqd->root_group;
1093
	else
1094
		cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
1095

1096
	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
1097
		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1098
		cfqg->blkg.dev = MKDEV(major, minor);
1099
	}
1100

1101
	return cfqg;
1102
}
1103

1104
/*
1105
 * Search for the cfq group current task belongs to. request_queue lock must
1106
 * be held.
1107
 */
1108
static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
1109
{
1110
	struct blkio_cgroup *blkcg;
1111
	struct cfq_group *cfqg = NULL, *__cfqg = NULL;
1112
	struct request_queue *q = cfqd->queue;
1113

1114
	rcu_read_lock();
1115
	blkcg = task_blkio_cgroup(current);
1116
	cfqg = cfq_find_cfqg(cfqd, blkcg);
1117
	if (cfqg) {
1118
		rcu_read_unlock();
1119
		return cfqg;
1120
	}
1121

1122
	/*
1123
	 * Need to allocate a group. Allocation of group also needs allocation
1124
	 * of per cpu stats which in-turn takes a mutex() and can block. Hence
1125
	 * we need to drop rcu lock and queue_lock before we call alloc.
1126
	 *
1127
	 * Not taking any queue reference here and assuming that queue is
1128
	 * around by the time we return. CFQ queue allocation code does
1129
	 * the same. It might be racy though.
1130
	 */
1131

1132
	rcu_read_unlock();
1133
	spin_unlock_irq(q->queue_lock);
1134

1135
	cfqg = cfq_alloc_cfqg(cfqd);
1136

1137
	spin_lock_irq(q->queue_lock);
1138

1139
	rcu_read_lock();
1140
	blkcg = task_blkio_cgroup(current);
1141

1142
	/*
1143
	 * If some other thread already allocated the group while we were
1144
	 * not holding queue lock, free up the group
1145
	 */
1146
	__cfqg = cfq_find_cfqg(cfqd, blkcg);
1147

1148
	if (__cfqg) {
1149
		kfree(cfqg);
1150
		rcu_read_unlock();
1151
		return __cfqg;
1152
	}
1153

1154
	if (!cfqg)
1155
		cfqg = &cfqd->root_group;
1156

1157
	cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
1158
	rcu_read_unlock();
1159
	return cfqg;
1160
}
1161

1162
static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1163
{
1164
	cfqg->ref++;
1165
	return cfqg;
1166
}
1167

1168
static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1169
{
1170
	/* Currently, all async queues are mapped to root group */
1171
	if (!cfq_cfqq_sync(cfqq))
1172
		cfqg = &cfqq->cfqd->root_group;
1173

1174
	cfqq->cfqg = cfqg;
1175
	/* cfqq reference on cfqg */
1176
	cfqq->cfqg->ref++;
1177
}
1178

1179
static void cfq_put_cfqg(struct cfq_group *cfqg)
1180
{
1181
	struct cfq_rb_root *st;
1182
	int i, j;
1183

1184
	BUG_ON(cfqg->ref <= 0);
1185
	cfqg->ref--;
1186
	if (cfqg->ref)
1187
		return;
1188
	for_each_cfqg_st(cfqg, i, j, st)
1189
		BUG_ON(!RB_EMPTY_ROOT(&st->rb));
1190
	free_percpu(cfqg->blkg.stats_cpu);
1191
	kfree(cfqg);
1192
}
1193

1194
static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
1195
{
1196
	/* Something wrong if we are trying to remove same group twice */
1197
	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
1198

1199
	hlist_del_init(&cfqg->cfqd_node);
1200

1201
	/*
1202
	 * Put the reference taken at the time of creation so that when all
1203
	 * queues are gone, group can be destroyed.
1204
	 */
1205
	cfq_put_cfqg(cfqg);
1206
}
1207

1208
static void cfq_release_cfq_groups(struct cfq_data *cfqd)
1209
{
1210
	struct hlist_node *pos, *n;
1211
	struct cfq_group *cfqg;
1212

1213
	hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
1214
		/*
1215
		 * If cgroup removal path got to blk_group first and removed
1216
		 * it from cgroup list, then it will take care of destroying
1217
		 * cfqg also.
1218
		 */
1219
		if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
1220
			cfq_destroy_cfqg(cfqd, cfqg);
1221
	}
1222
}
1223

1224
/*
1225
 * Blk cgroup controller notification saying that blkio_group object is being
1226
 * delinked as associated cgroup object is going away. That also means that
1227
 * no new IO will come in this group. So get rid of this group as soon as
1228
 * any pending IO in the group is finished.
1229
 *
1230
 * This function is called under rcu_read_lock(). key is the rcu protected
1231
 * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu
1232
 * read lock.
1233
 *
1234
 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
1235
 * it should not be NULL as even if elevator was exiting, cgroup deltion
1236
 * path got to it first.
1237
 */
1238
void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
1239
{
1240
	unsigned long  flags;
1241
	struct cfq_data *cfqd = key;
1242

1243
	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1244
	cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
1245
	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1246
}
1247

1248
#else /* GROUP_IOSCHED */
1249
static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
1250
{
1251
	return &cfqd->root_group;
1252
}
1253

1254
/* No reference counting in this configuration: hand @cfqg straight back. */
static inline struct cfq_group *
cfq_ref_get_cfqg(struct cfq_group *cfqg)
{
	return cfqg;
}
1258

1259
static inline void
1260
cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1261
	cfqq->cfqg = cfqg;
1262
}
1263

1264
/* Nothing to release in this configuration. */
static void cfq_release_cfq_groups(struct cfq_data *cfqd)
{
}
1265
/* Groups are not reference counted in this configuration: no-op. */
static inline void cfq_put_cfqg(struct cfq_group *cfqg)
{
}
1266

1267
#endif /* GROUP_IOSCHED */
1268

1269
/*
1270
 * The cfqd->service_trees holds all pending cfq_queue's that have
1271
 * requests waiting to be processed. It is sorted in the order that
1272
 * we will service the queues.
1273
 */
1274
static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1275
				 bool add_front)
1276
{
1277
	struct rb_node **p, *parent;
1278
	struct cfq_queue *__cfqq;
1279
	unsigned long rb_key;
1280
	struct cfq_rb_root *service_tree;
1281
	int left;
1282
	int new_cfqq = 1;
1283

1284
	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1285
						cfqq_type(cfqq));
1286
	if (cfq_class_idle(cfqq)) {
1287
		rb_key = CFQ_IDLE_DELAY;
1288
		parent = rb_last(&service_tree->rb);
1289
		if (parent && parent != &cfqq->rb_node) {
1290
			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1291
			rb_key += __cfqq->rb_key;
1292
		} else
1293
			rb_key += jiffies;
1294
	} else if (!add_front) {
1295
		/*
1296
		 * Get our rb key offset. Subtract any residual slice
1297
		 * value carried from last service. A negative resid
1298
		 * count indicates slice overrun, and this should position
1299
		 * the next service time further away in the tree.
1300
		 */
1301
		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
1302
		rb_key -= cfqq->slice_resid;
1303
		cfqq->slice_resid = 0;
1304
	} else {
1305
		rb_key = -HZ;
1306
		__cfqq = cfq_rb_first(service_tree);
1307
		rb_key += __cfqq ? __cfqq->rb_key : jiffies;
1308
	}
1309

1310
	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1311
		new_cfqq = 0;
1312
		/*
1313
		 * same position, nothing more to do
1314
		 */
1315
		if (rb_key == cfqq->rb_key &&
1316
		    cfqq->service_tree == service_tree)
1317
			return;
1318

1319
		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1320
		cfqq->service_tree = NULL;
1321
	}
1322

1323
	left = 1;
1324
	parent = NULL;
1325
	cfqq->service_tree = service_tree;
1326
	p = &service_tree->rb.rb_node;
1327
	while (*p) {
1328
		struct rb_node **n;
1329

1330
		parent = *p;
1331
		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1332

1333
		/*
1334
		 * sort by key, that represents service time.
1335
		 */
1336
		if (time_before(rb_key, __cfqq->rb_key))
1337
			n = &(*p)->rb_left;
1338
		else {
1339
			n = &(*p)->rb_right;
1340
			left = 0;
1341
		}
1342

1343
		p = n;
1344
	}
1345

1346
	if (left)
1347
		service_tree->left = &cfqq->rb_node;
1348

1349
	cfqq->rb_key = rb_key;
1350
	rb_link_node(&cfqq->rb_node, parent, p);
1351
	rb_insert_color(&cfqq->rb_node, &service_tree->rb);
1352
	service_tree->count++;
1353
	if (add_front || !new_cfqq)
1354
		return;
1355
	cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
1356
}
1357

1358
static struct cfq_queue *
1359
cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
1360
		     sector_t sector, struct rb_node **ret_parent,
1361
		     struct rb_node ***rb_link)
1362
{
1363
	struct rb_node **p, *parent;
1364
	struct cfq_queue *cfqq = NULL;
1365

1366
	parent = NULL;
1367
	p = &root->rb_node;
1368
	while (*p) {
1369
		struct rb_node **n;
1370

1371
		parent = *p;
1372
		cfqq = rb_entry(parent, struct cfq_queue, p_node);
1373

1374
		/*
1375
		 * Sort strictly based on sector.  Smallest to the left,
1376
		 * largest to the right.
1377
		 */
1378
		if (sector > blk_rq_pos(cfqq->next_rq))
1379
			n = &(*p)->rb_right;
1380
		else if (sector < blk_rq_pos(cfqq->next_rq))
1381
			n = &(*p)->rb_left;
1382
		else
1383
			break;
1384
		p = n;
1385
		cfqq = NULL;
1386
	}
1387

1388
	*ret_parent = parent;
1389
	if (rb_link)
1390
		*rb_link = p;
1391
	return cfqq;
1392
}
1393

1394
static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1395
{
1396
	struct rb_node **p, *parent;
1397
	struct cfq_queue *__cfqq;
1398

1399
	if (cfqq->p_root) {
1400
		rb_erase(&cfqq->p_node, cfqq->p_root);
1401
		cfqq->p_root = NULL;
1402
	}
1403

1404
	if (cfq_class_idle(cfqq))
1405
		return;
1406
	if (!cfqq->next_rq)
1407
		return;
1408

1409
	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
1410
	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
1411
				      blk_rq_pos(cfqq->next_rq), &parent, &p);
1412
	if (!__cfqq) {
1413
		rb_link_node(&cfqq->p_node, parent, p);
1414
		rb_insert_color(&cfqq->p_node, cfqq->p_root);
1415
	} else
1416
		cfqq->p_root = NULL;
1417
}
1418

1419
/*
1420
 * Update cfqq's position in the service tree.
1421
 */
1422
static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1423
{
1424
	/*
1425
	 * Resorting requires the cfqq to be on the RR list already.
1426
	 */
1427
	if (cfq_cfqq_on_rr(cfqq)) {
1428
		cfq_service_tree_add(cfqd, cfqq, 0);
1429
		cfq_prio_tree_add(cfqd, cfqq);
1430
	}
1431
}
1432

1433
/*
1434
 * add to busy list of queues for service, trying to be fair in ordering
1435
 * the pending list according to last request service
1436
 */
1437
static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1438
{
1439
	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
1440
	BUG_ON(cfq_cfqq_on_rr(cfqq));
1441
	cfq_mark_cfqq_on_rr(cfqq);
1442
	cfqd->busy_queues++;
1443
	if (cfq_cfqq_sync(cfqq))
1444
		cfqd->busy_sync_queues++;
1445

1446
	cfq_resort_rr_list(cfqd, cfqq);
1447
}
1448

1449
/*
1450
 * Called when the cfqq no longer has requests pending, remove it from
1451
 * the service tree.
1452
 */
1453
static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1454
{
1455
	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
1456
	BUG_ON(!cfq_cfqq_on_rr(cfqq));
1457
	cfq_clear_cfqq_on_rr(cfqq);
1458

1459
	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1460
		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1461
		cfqq->service_tree = NULL;
1462
	}
1463
	if (cfqq->p_root) {
1464
		rb_erase(&cfqq->p_node, cfqq->p_root);
1465
		cfqq->p_root = NULL;
1466
	}
1467

1468
	cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
1469
	BUG_ON(!cfqd->busy_queues);
1470
	cfqd->busy_queues--;
1471
	if (cfq_cfqq_sync(cfqq))
1472
		cfqd->busy_sync_queues--;
1473
}
1474

1475
/*
1476
 * rb tree support functions
1477
 */
1478
static void cfq_del_rq_rb(struct request *rq)
1479
{
1480
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1481
	const int sync = rq_is_sync(rq);
1482

1483
	BUG_ON(!cfqq->queued[sync]);
1484
	cfqq->queued[sync]--;
1485

1486
	elv_rb_del(&cfqq->sort_list, rq);
1487

1488
	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
1489
		/*
1490
		 * Queue will be deleted from service tree when we actually
1491
		 * expire it later. Right now just remove it from prio tree
1492
		 * as it is empty.
1493
		 */
1494
		if (cfqq->p_root) {
1495
			rb_erase(&cfqq->p_node, cfqq->p_root);
1496
			cfqq->p_root = NULL;
1497
		}
1498
	}
1499
}
1500

1501
static void cfq_add_rq_rb(struct request *rq)
1502
{
1503
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1504
	struct cfq_data *cfqd = cfqq->cfqd;
1505
	struct request *__alias, *prev;
1506

1507
	cfqq->queued[rq_is_sync(rq)]++;
1508

1509
	/*
1510
	 * looks a little odd, but the first insert might return an alias.
1511
	 * if that happens, put the alias on the dispatch list
1512
	 */
1513
	while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
1514
		cfq_dispatch_insert(cfqd->queue, __alias);
1515

1516
	if (!cfq_cfqq_on_rr(cfqq))
1517
		cfq_add_cfqq_rr(cfqd, cfqq);
1518

1519
	/*
1520
	 * check if this request is a better next-serve candidate
1521
	 */
1522
	prev = cfqq->next_rq;
1523
	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
1524

1525
	/*
1526
	 * adjust priority tree position, if ->next_rq changes
1527
	 */
1528
	if (prev != cfqq->next_rq)
1529
		cfq_prio_tree_add(cfqd, cfqq);
1530

1531
	BUG_ON(!cfqq->next_rq);
1532
}
1533

1534
static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1535
{
1536
	elv_rb_del(&cfqq->sort_list, rq);
1537
	cfqq->queued[rq_is_sync(rq)]--;
1538
	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
1539
					rq_data_dir(rq), rq_is_sync(rq));
1540
	cfq_add_rq_rb(rq);
1541
	cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
1542
			&cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
1543
			rq_is_sync(rq));
1544
}
1545

1546
static struct request *
1547
cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
1548
{
1549
	struct task_struct *tsk = current;
1550
	struct cfq_io_context *cic;
1551
	struct cfq_queue *cfqq;
1552

1553
	cic = cfq_cic_lookup(cfqd, tsk->io_context);
1554
	if (!cic)
1555
		return NULL;
1556

1557
	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1558
	if (cfqq) {
1559
		sector_t sector = bio->bi_sector + bio_sectors(bio);
1560

1561
		return elv_rb_find(&cfqq->sort_list, sector);
1562
	}
1563

1564
	return NULL;
1565
}
1566

1567
static void cfq_activate_request(struct request_queue *q, struct request *rq)
1568
{
1569
	struct cfq_data *cfqd = q->elevator->elevator_data;
1570

1571
	cfqd->rq_in_driver++;
1572
	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
1573
						cfqd->rq_in_driver);
1574

1575
	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1576
}
1577

1578
static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
1579
{
1580
	struct cfq_data *cfqd = q->elevator->elevator_data;
1581

1582
	WARN_ON(!cfqd->rq_in_driver);
1583
	cfqd->rq_in_driver--;
1584
	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
1585
						cfqd->rq_in_driver);
1586
}
1587

1588
static void cfq_remove_request(struct request *rq)
1589
{
1590
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1591

1592
	if (cfqq->next_rq == rq)
1593
		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
1594

1595
	list_del_init(&rq->queuelist);
1596
	cfq_del_rq_rb(rq);
1597

1598
	cfqq->cfqd->rq_queued--;
1599
	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
1600
					rq_data_dir(rq), rq_is_sync(rq));
1601
	if (rq->cmd_flags & REQ_META) {
1602
		WARN_ON(!cfqq->meta_pending);
1603
		cfqq->meta_pending--;
1604
	}
1605
}
1606

1607
static int cfq_merge(struct request_queue *q, struct request **req,
1608
		     struct bio *bio)
1609
{
1610
	struct cfq_data *cfqd = q->elevator->elevator_data;
1611
	struct request *__rq;
1612

1613
	__rq = cfq_find_rq_fmerge(cfqd, bio);
1614
	if (__rq && elv_rq_merge_ok(__rq, bio)) {
1615
		*req = __rq;
1616
		return ELEVATOR_FRONT_MERGE;
1617
	}
1618

1619
	return ELEVATOR_NO_MERGE;
1620
}
1621

1622
static void cfq_merged_request(struct request_queue *q, struct request *req,
1623
			       int type)
1624
{
1625
	if (type == ELEVATOR_FRONT_MERGE) {
1626
		struct cfq_queue *cfqq = RQ_CFQQ(req);
1627

1628
		cfq_reposition_rq_rb(cfqq, req);
1629
	}
1630
}
1631

1632
static void cfq_bio_merged(struct request_queue *q, struct request *req,
1633
				struct bio *bio)
1634
{
1635
	cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg,
1636
					bio_data_dir(bio), cfq_bio_sync(bio));
1637
}
1638

1639
static void
1640
cfq_merged_requests(struct request_queue *q, struct request *rq,
1641
		    struct request *next)
1642
{
1643
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
1644
	/*
1645
	 * reposition in fifo if next is older than rq
1646
	 */
1647
	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1648
	    time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1649
		list_move(&rq->queuelist, &next->queuelist);
1650
		rq_set_fifo_time(rq, rq_fifo_time(next));
1651
	}
1652

1653
	if (cfqq->next_rq == next)
1654
		cfqq->next_rq = rq;
1655
	cfq_remove_request(next);
1656
	cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
1657
					rq_data_dir(next), rq_is_sync(next));
1658
}
1659

1660
static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1661
			   struct bio *bio)
1662
{
1663
	struct cfq_data *cfqd = q->elevator->elevator_data;
1664
	struct cfq_io_context *cic;
1665
	struct cfq_queue *cfqq;
1666

1667
	/*
1668
	 * Disallow merge of a sync bio into an async request.
1669
	 */
1670
	if (cfq_bio_sync(bio) && !rq_is_sync(rq))
1671
		return false;
1672

1673
	/*
1674
	 * Lookup the cfqq that this bio will be queued with. Allow
1675
	 * merge only if rq is queued there.
1676
	 */
1677
	cic = cfq_cic_lookup(cfqd, current->io_context);
1678
	if (!cic)
1679
		return false;
1680

1681
	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1682
	return cfqq == RQ_CFQQ(rq);
1683
}
1684

1685
static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1686
{
1687
	del_timer(&cfqd->idle_slice_timer);
1688
	cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
1689
}
1690

1691
static void __cfq_set_active_queue(struct cfq_data *cfqd,
1692
				   struct cfq_queue *cfqq)
1693
{
1694
	if (cfqq) {
1695
		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
1696
				cfqd->serving_prio, cfqd->serving_type);
1697
		cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
1698
		cfqq->slice_start = 0;
1699
		cfqq->dispatch_start = jiffies;
1700
		cfqq->allocated_slice = 0;
1701
		cfqq->slice_end = 0;
1702
		cfqq->slice_dispatch = 0;
1703
		cfqq->nr_sectors = 0;
1704

1705
		cfq_clear_cfqq_wait_request(cfqq);
1706
		cfq_clear_cfqq_must_dispatch(cfqq);
1707
		cfq_clear_cfqq_must_alloc_slice(cfqq);
1708
		cfq_clear_cfqq_fifo_expire(cfqq);
1709
		cfq_mark_cfqq_slice_new(cfqq);
1710

1711
		cfq_del_timer(cfqd, cfqq);
1712
	}
1713

1714
	cfqd->active_queue = cfqq;
1715
}
1716

1717
/*
1718
 * current cfqq expired its slice (or was too idle), select new one
1719
 */
1720
static void
1721
__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1722
		    bool timed_out)
1723
{
1724
	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
1725

1726
	if (cfq_cfqq_wait_request(cfqq))
1727
		cfq_del_timer(cfqd, cfqq);
1728

1729
	cfq_clear_cfqq_wait_request(cfqq);
1730
	cfq_clear_cfqq_wait_busy(cfqq);
1731

1732
	/*
1733
	 * If this cfqq is shared between multiple processes, check to
1734
	 * make sure that those processes are still issuing I/Os within
1735
	 * the mean seek distance.  If not, it may be time to break the
1736
	 * queues apart again.
1737
	 */
1738
	if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
1739
		cfq_mark_cfqq_split_coop(cfqq);
1740

1741
	/*
1742
	 * store what was left of this slice, if the queue idled/timed out
1743
	 */
1744
	if (timed_out) {
1745
		if (cfq_cfqq_slice_new(cfqq))
1746
			cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
1747
		else
1748
			cfqq->slice_resid = cfqq->slice_end - jiffies;
1749
		cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
1750
	}
1751

1752
	cfq_group_served(cfqd, cfqq->cfqg, cfqq);
1753

1754
	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
1755
		cfq_del_cfqq_rr(cfqd, cfqq);
1756

1757
	cfq_resort_rr_list(cfqd, cfqq);
1758

1759
	if (cfqq == cfqd->active_queue)
1760
		cfqd->active_queue = NULL;
1761

1762
	if (cfqd->active_cic) {
1763
		put_io_context(cfqd->active_cic->ioc);
1764
		cfqd->active_cic = NULL;
1765
	}
1766
}
1767

1768
/* Expire the currently active queue of @cfqd, if there is one. */
static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
{
	struct cfq_queue *active = cfqd->active_queue;

	if (!active)
		return;

	__cfq_slice_expired(cfqd, active, timed_out);
}
1775

1776
/*
1777
 * Get next queue for service. Unless we have a queue preemption,
1778
 * we'll simply select the first cfqq in the service tree.
1779
 */
1780
static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
1781
{
1782
	struct cfq_rb_root *service_tree =
1783
		service_tree_for(cfqd->serving_group, cfqd->serving_prio,
1784
					cfqd->serving_type);
1785

1786
	if (!cfqd->rq_queued)
1787
		return NULL;
1788

1789
	/* There is nothing to dispatch */
1790
	if (!service_tree)
1791
		return NULL;
1792
	if (RB_EMPTY_ROOT(&service_tree->rb))
1793
		return NULL;
1794
	return cfq_rb_first(service_tree);
1795
}
1796

1797
static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
1798
{
1799
	struct cfq_group *cfqg;
1800
	struct cfq_queue *cfqq;
1801
	int i, j;
1802
	struct cfq_rb_root *st;
1803

1804
	if (!cfqd->rq_queued)
1805
		return NULL;
1806

1807
	cfqg = cfq_get_next_cfqg(cfqd);
1808
	if (!cfqg)
1809
		return NULL;
1810

1811
	for_each_cfqg_st(cfqg, i, j, st)
1812
		if ((cfqq = cfq_rb_first(st)) != NULL)
1813
			return cfqq;
1814
	return NULL;
1815
}
1816

1817
/*
 * Get and set a new active queue for service.
 *
 * When @cfqq is NULL the next queue is chosen via cfq_get_next_queue().
 * Returns the queue that was made active, which may still be NULL.
 */
static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
					      struct cfq_queue *cfqq)
{
	struct cfq_queue *chosen = cfqq ? cfqq : cfq_get_next_queue(cfqd);

	__cfq_set_active_queue(cfqd, chosen);
	return chosen;
}
1829

1830
static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
1831
					  struct request *rq)
1832
{
1833
	if (blk_rq_pos(rq) >= cfqd->last_position)
1834
		return blk_rq_pos(rq) - cfqd->last_position;
1835
	else
1836
		return cfqd->last_position - blk_rq_pos(rq);
1837
}
1838

1839
static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1840
			       struct request *rq)
1841
{
1842
	return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
1843
}
1844

1845
static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1846
				    struct cfq_queue *cur_cfqq)
1847
{
1848
	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
1849
	struct rb_node *parent, *node;
1850
	struct cfq_queue *__cfqq;
1851
	sector_t sector = cfqd->last_position;
1852

1853
	if (RB_EMPTY_ROOT(root))
1854
		return NULL;
1855

1856
	/*
1857
	 * First, if we find a request starting at the end of the last
1858
	 * request, choose it.
1859
	 */
1860
	__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
1861
	if (__cfqq)
1862
		return __cfqq;
1863

1864
	/*
1865
	 * If the exact sector wasn't found, the parent of the NULL leaf
1866
	 * will contain the closest sector.
1867
	 */
1868
	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
1869
	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1870
		return __cfqq;
1871

1872
	if (blk_rq_pos(__cfqq->next_rq) < sector)
1873
		node = rb_next(&__cfqq->p_node);
1874
	else
1875
		node = rb_prev(&__cfqq->p_node);
1876
	if (!node)
1877
		return NULL;
1878

1879
	__cfqq = rb_entry(node, struct cfq_queue, p_node);
1880
	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1881
		return __cfqq;
1882

1883
	return NULL;
1884
}
1885

1886
/*
1887
 * cfqd - obvious
1888
 * cur_cfqq - passed in so that we don't decide that the current queue is
1889
 * 	      closely cooperating with itself.
1890
 *
1891
 * So, basically we're assuming that that cur_cfqq has dispatched at least
1892
 * one request, and that cfqd->last_position reflects a position on the disk
1893
 * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
1894
 * assumption.
1895
 */
1896
static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1897
					      struct cfq_queue *cur_cfqq)
1898
{
1899
	struct cfq_queue *cfqq;
1900

1901
	if (cfq_class_idle(cur_cfqq))
1902
		return NULL;
1903
	if (!cfq_cfqq_sync(cur_cfqq))
1904
		return NULL;
1905
	if (CFQQ_SEEKY(cur_cfqq))
1906
		return NULL;
1907

1908
	/*
1909
	 * Don't search priority tree if it's the only queue in the group.
1910
	 */
1911
	if (cur_cfqq->cfqg->nr_cfqq == 1)
1912
		return NULL;
1913

1914
	/*
1915
	 * We should notice if some of the queues are cooperating, eg
1916
	 * working closely on the same area of the disk. In that case,
1917
	 * we can group them together and don't waste time idling.
1918
	 */
1919
	cfqq = cfqq_close(cfqd, cur_cfqq);
1920
	if (!cfqq)
1921
		return NULL;
1922

1923
	/* If new queue belongs to different cfq_group, don't choose it */
1924
	if (cur_cfqq->cfqg != cfqq->cfqg)
1925
		return NULL;
1926

1927
	/*
1928
	 * It only makes sense to merge sync queues.
1929
	 */
1930
	if (!cfq_cfqq_sync(cfqq))
1931
		return NULL;
1932
	if (CFQQ_SEEKY(cfqq))
1933
		return NULL;
1934

1935
	/*
1936
	 * Do not merge queues of different priority classes
1937
	 */
1938
	if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
1939
		return NULL;
1940

1941
	return cfqq;
1942
}
1943

1944
/*
1945
 * Determine whether we should enforce idle window for this queue.
1946
 */
1947

1948
static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1949
{
1950
	enum wl_prio_t prio = cfqq_prio(cfqq);
1951
	struct cfq_rb_root *service_tree = cfqq->service_tree;
1952

1953
	BUG_ON(!service_tree);
1954
	BUG_ON(!service_tree->count);
1955

1956
	if (!cfqd->cfq_slice_idle)
1957
		return false;
1958

1959
	/* We never do for idle class queues. */
1960
	if (prio == IDLE_WORKLOAD)
1961
		return false;
1962

1963
	/* We do for queues that were marked with idle window flag. */
1964
	if (cfq_cfqq_idle_window(cfqq) &&
1965
	   !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
1966
		return true;
1967

1968
	/*
1969
	 * Otherwise, we do only if they are the last ones
1970
	 * in their service tree.
1971
	 */
1972
	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
1973
		return true;
1974
	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
1975
			service_tree->count);
1976
	return false;
1977
}
1978

1979
static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1980
{
1981
	struct cfq_queue *cfqq = cfqd->active_queue;
1982
	struct cfq_io_context *cic;
1983
	unsigned long sl, group_idle = 0;
1984

1985
	/*
1986
	 * SSD device without seek penalty, disable idling. But only do so
1987
	 * for devices that support queuing, otherwise we still have a problem
1988
	 * with sync vs async workloads.
1989
	 */
1990
	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
1991
		return;
1992

1993
	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
1994
	WARN_ON(cfq_cfqq_slice_new(cfqq));
1995

1996
	/*
1997
	 * idle is disabled, either manually or by past process history
1998
	 */
1999
	if (!cfq_should_idle(cfqd, cfqq)) {
2000
		/* no queue idling. Check for group idling */
2001
		if (cfqd->cfq_group_idle)
2002
			group_idle = cfqd->cfq_group_idle;
2003
		else
2004
			return;
2005
	}
2006

2007
	/*
2008
	 * still active requests from this queue, don't idle
2009
	 */
2010
	if (cfqq->dispatched)
2011
		return;
2012

2013
	/*
2014
	 * task has exited, don't wait
2015
	 */
2016
	cic = cfqd->active_cic;
2017
	if (!cic || !atomic_read(&cic->ioc->nr_tasks))
2018
		return;
2019

2020
	/*
2021
	 * If our average think time is larger than the remaining time
2022
	 * slice, then don't idle. This avoids overrunning the allotted
2023
	 * time slice.
2024
	 */
2025
	if (sample_valid(cic->ttime_samples) &&
2026
	    (cfqq->slice_end - jiffies < cic->ttime_mean)) {
2027
		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
2028
			     cic->ttime_mean);
2029
		return;
2030
	}
2031

2032
	/* There are other queues in the group, don't do group idle */
2033
	if (group_idle && cfqq->cfqg->nr_cfqq > 1)
2034
		return;
2035

2036
	cfq_mark_cfqq_wait_request(cfqq);
2037

2038
	if (group_idle)
2039
		sl = cfqd->cfq_group_idle;
2040
	else
2041
		sl = cfqd->cfq_slice_idle;
2042

2043
	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
2044
	cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
2045
	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
2046
			group_idle ? 1 : 0);
2047
}
2048

2049
/*
2050
 * Move request from internal lists to the request queue dispatch list.
2051
 */
2052
static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2053
{
2054
	struct cfq_data *cfqd = q->elevator->elevator_data;
2055
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
2056

2057
	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
2058

2059
	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
2060
	cfq_remove_request(rq);
2061
	cfqq->dispatched++;
2062
	(RQ_CFQG(rq))->dispatched++;
2063
	elv_dispatch_sort(q, rq);
2064

2065
	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
2066
	cfqq->nr_sectors += blk_rq_sectors(rq);
2067
	cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
2068
					rq_data_dir(rq), rq_is_sync(rq));
2069
}
2070

2071
/*
2072
 * return expired entry, or NULL to just start from scratch in rbtree
2073
 */
2074
static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
2075
{
2076
	struct request *rq = NULL;
2077

2078
	if (cfq_cfqq_fifo_expire(cfqq))
2079
		return NULL;
2080

2081
	cfq_mark_cfqq_fifo_expire(cfqq);
2082

2083
	if (list_empty(&cfqq->fifo))
2084
		return NULL;
2085

2086
	rq = rq_entry_fifo(cfqq->fifo.next);
2087
	if (time_before(jiffies, rq_fifo_time(rq)))
2088
		rq = NULL;
2089

2090
	cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
2091
	return rq;
2092
}
2093

2094
static inline int
2095
cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2096
{
2097
	const int base_rq = cfqd->cfq_slice_async_rq;
2098

2099
	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
2100

2101
	return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
2102
}
2103

2104
/*
2105
 * Must be called with the queue_lock held.
2106
 */
2107
static int cfqq_process_refs(struct cfq_queue *cfqq)
2108
{
2109
	int process_refs, io_refs;
2110

2111
	io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
2112
	process_refs = cfqq->ref - io_refs;
2113
	BUG_ON(process_refs < 0);
2114
	return process_refs;
2115
}
2116

2117
static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
2118
{
2119
	int process_refs, new_process_refs;
2120
	struct cfq_queue *__cfqq;
2121

2122
	/*
2123
	 * If there are no process references on the new_cfqq, then it is
2124
	 * unsafe to follow the ->new_cfqq chain as other cfqq's in the
2125
	 * chain may have dropped their last reference (not just their
2126
	 * last process reference).
2127
	 */
2128
	if (!cfqq_process_refs(new_cfqq))
2129
		return;
2130

2131
	/* Avoid a circular list and skip interim queue merges */
2132
	while ((__cfqq = new_cfqq->new_cfqq)) {
2133
		if (__cfqq == cfqq)
2134
			return;
2135
		new_cfqq = __cfqq;
2136
	}
2137

2138
	process_refs = cfqq_process_refs(cfqq);
2139
	new_process_refs = cfqq_process_refs(new_cfqq);
2140
	/*
2141
	 * If the process for the cfqq has gone away, there is no
2142
	 * sense in merging the queues.
2143
	 */
2144
	if (process_refs == 0 || new_process_refs == 0)
2145
		return;
2146

2147
	/*
2148
	 * Merge in the direction of the lesser amount of work.
2149
	 */
2150
	if (new_process_refs >= process_refs) {
2151
		cfqq->new_cfqq = new_cfqq;
2152
		new_cfqq->ref += process_refs;
2153
	} else {
2154
		new_cfqq->new_cfqq = cfqq;
2155
		cfqq->ref += new_process_refs;
2156
	}
2157
}
2158

2159
static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
2160
				struct cfq_group *cfqg, enum wl_prio_t prio)
2161
{
2162
	struct cfq_queue *queue;
2163
	int i;
2164
	bool key_valid = false;
2165
	unsigned long lowest_key = 0;
2166
	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
2167

2168
	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
2169
		/* select the one with lowest rb_key */
2170
		queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
2171
		if (queue &&
2172
		    (!key_valid || time_before(queue->rb_key, lowest_key))) {
2173
			lowest_key = queue->rb_key;
2174
			cur_best = i;
2175
			key_valid = true;
2176
		}
2177
	}
2178

2179
	return cur_best;
2180
}
2181

2182
static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2183
{
2184
	unsigned slice;
2185
	unsigned count;
2186
	struct cfq_rb_root *st;
2187
	unsigned group_slice;
2188
	enum wl_prio_t original_prio = cfqd->serving_prio;
2189

2190
	/* Choose next priority. RT > BE > IDLE */
2191
	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
2192
		cfqd->serving_prio = RT_WORKLOAD;
2193
	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2194
		cfqd->serving_prio = BE_WORKLOAD;
2195
	else {
2196
		cfqd->serving_prio = IDLE_WORKLOAD;
2197
		cfqd->workload_expires = jiffies + 1;
2198
		return;
2199
	}
2200

2201
	if (original_prio != cfqd->serving_prio)
2202
		goto new_workload;
2203

2204
	/*
2205
	 * For RT and BE, we have to choose also the type
2206
	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
2207
	 * expiration time
2208
	 */
2209
	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2210
	count = st->count;
2211

2212
	/*
2213
	 * check workload expiration, and that we still have other queues ready
2214
	 */
2215
	if (count && !time_after(jiffies, cfqd->workload_expires))
2216
		return;
2217

2218
new_workload:
2219
	/* otherwise select new workload type */
2220
	cfqd->serving_type =
2221
		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
2222
	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2223
	count = st->count;
2224

2225
	/*
2226
	 * the workload slice is computed as a fraction of target latency
2227
	 * proportional to the number of queues in that workload, over
2228
	 * all the queues in the same priority class
2229
	 */
2230
	group_slice = cfq_group_slice(cfqd, cfqg);
2231

2232
	slice = group_slice * count /
2233
		max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
2234
		      cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
2235

2236
	if (cfqd->serving_type == ASYNC_WORKLOAD) {
2237
		unsigned int tmp;
2238

2239
		/*
2240
		 * Async queues are currently system wide. Just taking
2241
		 * proportion of queues with-in same group will lead to higher
2242
		 * async ratio system wide as generally root group is going
2243
		 * to have higher weight. A more accurate thing would be to
2244
		 * calculate system wide asnc/sync ratio.
2245
		 */
2246
		tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
2247
		tmp = tmp/cfqd->busy_queues;
2248
		slice = min_t(unsigned, slice, tmp);
2249

2250
		/* async workload slice is scaled down according to
2251
		 * the sync/async slice ratio. */
2252
		slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
2253
	} else
2254
		/* sync workload slice is at least 2 * cfq_slice_idle */
2255
		slice = max(slice, 2 * cfqd->cfq_slice_idle);
2256

2257
	slice = max_t(unsigned, slice, CFQ_MIN_TT);
2258
	cfq_log(cfqd, "workload slice:%d", slice);
2259
	cfqd->workload_expires = jiffies + slice;
2260
}
2261

2262
static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2263
{
2264
	struct cfq_rb_root *st = &cfqd->grp_service_tree;
2265
	struct cfq_group *cfqg;
2266

2267
	if (RB_EMPTY_ROOT(&st->rb))
2268
		return NULL;
2269
	cfqg = cfq_rb_first_group(st);
2270
	update_min_vdisktime(st);
2271
	return cfqg;
2272
}
2273

2274
static void cfq_choose_cfqg(struct cfq_data *cfqd)
2275
{
2276
	struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
2277

2278
	cfqd->serving_group = cfqg;
2279

2280
	/* Restore the workload type data */
2281
	if (cfqg->saved_workload_slice) {
2282
		cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
2283
		cfqd->serving_type = cfqg->saved_workload;
2284
		cfqd->serving_prio = cfqg->saved_serving_prio;
2285
	} else
2286
		cfqd->workload_expires = jiffies - 1;
2287

2288
	choose_service_tree(cfqd, cfqg);
2289
}
2290

2291
/*
2292
 * Select a queue for service. If we have a current active queue,
2293
 * check whether to continue servicing it, or retrieve and set a new one.
2294
 */
2295
static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2296
{
2297
	struct cfq_queue *cfqq, *new_cfqq = NULL;
2298

2299
	cfqq = cfqd->active_queue;
2300
	if (!cfqq)
2301
		goto new_queue;
2302

2303
	if (!cfqd->rq_queued)
2304
		return NULL;
2305

2306
	/*
2307
	 * We were waiting for group to get backlogged. Expire the queue
2308
	 */
2309
	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
2310
		goto expire;
2311

2312
	/*
2313
	 * The active queue has run out of time, expire it and select new.
2314
	 */
2315
	if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
2316
		/*
2317
		 * If slice had not expired at the completion of last request
2318
		 * we might not have turned on wait_busy flag. Don't expire
2319
		 * the queue yet. Allow the group to get backlogged.
2320
		 *
2321
		 * The very fact that we have used the slice, that means we
2322
		 * have been idling all along on this queue and it should be
2323
		 * ok to wait for this request to complete.
2324
		 */
2325
		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
2326
		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2327
			cfqq = NULL;
2328
			goto keep_queue;
2329
		} else
2330
			goto check_group_idle;
2331
	}
2332

2333
	/*
2334
	 * The active queue has requests and isn't expired, allow it to
2335
	 * dispatch.
2336
	 */
2337
	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
2338
		goto keep_queue;
2339

2340
	/*
2341
	 * If another queue has a request waiting within our mean seek
2342
	 * distance, let it run.  The expire code will check for close
2343
	 * cooperators and put the close queue at the front of the service
2344
	 * tree.  If possible, merge the expiring queue with the new cfqq.
2345
	 */
2346
	new_cfqq = cfq_close_cooperator(cfqd, cfqq);
2347
	if (new_cfqq) {
2348
		if (!cfqq->new_cfqq)
2349
			cfq_setup_merge(cfqq, new_cfqq);
2350
		goto expire;
2351
	}
2352

2353
	/*
2354
	 * No requests pending. If the active queue still has requests in
2355
	 * flight or is idling for a new request, allow either of these
2356
	 * conditions to happen (or time out) before selecting a new queue.
2357
	 */
2358
	if (timer_pending(&cfqd->idle_slice_timer)) {
2359
		cfqq = NULL;
2360
		goto keep_queue;
2361
	}
2362

2363
	/*
2364
	 * This is a deep seek queue, but the device is much faster than
2365
	 * the queue can deliver, don't idle
2366
	 **/
2367
	if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
2368
	    (cfq_cfqq_slice_new(cfqq) ||
2369
	    (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
2370
		cfq_clear_cfqq_deep(cfqq);
2371
		cfq_clear_cfqq_idle_window(cfqq);
2372
	}
2373

2374
	if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2375
		cfqq = NULL;
2376
		goto keep_queue;
2377
	}
2378

2379
	/*
2380
	 * If group idle is enabled and there are requests dispatched from
2381
	 * this group, wait for requests to complete.
2382
	 */
2383
check_group_idle:
2384
	if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1
2385
	    && cfqq->cfqg->dispatched) {
2386
		cfqq = NULL;
2387
		goto keep_queue;
2388
	}
2389

2390
expire:
2391
	cfq_slice_expired(cfqd, 0);
2392
new_queue:
2393
	/*
2394
	 * Current queue expired. Check if we have to switch to a new
2395
	 * service tree
2396
	 */
2397
	if (!new_cfqq)
2398
		cfq_choose_cfqg(cfqd);
2399

2400
	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
2401
keep_queue:
2402
	return cfqq;
2403
}
2404

2405
static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
2406
{
2407
	int dispatched = 0;
2408

2409
	while (cfqq->next_rq) {
2410
		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
2411
		dispatched++;
2412
	}
2413

2414
	BUG_ON(!list_empty(&cfqq->fifo));
2415

2416
	/* By default cfqq is not expired if it is empty. Do it explicitly */
2417
	__cfq_slice_expired(cfqq->cfqd, cfqq, 0);
2418
	return dispatched;
2419
}
2420

2421
/*
2422
 * Drain our current requests. Used for barriers and when switching
2423
 * io schedulers on-the-fly.
2424
 */
2425
static int cfq_forced_dispatch(struct cfq_data *cfqd)
2426
{
2427
	struct cfq_queue *cfqq;
2428
	int dispatched = 0;
2429

2430
	/* Expire the timeslice of the current active queue first */
2431
	cfq_slice_expired(cfqd, 0);
2432
	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
2433
		__cfq_set_active_queue(cfqd, cfqq);
2434
		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
2435
	}
2436

2437
	BUG_ON(cfqd->busy_queues);
2438

2439
	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
2440
	return dispatched;
2441
}
2442

2443
static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
2444
	struct cfq_queue *cfqq)
2445
{
2446
	/* the queue hasn't finished any request, can't estimate */
2447
	if (cfq_cfqq_slice_new(cfqq))
2448
		return true;
2449
	if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
2450
		cfqq->slice_end))
2451
		return true;
2452

2453
	return false;
2454
}
2455

2456
static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2457
{
2458
	unsigned int max_dispatch;
2459

2460
	/*
2461
	 * Drain async requests before we start sync IO
2462
	 */
2463
	if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
2464
		return false;
2465

2466
	/*
2467
	 * If this is an async queue and we have sync IO in flight, let it wait
2468
	 */
2469
	if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
2470
		return false;
2471

2472
	max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
2473
	if (cfq_class_idle(cfqq))
2474
		max_dispatch = 1;
2475

2476
	/*
2477
	 * Does this cfqq already have too much IO in flight?
2478
	 */
2479
	if (cfqq->dispatched >= max_dispatch) {
2480
		bool promote_sync = false;
2481
		/*
2482
		 * idle queue must always only have a single IO in flight
2483
		 */
2484
		if (cfq_class_idle(cfqq))
2485
			return false;
2486

2487
		/*
2488
		 * If there is only one sync queue
2489
		 * we can ignore async queue here and give the sync
2490
		 * queue no dispatch limit. The reason is a sync queue can
2491
		 * preempt async queue, limiting the sync queue doesn't make
2492
		 * sense. This is useful for aiostress test.
2493
		 */
2494
		if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
2495
			promote_sync = true;
2496

2497
		/*
2498
		 * We have other queues, don't allow more IO from this one
2499
		 */
2500
		if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
2501
				!promote_sync)
2502
			return false;
2503

2504
		/*
2505
		 * Sole queue user, no limit
2506
		 */
2507
		if (cfqd->busy_queues == 1 || promote_sync)
2508
			max_dispatch = -1;
2509
		else
2510
			/*
2511
			 * Normally we start throttling cfqq when cfq_quantum/2
2512
			 * requests have been dispatched. But we can drive
2513
			 * deeper queue depths at the beginning of slice
2514
			 * subjected to upper limit of cfq_quantum.
2515
			 * */
2516
			max_dispatch = cfqd->cfq_quantum;
2517
	}
2518

2519
	/*
2520
	 * Async queues must wait a bit before being allowed dispatch.
2521
	 * We also ramp up the dispatch depth gradually for async IO,
2522
	 * based on the last sync IO we serviced
2523
	 */
2524
	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
2525
		unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
2526
		unsigned int depth;
2527

2528
		depth = last_sync / cfqd->cfq_slice[1];
2529
		if (!depth && !cfqq->dispatched)
2530
			depth = 1;
2531
		if (depth < max_dispatch)
2532
			max_dispatch = depth;
2533
	}
2534

2535
	/*
2536
	 * If we're below the current max, allow a dispatch
2537
	 */
2538
	return cfqq->dispatched < max_dispatch;
2539
}
2540

2541
/*
2542
 * Dispatch a request from cfqq, moving them to the request queue
2543
 * dispatch list.
2544
 */
2545
static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2546
{
2547
	struct request *rq;
2548

2549
	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
2550

2551
	if (!cfq_may_dispatch(cfqd, cfqq))
2552
		return false;
2553

2554
	/*
2555
	 * follow expired path, else get first next available
2556
	 */
2557
	rq = cfq_check_fifo(cfqq);
2558
	if (!rq)
2559
		rq = cfqq->next_rq;
2560

2561
	/*
2562
	 * insert request into driver dispatch list
2563
	 */
2564
	cfq_dispatch_insert(cfqd->queue, rq);
2565

2566
	if (!cfqd->active_cic) {
2567
		struct cfq_io_context *cic = RQ_CIC(rq);
2568

2569
		atomic_long_inc(&cic->ioc->refcount);
2570
		cfqd->active_cic = cic;
2571
	}
2572

2573
	return true;
2574
}
2575

2576
/*
2577
 * Find the cfqq that we need to service and move a request from that to the
2578
 * dispatch list
2579
 */
2580
static int cfq_dispatch_requests(struct request_queue *q, int force)
2581
{
2582
	struct cfq_data *cfqd = q->elevator->elevator_data;
2583
	struct cfq_queue *cfqq;
2584

2585
	if (!cfqd->busy_queues)
2586
		return 0;
2587

2588
	if (unlikely(force))
2589
		return cfq_forced_dispatch(cfqd);
2590

2591
	cfqq = cfq_select_queue(cfqd);
2592
	if (!cfqq)
2593
		return 0;
2594

2595
	/*
2596
	 * Dispatch a request from this cfqq, if it is allowed
2597
	 */
2598
	if (!cfq_dispatch_request(cfqd, cfqq))
2599
		return 0;
2600

2601
	cfqq->slice_dispatch++;
2602
	cfq_clear_cfqq_must_dispatch(cfqq);
2603

2604
	/*
2605
	 * expire an async queue immediately if it has used up its slice. idle
2606
	 * queue always expire after 1 dispatch round.
2607
	 */
2608
	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
2609
	    cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
2610
	    cfq_class_idle(cfqq))) {
2611
		cfqq->slice_end = jiffies + 1;
2612
		cfq_slice_expired(cfqd, 0);
2613
	}
2614

2615
	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
2616
	return 1;
2617
}
2618

2619
/*
2620
 * task holds one reference to the queue, dropped when task exits. each rq
2621
 * in-flight on this queue also holds a reference, dropped when rq is freed.
2622
 *
2623
 * Each cfq queue took a reference on the parent group. Drop it now.
2624
 * queue lock must be held here.
2625
 */
2626
static void cfq_put_queue(struct cfq_queue *cfqq)
2627
{
2628
	struct cfq_data *cfqd = cfqq->cfqd;
2629
	struct cfq_group *cfqg;
2630

2631
	BUG_ON(cfqq->ref <= 0);
2632

2633
	cfqq->ref--;
2634
	if (cfqq->ref)
2635
		return;
2636

2637
	cfq_log_cfqq(cfqd, cfqq, "put_queue");
2638
	BUG_ON(rb_first(&cfqq->sort_list));
2639
	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
2640
	cfqg = cfqq->cfqg;
2641

2642
	if (unlikely(cfqd->active_queue == cfqq)) {
2643
		__cfq_slice_expired(cfqd, cfqq, 0);
2644
		cfq_schedule_dispatch(cfqd);
2645
	}
2646

2647
	BUG_ON(cfq_cfqq_on_rr(cfqq));
2648
	kmem_cache_free(cfq_pool, cfqq);
2649
	cfq_put_cfqg(cfqg);
2650
}
2651

2652
/*
2653
 * Call func for each cic attached to this ioc.
2654
 */
2655
static void
2656
call_for_each_cic(struct io_context *ioc,
2657
		  void (*func)(struct io_context *, struct cfq_io_context *))
2658
{
2659
	struct cfq_io_context *cic;
2660
	struct hlist_node *n;
2661

2662
	rcu_read_lock();
2663

2664
	hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
2665
		func(ioc, cic);
2666

2667
	rcu_read_unlock();
2668
}
2669

2670
static void cfq_cic_free_rcu(struct rcu_head *head)
2671
{
2672
	struct cfq_io_context *cic;
2673

2674
	cic = container_of(head, struct cfq_io_context, rcu_head);
2675

2676
	kmem_cache_free(cfq_ioc_pool, cic);
2677
	elv_ioc_count_dec(cfq_ioc_count);
2678

2679
	if (ioc_gone) {
2680
		/*
2681
		 * CFQ scheduler is exiting, grab exit lock and check
2682
		 * the pending io context count. If it hits zero,
2683
		 * complete ioc_gone and set it back to NULL
2684
		 */
2685
		spin_lock(&ioc_gone_lock);
2686
		if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
2687
			complete(ioc_gone);
2688
			ioc_gone = NULL;
2689
		}
2690
		spin_unlock(&ioc_gone_lock);
2691
	}
2692
}
2693

2694
static void cfq_cic_free(struct cfq_io_context *cic)
2695
{
2696
	call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
2697
}
2698

2699
static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
2700
{
2701
	unsigned long flags;
2702
	unsigned long dead_key = (unsigned long) cic->key;
2703

2704
	BUG_ON(!(dead_key & CIC_DEAD_KEY));
2705

2706
	spin_lock_irqsave(&ioc->lock, flags);
2707
	radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
2708
	hlist_del_rcu(&cic->cic_list);
2709
	spin_unlock_irqrestore(&ioc->lock, flags);
2710

2711
	cfq_cic_free(cic);
2712
}
2713

2714
/*
2715
 * Must be called with rcu_read_lock() held or preemption otherwise disabled.
2716
 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
2717
 * and ->trim() which is called with the task lock held
2718
 */
2719
static void cfq_free_io_context(struct io_context *ioc)
2720
{
2721
	/*
2722
	 * ioc->refcount is zero here, or we are called from elv_unregister(),
2723
	 * so no more cic's are allowed to be linked into this ioc.  So it
2724
	 * should be ok to iterate over the known list, we will see all cic's
2725
	 * since no new ones are added.
2726
	 */
2727
	call_for_each_cic(ioc, cic_free_func);
2728
}
2729

2730
static void cfq_put_cooperator(struct cfq_queue *cfqq)
2731
{
2732
	struct cfq_queue *__cfqq, *next;
2733

2734
	/*
2735
	 * If this queue was scheduled to merge with another queue, be
2736
	 * sure to drop the reference taken on that queue (and others in
2737
	 * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
2738
	 */
2739
	__cfqq = cfqq->new_cfqq;
2740
	while (__cfqq) {
2741
		if (__cfqq == cfqq) {
2742
			WARN(1, "cfqq->new_cfqq loop detected\n");
2743
			break;
2744
		}
2745
		next = __cfqq->new_cfqq;
2746
		cfq_put_queue(__cfqq);
2747
		__cfqq = next;
2748
	}
2749
}
2750

2751
static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2752
{
2753
	if (unlikely(cfqq == cfqd->active_queue)) {
2754
		__cfq_slice_expired(cfqd, cfqq, 0);
2755
		cfq_schedule_dispatch(cfqd);
2756
	}
2757

2758
	cfq_put_cooperator(cfqq);
2759

2760
	cfq_put_queue(cfqq);
2761
}
2762

2763
static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
2764
					 struct cfq_io_context *cic)
2765
{
2766
	struct io_context *ioc = cic->ioc;
2767

2768
	list_del_init(&cic->queue_list);
2769

2770
	/*
2771
	 * Make sure dead mark is seen for dead queues
2772
	 */
2773
	smp_wmb();
2774
	cic->key = cfqd_dead_key(cfqd);
2775

2776
	rcu_read_lock();
2777
	if (rcu_dereference(ioc->ioc_data) == cic) {
2778
		rcu_read_unlock();
2779
		spin_lock(&ioc->lock);
2780
		rcu_assign_pointer(ioc->ioc_data, NULL);
2781
		spin_unlock(&ioc->lock);
2782
	} else
2783
		rcu_read_unlock();
2784

2785
	if (cic->cfqq[BLK_RW_ASYNC]) {
2786
		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
2787
		cic->cfqq[BLK_RW_ASYNC] = NULL;
2788
	}
2789

2790
	if (cic->cfqq[BLK_RW_SYNC]) {
2791
		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
2792
		cic->cfqq[BLK_RW_SYNC] = NULL;
2793
	}
2794
}
2795

2796
static void cfq_exit_single_io_context(struct io_context *ioc,
2797
				       struct cfq_io_context *cic)
2798
{
2799
	struct cfq_data *cfqd = cic_to_cfqd(cic);
2800

2801
	if (cfqd) {
2802
		struct request_queue *q = cfqd->queue;
2803
		unsigned long flags;
2804

2805
		spin_lock_irqsave(q->queue_lock, flags);
2806

2807
		/*
2808
		 * Ensure we get a fresh copy of the ->key to prevent
2809
		 * race between exiting task and queue
2810
		 */
2811
		smp_read_barrier_depends();
2812
		if (cic->key == cfqd)
2813
			__cfq_exit_single_io_context(cfqd, cic);
2814

2815
		spin_unlock_irqrestore(q->queue_lock, flags);
2816
	}
2817
}
2818

2819
/*
2820
 * The process that ioc belongs to has exited, we need to clean up
2821
 * and put the internal structures we have that belongs to that process.
2822
 */
2823
static void cfq_exit_io_context(struct io_context *ioc)
2824
{
2825
	call_for_each_cic(ioc, cfq_exit_single_io_context);
2826
}
2827

2828
static struct cfq_io_context *
2829
cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
2830
{
2831
	struct cfq_io_context *cic;
2832

2833
	cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
2834
							cfqd->queue->node);
2835
	if (cic) {
2836
		cic->last_end_request = jiffies;
2837
		INIT_LIST_HEAD(&cic->queue_list);
2838
		INIT_HLIST_NODE(&cic->cic_list);
2839
		cic->dtor = cfq_free_io_context;
2840
		cic->exit = cfq_exit_io_context;
2841
		elv_ioc_count_inc(cfq_ioc_count);
2842
	}
2843

2844
	return cic;
2845
}
2846

2847
static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2848
{
2849
	struct task_struct *tsk = current;
2850
	int ioprio_class;
2851

2852
	if (!cfq_cfqq_prio_changed(cfqq))
2853
		return;
2854

2855
	ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
2856
	switch (ioprio_class) {
2857
	default:
2858
		printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
2859
	case IOPRIO_CLASS_NONE:
2860
		/*
2861
		 * no prio set, inherit CPU scheduling settings
2862
		 */
2863
		cfqq->ioprio = task_nice_ioprio(tsk);
2864
		cfqq->ioprio_class = task_nice_ioclass(tsk);
2865
		break;
2866
	case IOPRIO_CLASS_RT:
2867
		cfqq->ioprio = task_ioprio(ioc);
2868
		cfqq->ioprio_class = IOPRIO_CLASS_RT;
2869
		break;
2870
	case IOPRIO_CLASS_BE:
2871
		cfqq->ioprio = task_ioprio(ioc);
2872
		cfqq->ioprio_class = IOPRIO_CLASS_BE;
2873
		break;
2874
	case IOPRIO_CLASS_IDLE:
2875
		cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
2876
		cfqq->ioprio = 7;
2877
		cfq_clear_cfqq_idle_window(cfqq);
2878
		break;
2879
	}
2880

2881
	/*
2882
	 * keep track of original prio settings in case we have to temporarily
2883
	 * elevate the priority of this queue
2884
	 */
2885
	cfqq->org_ioprio = cfqq->ioprio;
2886
	cfqq->org_ioprio_class = cfqq->ioprio_class;
2887
	cfq_clear_cfqq_prio_changed(cfqq);
2888
}
2889

2890
static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
2891
{
2892
	struct cfq_data *cfqd = cic_to_cfqd(cic);
2893
	struct cfq_queue *cfqq;
2894
	unsigned long flags;
2895

2896
	if (unlikely(!cfqd))
2897
		return;
2898

2899
	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2900

2901
	cfqq = cic->cfqq[BLK_RW_ASYNC];
2902
	if (cfqq) {
2903
		struct cfq_queue *new_cfqq;
2904
		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
2905
						GFP_ATOMIC);
2906
		if (new_cfqq) {
2907
			cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
2908
			cfq_put_queue(cfqq);
2909
		}
2910
	}
2911

2912
	cfqq = cic->cfqq[BLK_RW_SYNC];
2913
	if (cfqq)
2914
		cfq_mark_cfqq_prio_changed(cfqq);
2915

2916
	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2917
}
2918

2919
static void cfq_ioc_set_ioprio(struct io_context *ioc)
2920
{
2921
	call_for_each_cic(ioc, changed_ioprio);
2922
	ioc->ioprio_changed = 0;
2923
}
2924

2925
static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2926
			  pid_t pid, bool is_sync)
2927
{
2928
	RB_CLEAR_NODE(&cfqq->rb_node);
2929
	RB_CLEAR_NODE(&cfqq->p_node);
2930
	INIT_LIST_HEAD(&cfqq->fifo);
2931

2932
	cfqq->ref = 0;
2933
	cfqq->cfqd = cfqd;
2934

2935
	cfq_mark_cfqq_prio_changed(cfqq);
2936

2937
	if (is_sync) {
2938
		if (!cfq_class_idle(cfqq))
2939
			cfq_mark_cfqq_idle_window(cfqq);
2940
		cfq_mark_cfqq_sync(cfqq);
2941
	}
2942
	cfqq->pid = pid;
2943
}
2944

2945
#ifdef CONFIG_CFQ_GROUP_IOSCHED
2946
static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
2947
{
2948
	struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
2949
	struct cfq_data *cfqd = cic_to_cfqd(cic);
2950
	unsigned long flags;
2951
	struct request_queue *q;
2952

2953
	if (unlikely(!cfqd))
2954
		return;
2955

2956
	q = cfqd->queue;
2957

2958
	spin_lock_irqsave(q->queue_lock, flags);
2959

2960
	if (sync_cfqq) {
2961
		/*
2962
		 * Drop reference to sync queue. A new sync queue will be
2963
		 * assigned in new group upon arrival of a fresh request.
2964
		 */
2965
		cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
2966
		cic_set_cfqq(cic, NULL, 1);
2967
		cfq_put_queue(sync_cfqq);
2968
	}
2969

2970
	spin_unlock_irqrestore(q->queue_lock, flags);
2971
}
2972

2973
static void cfq_ioc_set_cgroup(struct io_context *ioc)
2974
{
2975
	call_for_each_cic(ioc, changed_cgroup);
2976
	ioc->cgroup_changed = 0;
2977
}
2978
#endif  /* CONFIG_CFQ_GROUP_IOSCHED */
2979

2980
static struct cfq_queue *
2981
cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
2982
		     struct io_context *ioc, gfp_t gfp_mask)
2983
{
2984
	struct cfq_queue *cfqq, *new_cfqq = NULL;
2985
	struct cfq_io_context *cic;
2986
	struct cfq_group *cfqg;
2987

2988
retry:
2989
	cfqg = cfq_get_cfqg(cfqd);
2990
	cic = cfq_cic_lookup(cfqd, ioc);
2991
	/* cic always exists here */
2992
	cfqq = cic_to_cfqq(cic, is_sync);
2993

2994
	/*
2995
	 * Always try a new alloc if we fell back to the OOM cfqq
2996
	 * originally, since it should just be a temporary situation.
2997
	 */
2998
	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
2999
		cfqq = NULL;
3000
		if (new_cfqq) {
3001
			cfqq = new_cfqq;
3002
			new_cfqq = NULL;
3003
		} else if (gfp_mask & __GFP_WAIT) {
3004
			spin_unlock_irq(cfqd->queue->queue_lock);
3005
			new_cfqq = kmem_cache_alloc_node(cfq_pool,
3006
					gfp_mask | __GFP_ZERO,
3007
					cfqd->queue->node);
3008
			spin_lock_irq(cfqd->queue->queue_lock);
3009
			if (new_cfqq)
3010
				goto retry;
3011
		} else {
3012
			cfqq = kmem_cache_alloc_node(cfq_pool,
3013
					gfp_mask | __GFP_ZERO,
3014
					cfqd->queue->node);
3015
		}
3016

3017
		if (cfqq) {
3018
			cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3019
			cfq_init_prio_data(cfqq, ioc);
3020
			cfq_link_cfqq_cfqg(cfqq, cfqg);
3021
			cfq_log_cfqq(cfqd, cfqq, "alloced");
3022
		} else
3023
			cfqq = &cfqd->oom_cfqq;
3024
	}
3025

3026
	if (new_cfqq)
3027
		kmem_cache_free(cfq_pool, new_cfqq);
3028

3029
	return cfqq;
3030
}
3031

3032
static struct cfq_queue **
3033
cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
3034
{
3035
	switch (ioprio_class) {
3036
	case IOPRIO_CLASS_RT:
3037
		return &cfqd->async_cfqq[0][ioprio];
3038
	case IOPRIO_CLASS_BE:
3039
		return &cfqd->async_cfqq[1][ioprio];
3040
	case IOPRIO_CLASS_IDLE:
3041
		return &cfqd->async_idle_cfqq;
3042
	default:
3043
		BUG();
3044
	}
3045
}
3046

3047
static struct cfq_queue *
3048
cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
3049
	      gfp_t gfp_mask)
3050
{
3051
	const int ioprio = task_ioprio(ioc);
3052
	const int ioprio_class = task_ioprio_class(ioc);
3053
	struct cfq_queue **async_cfqq = NULL;
3054
	struct cfq_queue *cfqq = NULL;
3055

3056
	if (!is_sync) {
3057
		async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
3058
		cfqq = *async_cfqq;
3059
	}
3060

3061
	if (!cfqq)
3062
		cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
3063

3064
	/*
3065
	 * pin the queue now that it's allocated, scheduler exit will prune it
3066
	 */
3067
	if (!is_sync && !(*async_cfqq)) {
3068
		cfqq->ref++;
3069
		*async_cfqq = cfqq;
3070
	}
3071

3072
	cfqq->ref++;
3073
	return cfqq;
3074
}
3075

3076
/*
3077
 * We drop cfq io contexts lazily, so we may find a dead one.
3078
 */
3079
static void
3080
cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
3081
		  struct cfq_io_context *cic)
3082
{
3083
	unsigned long flags;
3084

3085
	WARN_ON(!list_empty(&cic->queue_list));
3086
	BUG_ON(cic->key != cfqd_dead_key(cfqd));
3087

3088
	spin_lock_irqsave(&ioc->lock, flags);
3089

3090
	BUG_ON(rcu_dereference_check(ioc->ioc_data,
3091
		lockdep_is_held(&ioc->lock)) == cic);
3092

3093
	radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
3094
	hlist_del_rcu(&cic->cic_list);
3095
	spin_unlock_irqrestore(&ioc->lock, flags);
3096

3097
	cfq_cic_free(cic);
3098
}
3099

3100
static struct cfq_io_context *
3101
cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
3102
{
3103
	struct cfq_io_context *cic;
3104
	unsigned long flags;
3105

3106
	if (unlikely(!ioc))
3107
		return NULL;
3108

3109
	rcu_read_lock();
3110

3111
	/*
3112
	 * we maintain a last-hit cache, to avoid browsing over the tree
3113
	 */
3114
	cic = rcu_dereference(ioc->ioc_data);
3115
	if (cic && cic->key == cfqd) {
3116
		rcu_read_unlock();
3117
		return cic;
3118
	}
3119

3120
	do {
3121
		cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
3122
		rcu_read_unlock();
3123
		if (!cic)
3124
			break;
3125
		if (unlikely(cic->key != cfqd)) {
3126
			cfq_drop_dead_cic(cfqd, ioc, cic);
3127
			rcu_read_lock();
3128
			continue;
3129
		}
3130

3131
		spin_lock_irqsave(&ioc->lock, flags);
3132
		rcu_assign_pointer(ioc->ioc_data, cic);
3133
		spin_unlock_irqrestore(&ioc->lock, flags);
3134
		break;
3135
	} while (1);
3136

3137
	return cic;
3138
}
3139

3140
/*
3141
 * Add cic into ioc, using cfqd as the search key. This enables us to lookup
3142
 * the process specific cfq io context when entered from the block layer.
3143
 * Also adds the cic to a per-cfqd list, used when this queue is removed.
3144
 */
3145
static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
3146
			struct cfq_io_context *cic, gfp_t gfp_mask)
3147
{
3148
	unsigned long flags;
3149
	int ret;
3150

3151
	ret = radix_tree_preload(gfp_mask);
3152
	if (!ret) {
3153
		cic->ioc = ioc;
3154
		cic->key = cfqd;
3155

3156
		spin_lock_irqsave(&ioc->lock, flags);
3157
		ret = radix_tree_insert(&ioc->radix_root,
3158
						cfqd->cic_index, cic);
3159
		if (!ret)
3160
			hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
3161
		spin_unlock_irqrestore(&ioc->lock, flags);
3162

3163
		radix_tree_preload_end();
3164

3165
		if (!ret) {
3166
			spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3167
			list_add(&cic->queue_list, &cfqd->cic_list);
3168
			spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3169
		}
3170
	}
3171

3172
	if (ret)
3173
		printk(KERN_ERR "cfq: cic link failed!\n");
3174

3175
	return ret;
3176
}
3177

3178
/*
3179
 * Setup general io context and cfq io context. There can be several cfq
3180
 * io contexts per general io context, if this process is doing io to more
3181
 * than one device managed by cfq.
3182
 */
3183
static struct cfq_io_context *
3184
cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
3185
{
3186
	struct io_context *ioc = NULL;
3187
	struct cfq_io_context *cic;
3188

3189
	might_sleep_if(gfp_mask & __GFP_WAIT);
3190

3191
	ioc = get_io_context(gfp_mask, cfqd->queue->node);
3192
	if (!ioc)
3193
		return NULL;
3194

3195
	cic = cfq_cic_lookup(cfqd, ioc);
3196
	if (cic)
3197
		goto out;
3198

3199
	cic = cfq_alloc_io_context(cfqd, gfp_mask);
3200
	if (cic == NULL)
3201
		goto err;
3202

3203
	if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
3204
		goto err_free;
3205

3206
out:
3207
	smp_read_barrier_depends();
3208
	if (unlikely(ioc->ioprio_changed))
3209
		cfq_ioc_set_ioprio(ioc);
3210

3211
#ifdef CONFIG_CFQ_GROUP_IOSCHED
3212
	if (unlikely(ioc->cgroup_changed))
3213
		cfq_ioc_set_cgroup(ioc);
3214
#endif
3215
	return cic;
3216
err_free:
3217
	cfq_cic_free(cic);
3218
err:
3219
	put_io_context(ioc);
3220
	return NULL;
3221
}
3222

3223
static void
3224
cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
3225
{
3226
	unsigned long elapsed = jiffies - cic->last_end_request;
3227
	unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
3228

3229
	cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
3230
	cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
3231
	cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
3232
}
3233

3234
static void
3235
cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3236
		       struct request *rq)
3237
{
3238
	sector_t sdist = 0;
3239
	sector_t n_sec = blk_rq_sectors(rq);
3240
	if (cfqq->last_request_pos) {
3241
		if (cfqq->last_request_pos < blk_rq_pos(rq))
3242
			sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3243
		else
3244
			sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3245
	}
3246

3247
	cfqq->seek_history <<= 1;
3248
	if (blk_queue_nonrot(cfqd->queue))
3249
		cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3250
	else
3251
		cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3252
}
3253

3254
/*
3255
 * Disable idle window if the process thinks too long or seeks so much that
3256
 * it doesn't matter
3257
 */
3258
static void
3259
cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3260
		       struct cfq_io_context *cic)
3261
{
3262
	int old_idle, enable_idle;
3263

3264
	/*
3265
	 * Don't idle for async or idle io prio class
3266
	 */
3267
	if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3268
		return;
3269

3270
	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3271

3272
	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3273
		cfq_mark_cfqq_deep(cfqq);
3274

3275
	if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3276
		enable_idle = 0;
3277
	else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
3278
	    (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3279
		enable_idle = 0;
3280
	else if (sample_valid(cic->ttime_samples)) {
3281
		if (cic->ttime_mean > cfqd->cfq_slice_idle)
3282
			enable_idle = 0;
3283
		else
3284
			enable_idle = 1;
3285
	}
3286

3287
	if (old_idle != enable_idle) {
3288
		cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3289
		if (enable_idle)
3290
			cfq_mark_cfqq_idle_window(cfqq);
3291
		else
3292
			cfq_clear_cfqq_idle_window(cfqq);
3293
	}
3294
}
3295

3296
/*
3297
 * Check if new_cfqq should preempt the currently active queue. Return 0 for
3298
 * no or if we aren't sure, a 1 will cause a preempt.
3299
 */
3300
static bool
3301
cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3302
		   struct request *rq)
3303
{
3304
	struct cfq_queue *cfqq;
3305

3306
	cfqq = cfqd->active_queue;
3307
	if (!cfqq)
3308
		return false;
3309

3310
	if (cfq_class_idle(new_cfqq))
3311
		return false;
3312

3313
	if (cfq_class_idle(cfqq))
3314
		return true;
3315

3316
	/*
3317
	 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3318
	 */
3319
	if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3320
		return false;
3321

3322
	/*
3323
	 * if the new request is sync, but the currently running queue is
3324
	 * not, let the sync request have priority.
3325
	 */
3326
	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
3327
		return true;
3328

3329
	if (new_cfqq->cfqg != cfqq->cfqg)
3330
		return false;
3331

3332
	if (cfq_slice_used(cfqq))
3333
		return true;
3334

3335
	/* Allow preemption only if we are idling on sync-noidle tree */
3336
	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
3337
	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3338
	    new_cfqq->service_tree->count == 2 &&
3339
	    RB_EMPTY_ROOT(&cfqq->sort_list))
3340
		return true;
3341

3342
	/*
3343
	 * So both queues are sync. Let the new request get disk time if
3344
	 * it's a metadata request and the current queue is doing regular IO.
3345
	 */
3346
	if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending)
3347
		return true;
3348

3349
	/*
3350
	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
3351
	 */
3352
	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3353
		return true;
3354

3355
	/* An idle queue should not be idle now for some reason */
3356
	if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
3357
		return true;
3358

3359
	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3360
		return false;
3361

3362
	/*
3363
	 * if this request is as-good as one we would expect from the
3364
	 * current cfqq, let it preempt
3365
	 */
3366
	if (cfq_rq_close(cfqd, cfqq, rq))
3367
		return true;
3368

3369
	return false;
3370
}
3371

3372
/*
3373
 * cfqq preempts the active queue. if we allowed preempt with no slice left,
3374
 * let it have half of its nominal slice.
3375
 */
3376
static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3377
{
3378
	struct cfq_queue *old_cfqq = cfqd->active_queue;
3379

3380
	cfq_log_cfqq(cfqd, cfqq, "preempt");
3381
	cfq_slice_expired(cfqd, 1);
3382

3383
	/*
3384
	 * workload type is changed, don't save slice, otherwise preempt
3385
	 * doesn't happen
3386
	 */
3387
	if (cfqq_type(old_cfqq) != cfqq_type(cfqq))
3388
		cfqq->cfqg->saved_workload_slice = 0;
3389

3390
	/*
3391
	 * Put the new queue at the front of the of the current list,
3392
	 * so we know that it will be selected next.
3393
	 */
3394
	BUG_ON(!cfq_cfqq_on_rr(cfqq));
3395

3396
	cfq_service_tree_add(cfqd, cfqq, 1);
3397

3398
	cfqq->slice_end = 0;
3399
	cfq_mark_cfqq_slice_new(cfqq);
3400
}
3401

3402
/*
3403
 * Called when a new fs request (rq) is added (to cfqq). Check if there's
3404
 * something we should do about it
3405
 */
3406
static void
3407
cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3408
		struct request *rq)
3409
{
3410
	struct cfq_io_context *cic = RQ_CIC(rq);
3411

3412
	cfqd->rq_queued++;
3413
	if (rq->cmd_flags & REQ_META)
3414
		cfqq->meta_pending++;
3415

3416
	cfq_update_io_thinktime(cfqd, cic);
3417
	cfq_update_io_seektime(cfqd, cfqq, rq);
3418
	cfq_update_idle_window(cfqd, cfqq, cic);
3419

3420
	cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3421

3422
	if (cfqq == cfqd->active_queue) {
3423
		/*
3424
		 * Remember that we saw a request from this process, but
3425
		 * don't start queuing just yet. Otherwise we risk seeing lots
3426
		 * of tiny requests, because we disrupt the normal plugging
3427
		 * and merging. If the request is already larger than a single
3428
		 * page, let it rip immediately. For that case we assume that
3429
		 * merging is already done. Ditto for a busy system that
3430
		 * has other work pending, don't risk delaying until the
3431
		 * idle timer unplug to continue working.
3432
		 */
3433
		if (cfq_cfqq_wait_request(cfqq)) {
3434
			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
3435
			    cfqd->busy_queues > 1) {
3436
				cfq_del_timer(cfqd, cfqq);
3437
				cfq_clear_cfqq_wait_request(cfqq);
3438
				__blk_run_queue(cfqd->queue);
3439
			} else {
3440
				cfq_blkiocg_update_idle_time_stats(
3441
						&cfqq->cfqg->blkg);
3442
				cfq_mark_cfqq_must_dispatch(cfqq);
3443
			}
3444
		}
3445
	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
3446
		/*
3447
		 * not the active queue - expire current slice if it is
3448
		 * idle and has expired it's mean thinktime or this new queue
3449
		 * has some old slice time left and is of higher priority or
3450
		 * this new queue is RT and the current one is BE
3451
		 */
3452
		cfq_preempt_queue(cfqd, cfqq);
3453
		__blk_run_queue(cfqd->queue);
3454
	}
3455
}
3456

3457
static void cfq_insert_request(struct request_queue *q, struct request *rq)
3458
{
3459
	struct cfq_data *cfqd = q->elevator->elevator_data;
3460
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3461

3462
	cfq_log_cfqq(cfqd, cfqq, "insert_request");
3463
	cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
3464

3465
	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3466
	list_add_tail(&rq->queuelist, &cfqq->fifo);
3467
	cfq_add_rq_rb(rq);
3468
	cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
3469
			&cfqd->serving_group->blkg, rq_data_dir(rq),
3470
			rq_is_sync(rq));
3471
	cfq_rq_enqueued(cfqd, cfqq, rq);
3472
}
3473

3474
/*
3475
 * Update hw_tag based on peak queue depth over 50 samples under
3476
 * sufficient load.
3477
 */
3478
static void cfq_update_hw_tag(struct cfq_data *cfqd)
3479
{
3480
	struct cfq_queue *cfqq = cfqd->active_queue;
3481

3482
	if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
3483
		cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
3484

3485
	if (cfqd->hw_tag == 1)
3486
		return;
3487

3488
	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
3489
	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
3490
		return;
3491

3492
	/*
3493
	 * If active queue hasn't enough requests and can idle, cfq might not
3494
	 * dispatch sufficient requests to hardware. Don't zero hw_tag in this
3495
	 * case
3496
	 */
3497
	if (cfqq && cfq_cfqq_idle_window(cfqq) &&
3498
	    cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
3499
	    CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
3500
		return;
3501

3502
	if (cfqd->hw_tag_samples++ < 50)
3503
		return;
3504

3505
	if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
3506
		cfqd->hw_tag = 1;
3507
	else
3508
		cfqd->hw_tag = 0;
3509
}
3510

3511
static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3512
{
3513
	struct cfq_io_context *cic = cfqd->active_cic;
3514

3515
	/* If the queue already has requests, don't wait */
3516
	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3517
		return false;
3518

3519
	/* If there are other queues in the group, don't wait */
3520
	if (cfqq->cfqg->nr_cfqq > 1)
3521
		return false;
3522

3523
	if (cfq_slice_used(cfqq))
3524
		return true;
3525

3526
	/* if slice left is less than think time, wait busy */
3527
	if (cic && sample_valid(cic->ttime_samples)
3528
	    && (cfqq->slice_end - jiffies < cic->ttime_mean))
3529
		return true;
3530

3531
	/*
3532
	 * If think times is less than a jiffy than ttime_mean=0 and above
3533
	 * will not be true. It might happen that slice has not expired yet
3534
	 * but will expire soon (4-5 ns) during select_queue(). To cover the
3535
	 * case where think time is less than a jiffy, mark the queue wait
3536
	 * busy if only 1 jiffy is left in the slice.
3537
	 */
3538
	if (cfqq->slice_end - jiffies == 1)
3539
		return true;
3540

3541
	return false;
3542
}
3543

3544
static void cfq_completed_request(struct request_queue *q, struct request *rq)
3545
{
3546
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3547
	struct cfq_data *cfqd = cfqq->cfqd;
3548
	const int sync = rq_is_sync(rq);
3549
	unsigned long now;
3550

3551
	now = jiffies;
3552
	cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
3553
		     !!(rq->cmd_flags & REQ_NOIDLE));
3554

3555
	cfq_update_hw_tag(cfqd);
3556

3557
	WARN_ON(!cfqd->rq_in_driver);
3558
	WARN_ON(!cfqq->dispatched);
3559
	cfqd->rq_in_driver--;
3560
	cfqq->dispatched--;
3561
	(RQ_CFQG(rq))->dispatched--;
3562
	cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
3563
			rq_start_time_ns(rq), rq_io_start_time_ns(rq),
3564
			rq_data_dir(rq), rq_is_sync(rq));
3565

3566
	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3567

3568
	if (sync) {
3569
		RQ_CIC(rq)->last_end_request = now;
3570
		if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3571
			cfqd->last_delayed_sync = now;
3572
	}
3573

3574
	/*
3575
	 * If this is the active queue, check if it needs to be expired,
3576
	 * or if we want to idle in case it has no pending requests.
3577
	 */
3578
	if (cfqd->active_queue == cfqq) {
3579
		const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
3580

3581
		if (cfq_cfqq_slice_new(cfqq)) {
3582
			cfq_set_prio_slice(cfqd, cfqq);
3583
			cfq_clear_cfqq_slice_new(cfqq);
3584
		}
3585

3586
		/*
3587
		 * Should we wait for next request to come in before we expire
3588
		 * the queue.
3589
		 */
3590
		if (cfq_should_wait_busy(cfqd, cfqq)) {
3591
			unsigned long extend_sl = cfqd->cfq_slice_idle;
3592
			if (!cfqd->cfq_slice_idle)
3593
				extend_sl = cfqd->cfq_group_idle;
3594
			cfqq->slice_end = jiffies + extend_sl;
3595
			cfq_mark_cfqq_wait_busy(cfqq);
3596
			cfq_log_cfqq(cfqd, cfqq, "will busy wait");
3597
		}
3598

3599
		/*
3600
		 * Idling is not enabled on:
3601
		 * - expired queues
3602
		 * - idle-priority queues
3603
		 * - async queues
3604
		 * - queues with still some requests queued
3605
		 * - when there is a close cooperator
3606
		 */
3607
		if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
3608
			cfq_slice_expired(cfqd, 1);
3609
		else if (sync && cfqq_empty &&
3610
			 !cfq_close_cooperator(cfqd, cfqq)) {
3611
			cfq_arm_slice_timer(cfqd);
3612
		}
3613
	}
3614

3615
	if (!cfqd->rq_in_driver)
3616
		cfq_schedule_dispatch(cfqd);
3617
}
3618

3619
/*
3620
 * we temporarily boost lower priority queues if they are holding fs exclusive
3621
 * resources. they are boosted to normal prio (CLASS_BE/4)
3622
 */
3623
static void cfq_prio_boost(struct cfq_queue *cfqq)
3624
{
3625
	if (has_fs_excl()) {
3626
		/*
3627
		 * boost idle prio on transactions that would lock out other
3628
		 * users of the filesystem
3629
		 */
3630
		if (cfq_class_idle(cfqq))
3631
			cfqq->ioprio_class = IOPRIO_CLASS_BE;
3632
		if (cfqq->ioprio > IOPRIO_NORM)
3633
			cfqq->ioprio = IOPRIO_NORM;
3634
	} else {
3635
		/*
3636
		 * unboost the queue (if needed)
3637
		 */
3638
		cfqq->ioprio_class = cfqq->org_ioprio_class;
3639
		cfqq->ioprio = cfqq->org_ioprio;
3640
	}
3641
}
3642

3643
static inline int __cfq_may_queue(struct cfq_queue *cfqq)
3644
{
3645
	if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
3646
		cfq_mark_cfqq_must_alloc_slice(cfqq);
3647
		return ELV_MQUEUE_MUST;
3648
	}
3649

3650
	return ELV_MQUEUE_MAY;
3651
}
3652

3653
static int cfq_may_queue(struct request_queue *q, int rw)
3654
{
3655
	struct cfq_data *cfqd = q->elevator->elevator_data;
3656
	struct task_struct *tsk = current;
3657
	struct cfq_io_context *cic;
3658
	struct cfq_queue *cfqq;
3659

3660
	/*
3661
	 * don't force setup of a queue from here, as a call to may_queue
3662
	 * does not necessarily imply that a request actually will be queued.
3663
	 * so just lookup a possibly existing queue, or return 'may queue'
3664
	 * if that fails
3665
	 */
3666
	cic = cfq_cic_lookup(cfqd, tsk->io_context);
3667
	if (!cic)
3668
		return ELV_MQUEUE_MAY;
3669

3670
	cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3671
	if (cfqq) {
3672
		cfq_init_prio_data(cfqq, cic->ioc);
3673
		cfq_prio_boost(cfqq);
3674

3675
		return __cfq_may_queue(cfqq);
3676
	}
3677

3678
	return ELV_MQUEUE_MAY;
3679
}
3680

3681
/*
3682
 * queue lock held here
3683
 */
3684
static void cfq_put_request(struct request *rq)
3685
{
3686
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
3687

3688
	if (cfqq) {
3689
		const int rw = rq_data_dir(rq);
3690

3691
		BUG_ON(!cfqq->allocated[rw]);
3692
		cfqq->allocated[rw]--;
3693

3694
		put_io_context(RQ_CIC(rq)->ioc);
3695

3696
		rq->elevator_private[0] = NULL;
3697
		rq->elevator_private[1] = NULL;
3698

3699
		/* Put down rq reference on cfqg */
3700
		cfq_put_cfqg(RQ_CFQG(rq));
3701
		rq->elevator_private[2] = NULL;
3702

3703
		cfq_put_queue(cfqq);
3704
	}
3705
}
3706

3707
static struct cfq_queue *
3708
cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
3709
		struct cfq_queue *cfqq)
3710
{
3711
	cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
3712
	cic_set_cfqq(cic, cfqq->new_cfqq, 1);
3713
	cfq_mark_cfqq_coop(cfqq->new_cfqq);
3714
	cfq_put_queue(cfqq);
3715
	return cic_to_cfqq(cic, 1);
3716
}
3717

3718
/*
3719
 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
3720
 * was the last process referring to said cfqq.
3721
 */
3722
static struct cfq_queue *
3723
split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
3724
{
3725
	if (cfqq_process_refs(cfqq) == 1) {
3726
		cfqq->pid = current->pid;
3727
		cfq_clear_cfqq_coop(cfqq);
3728
		cfq_clear_cfqq_split_coop(cfqq);
3729
		return cfqq;
3730
	}
3731

3732
	cic_set_cfqq(cic, NULL, 1);
3733

3734
	cfq_put_cooperator(cfqq);
3735

3736
	cfq_put_queue(cfqq);
3737
	return NULL;
3738
}
3739
/*
3740
 * Allocate cfq data structures associated with this request.
3741
 */
3742
static int
3743
cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
3744
{
3745
	struct cfq_data *cfqd = q->elevator->elevator_data;
3746
	struct cfq_io_context *cic;
3747
	const int rw = rq_data_dir(rq);
3748
	const bool is_sync = rq_is_sync(rq);
3749
	struct cfq_queue *cfqq;
3750
	unsigned long flags;
3751

3752
	might_sleep_if(gfp_mask & __GFP_WAIT);
3753

3754
	cic = cfq_get_io_context(cfqd, gfp_mask);
3755

3756
	spin_lock_irqsave(q->queue_lock, flags);
3757

3758
	if (!cic)
3759
		goto queue_fail;
3760

3761
new_queue:
3762
	cfqq = cic_to_cfqq(cic, is_sync);
3763
	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3764
		cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
3765
		cic_set_cfqq(cic, cfqq, is_sync);
3766
	} else {
3767
		/*
3768
		 * If the queue was seeky for too long, break it apart.
3769
		 */
3770
		if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
3771
			cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
3772
			cfqq = split_cfqq(cic, cfqq);
3773
			if (!cfqq)
3774
				goto new_queue;
3775
		}
3776

3777
		/*
3778
		 * Check to see if this queue is scheduled to merge with
3779
		 * another, closely cooperating queue.  The merging of
3780
		 * queues happens here as it must be done in process context.
3781
		 * The reference on new_cfqq was taken in merge_cfqqs.
3782
		 */
3783
		if (cfqq->new_cfqq)
3784
			cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
3785
	}
3786

3787
	cfqq->allocated[rw]++;
3788

3789
	cfqq->ref++;
3790
	rq->elevator_private[0] = cic;
3791
	rq->elevator_private[1] = cfqq;
3792
	rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
3793
	spin_unlock_irqrestore(q->queue_lock, flags);
3794
	return 0;
3795

3796
queue_fail:
3797
	cfq_schedule_dispatch(cfqd);
3798
	spin_unlock_irqrestore(q->queue_lock, flags);
3799
	cfq_log(cfqd, "set_request fail");
3800
	return 1;
3801
}
3802

3803
static void cfq_kick_queue(struct work_struct *work)
3804
{
3805
	struct cfq_data *cfqd =
3806
		container_of(work, struct cfq_data, unplug_work);
3807
	struct request_queue *q = cfqd->queue;
3808

3809
	spin_lock_irq(q->queue_lock);
3810
	__blk_run_queue(cfqd->queue);
3811
	spin_unlock_irq(q->queue_lock);
3812
}
3813

3814
/*
3815
 * Timer running if the active_queue is currently idling inside its time slice
3816
 */
3817
static void cfq_idle_slice_timer(unsigned long data)
3818
{
3819
	struct cfq_data *cfqd = (struct cfq_data *) data;
3820
	struct cfq_queue *cfqq;
3821
	unsigned long flags;
3822
	int timed_out = 1;
3823

3824
	cfq_log(cfqd, "idle timer fired");
3825

3826
	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3827

3828
	cfqq = cfqd->active_queue;
3829
	if (cfqq) {
3830
		timed_out = 0;
3831

3832
		/*
3833
		 * We saw a request before the queue expired, let it through
3834
		 */
3835
		if (cfq_cfqq_must_dispatch(cfqq))
3836
			goto out_kick;
3837

3838
		/*
3839
		 * expired
3840
		 */
3841
		if (cfq_slice_used(cfqq))
3842
			goto expire;
3843

3844
		/*
3845
		 * only expire and reinvoke request handler, if there are
3846
		 * other queues with pending requests
3847
		 */
3848
		if (!cfqd->busy_queues)
3849
			goto out_cont;
3850

3851
		/*
3852
		 * not expired and it has a request pending, let it dispatch
3853
		 */
3854
		if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3855
			goto out_kick;
3856

3857
		/*
3858
		 * Queue depth flag is reset only when the idle didn't succeed
3859
		 */
3860
		cfq_clear_cfqq_deep(cfqq);
3861
	}
3862
expire:
3863
	cfq_slice_expired(cfqd, timed_out);
3864
out_kick:
3865
	cfq_schedule_dispatch(cfqd);
3866
out_cont:
3867
	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3868
}
3869

3870
static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
3871
{
3872
	del_timer_sync(&cfqd->idle_slice_timer);
3873
	cancel_work_sync(&cfqd->unplug_work);
3874
}
3875

3876
static void cfq_put_async_queues(struct cfq_data *cfqd)
3877
{
3878
	int i;
3879

3880
	for (i = 0; i < IOPRIO_BE_NR; i++) {
3881
		if (cfqd->async_cfqq[0][i])
3882
			cfq_put_queue(cfqd->async_cfqq[0][i]);
3883
		if (cfqd->async_cfqq[1][i])
3884
			cfq_put_queue(cfqd->async_cfqq[1][i]);
3885
	}
3886

3887
	if (cfqd->async_idle_cfqq)
3888
		cfq_put_queue(cfqd->async_idle_cfqq);
3889
}
3890

3891
static void cfq_exit_queue(struct elevator_queue *e)
3892
{
3893
	struct cfq_data *cfqd = e->elevator_data;
3894
	struct request_queue *q = cfqd->queue;
3895
	bool wait = false;
3896

3897
	cfq_shutdown_timer_wq(cfqd);
3898

3899
	spin_lock_irq(q->queue_lock);
3900

3901
	if (cfqd->active_queue)
3902
		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
3903

3904
	while (!list_empty(&cfqd->cic_list)) {
3905
		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
3906
							struct cfq_io_context,
3907
							queue_list);
3908

3909
		__cfq_exit_single_io_context(cfqd, cic);
3910
	}
3911

3912
	cfq_put_async_queues(cfqd);
3913
	cfq_release_cfq_groups(cfqd);
3914

3915
	/*
3916
	 * If there are groups which we could not unlink from blkcg list,
3917
	 * wait for a rcu period for them to be freed.
3918
	 */
3919
	if (cfqd->nr_blkcg_linked_grps)
3920
		wait = true;
3921

3922
	spin_unlock_irq(q->queue_lock);
3923

3924
	cfq_shutdown_timer_wq(cfqd);
3925

3926
	spin_lock(&cic_index_lock);
3927
	ida_remove(&cic_index_ida, cfqd->cic_index);
3928
	spin_unlock(&cic_index_lock);
3929

3930
	/*
3931
	 * Wait for cfqg->blkg->key accessors to exit their grace periods.
3932
	 * Do this wait only if there are other unlinked groups out
3933
	 * there. This can happen if cgroup deletion path claimed the
3934
	 * responsibility of cleaning up a group before queue cleanup code
3935
	 * get to the group.
3936
	 *
3937
	 * Do not call synchronize_rcu() unconditionally as there are drivers
3938
	 * which create/delete request queue hundreds of times during scan/boot
3939
	 * and synchronize_rcu() can take significant time and slow down boot.
3940
	 */
3941
	if (wait)
3942
		synchronize_rcu();
3943

3944
#ifdef CONFIG_CFQ_GROUP_IOSCHED
3945
	/* Free up per cpu stats for root group */
3946
	free_percpu(cfqd->root_group.blkg.stats_cpu);
3947
#endif
3948
	kfree(cfqd);
3949
}
3950

3951
static int cfq_alloc_cic_index(void)
3952
{
3953
	int index, error;
3954

3955
	do {
3956
		if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
3957
			return -ENOMEM;
3958

3959
		spin_lock(&cic_index_lock);
3960
		error = ida_get_new(&cic_index_ida, &index);
3961
		spin_unlock(&cic_index_lock);
3962
		if (error && error != -EAGAIN)
3963
			return error;
3964
	} while (error);
3965

3966
	return index;
3967
}
3968

3969
static void *cfq_init_queue(struct request_queue *q)
3970
{
3971
	struct cfq_data *cfqd;
3972
	int i, j;
3973
	struct cfq_group *cfqg;
3974
	struct cfq_rb_root *st;
3975

3976
	i = cfq_alloc_cic_index();
3977
	if (i < 0)
3978
		return NULL;
3979

3980
	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3981
	if (!cfqd) {
3982
		spin_lock(&cic_index_lock);
3983
		ida_remove(&cic_index_ida, i);
3984
		spin_unlock(&cic_index_lock);
3985
		return NULL;
3986
	}
3987

3988
	/*
3989
	 * Don't need take queue_lock in the routine, since we are
3990
	 * initializing the ioscheduler, and nobody is using cfqd
3991
	 */
3992
	cfqd->cic_index = i;
3993

3994
	/* Init root service tree */
3995
	cfqd->grp_service_tree = CFQ_RB_ROOT;
3996

3997
	/* Init root group */
3998
	cfqg = &cfqd->root_group;
3999
	for_each_cfqg_st(cfqg, i, j, st)
4000
		*st = CFQ_RB_ROOT;
4001
	RB_CLEAR_NODE(&cfqg->rb_node);
4002

4003
	/* Give preference to root group over other groups */
4004
	cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
4005

4006
#ifdef CONFIG_CFQ_GROUP_IOSCHED
4007
	/*
4008
	 * Set root group reference to 2. One reference will be dropped when
4009
	 * all groups on cfqd->cfqg_list are being deleted during queue exit.
4010
	 * Other reference will remain there as we don't want to delete this
4011
	 * group as it is statically allocated and gets destroyed when
4012
	 * throtl_data goes away.
4013
	 */
4014
	cfqg->ref = 2;
4015

4016
	if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
4017
		kfree(cfqg);
4018
		kfree(cfqd);
4019
		return NULL;
4020
	}
4021

4022
	rcu_read_lock();
4023

4024
	cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
4025
					(void *)cfqd, 0);
4026
	rcu_read_unlock();
4027
	cfqd->nr_blkcg_linked_grps++;
4028

4029
	/* Add group on cfqd->cfqg_list */
4030
	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
4031
#endif
4032
	/*
4033
	 * Not strictly needed (since RB_ROOT just clears the node and we
4034
	 * zeroed cfqd on alloc), but better be safe in case someone decides
4035
	 * to add magic to the rb code
4036
	 */
4037
	for (i = 0; i < CFQ_PRIO_LISTS; i++)
4038
		cfqd->prio_trees[i] = RB_ROOT;
4039

4040
	/*
4041
	 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
4042
	 * Grab a permanent reference to it, so that the normal code flow
4043
	 * will not attempt to free it.
4044
	 */
4045
	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
4046
	cfqd->oom_cfqq.ref++;
4047
	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
4048

4049
	INIT_LIST_HEAD(&cfqd->cic_list);
4050

4051
	cfqd->queue = q;
4052

4053
	init_timer(&cfqd->idle_slice_timer);
4054
	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
4055
	cfqd->idle_slice_timer.data = (unsigned long) cfqd;
4056

4057
	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
4058

4059
	cfqd->cfq_quantum = cfq_quantum;
4060
	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
4061
	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
4062
	cfqd->cfq_back_max = cfq_back_max;
4063
	cfqd->cfq_back_penalty = cfq_back_penalty;
4064
	cfqd->cfq_slice[0] = cfq_slice_async;
4065
	cfqd->cfq_slice[1] = cfq_slice_sync;
4066
	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
4067
	cfqd->cfq_slice_idle = cfq_slice_idle;
4068
	cfqd->cfq_group_idle = cfq_group_idle;
4069
	cfqd->cfq_latency = 1;
4070
	cfqd->hw_tag = -1;
4071
	/*
4072
	 * we optimistically start assuming sync ops weren't delayed in last
4073
	 * second, in order to have larger depth for async operations.
4074
	 */
4075
	cfqd->last_delayed_sync = jiffies - HZ;
4076
	return cfqd;
4077
}
4078

4079
static void cfq_slab_kill(void)
4080
{
4081
	/*
4082
	 * Caller already ensured that pending RCU callbacks are completed,
4083
	 * so we should have no busy allocations at this point.
4084
	 */
4085
	if (cfq_pool)
4086
		kmem_cache_destroy(cfq_pool);
4087
	if (cfq_ioc_pool)
4088
		kmem_cache_destroy(cfq_ioc_pool);
4089
}
4090

4091
/*
 * Create the slab caches for cfq_queue and cfq_io_context.
 *
 * Returns 0 on success. If either cache cannot be created, whatever was
 * created is destroyed again and -ENOMEM is returned.
 */
static int __init cfq_slab_setup(void)
{
	cfq_pool = KMEM_CACHE(cfq_queue, 0);
	if (cfq_pool) {
		cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
		if (cfq_ioc_pool)
			return 0;
	}

	cfq_slab_kill();
	return -ENOMEM;
}
4106

4107
/*
4108
 * sysfs parts below -->
4109
 */
4110
/*
 * Format the tunable value @var as a decimal number followed by a newline
 * into @page.
 *
 * @var is unsigned, so it is printed with %u; %d would show values above
 * INT_MAX as negative numbers.
 *
 * Returns the number of characters written, as reported by sprintf().
 */
static ssize_t
cfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%u\n", var);
}
4115

4116
static ssize_t
4117
cfq_var_store(unsigned int *var, const char *page, size_t count)
4118
{
4119
	char *p = (char *) page;
4120

4121
	*var = simple_strtoul(p, &p, 10);
4122
	return count;
4123
}
4124

4125
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
4126
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
4127
{									\
4128
	struct cfq_data *cfqd = e->elevator_data;			\
4129
	unsigned int __data = __VAR;					\
4130
	if (__CONV)							\
4131
		__data = jiffies_to_msecs(__data);			\
4132
	return cfq_var_show(__data, (page));				\
4133
}
4134
SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
4135
SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
4136
SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
4137
SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
4138
SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
4139
SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
4140
SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
4141
SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4142
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4143
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4144
SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4145
#undef SHOW_FUNCTION
4146

4147
/*
 * Generate a sysfs store handler named __FUNC. The number parsed from the
 * page is clamped to [MIN, MAX] and written through __PTR (which may refer
 * to the local `cfqd`); if __CONV is non-zero it is converted from
 * milliseconds to jiffies first. The handler returns what cfq_var_store()
 * returned.
 */
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
{									\
	struct cfq_data *cfqd = e->elevator_data;			\
	unsigned int __val;						\
	int __ret = cfq_var_store(&__val, (page), count);		\
									\
	if (__val < (MIN))						\
		__val = (MIN);						\
	else if (__val > (MAX))						\
		__val = (MAX);						\
	*(__PTR) = (__CONV) ? msecs_to_jiffies(__val) : __val;		\
	return __ret;							\
}
STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
	       UINT_MAX, 1);
STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
	       UINT_MAX, 1);
STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
	       UINT_MAX, 0);
STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
	       UINT_MAX, 0);
STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
#undef STORE_FUNCTION
4179

4180
#define CFQ_ATTR(name) \
4181
	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
4182

4183
static struct elv_fs_entry cfq_attrs[] = {
4184
	CFQ_ATTR(quantum),
4185
	CFQ_ATTR(fifo_expire_sync),
4186
	CFQ_ATTR(fifo_expire_async),
4187
	CFQ_ATTR(back_seek_max),
4188
	CFQ_ATTR(back_seek_penalty),
4189
	CFQ_ATTR(slice_sync),
4190
	CFQ_ATTR(slice_async),
4191
	CFQ_ATTR(slice_async_rq),
4192
	CFQ_ATTR(slice_idle),
4193
	CFQ_ATTR(group_idle),
4194
	CFQ_ATTR(low_latency),
4195
	__ATTR_NULL
4196
};
4197

4198
static struct elevator_type iosched_cfq = {
4199
	.ops = {
4200
		.elevator_merge_fn = 		cfq_merge,
4201
		.elevator_merged_fn =		cfq_merged_request,
4202
		.elevator_merge_req_fn =	cfq_merged_requests,
4203
		.elevator_allow_merge_fn =	cfq_allow_merge,
4204
		.elevator_bio_merged_fn =	cfq_bio_merged,
4205
		.elevator_dispatch_fn =		cfq_dispatch_requests,
4206
		.elevator_add_req_fn =		cfq_insert_request,
4207
		.elevator_activate_req_fn =	cfq_activate_request,
4208
		.elevator_deactivate_req_fn =	cfq_deactivate_request,
4209
		.elevator_completed_req_fn =	cfq_completed_request,
4210
		.elevator_former_req_fn =	elv_rb_former_request,
4211
		.elevator_latter_req_fn =	elv_rb_latter_request,
4212
		.elevator_set_req_fn =		cfq_set_request,
4213
		.elevator_put_req_fn =		cfq_put_request,
4214
		.elevator_may_queue_fn =	cfq_may_queue,
4215
		.elevator_init_fn =		cfq_init_queue,
4216
		.elevator_exit_fn =		cfq_exit_queue,
4217
		.trim =				cfq_free_io_context,
4218
	},
4219
	.elevator_attrs =	cfq_attrs,
4220
	.elevator_name =	"cfq",
4221
	.elevator_owner =	THIS_MODULE,
4222
};
4223

4224
#ifdef CONFIG_CFQ_GROUP_IOSCHED
4225
static struct blkio_policy_type blkio_policy_cfq = {
4226
	.ops = {
4227
		.blkio_unlink_group_fn =	cfq_unlink_blkio_group,
4228
		.blkio_update_group_weight_fn =	cfq_update_blkio_group_weight,
4229
	},
4230
	.plid = BLKIO_POLICY_PROP,
4231
};
4232
#else
4233
static struct blkio_policy_type blkio_policy_cfq;
4234
#endif
4235

4236
/*
 * Module entry point: fix up the HZ-derived defaults, set up the slab
 * caches, then register the elevator and the blkio policy.
 *
 * Returns 0 on success, or -ENOMEM when cfq_slab_setup() reports failure.
 */
static int __init cfq_init(void)
{
	/*
	 * The defaults are integer fractions of HZ and can come out as 0
	 * on low-HZ builds; bump them to 1 in that case.
	 */
	if (cfq_slice_async == 0)
		cfq_slice_async = 1;
	if (cfq_slice_idle == 0)
		cfq_slice_idle = 1;

#ifdef CONFIG_CFQ_GROUP_IOSCHED
	if (cfq_group_idle == 0)
		cfq_group_idle = 1;
#else
	/* Without group scheduling the group idle default is forced to 0. */
	cfq_group_idle = 0;
#endif

	if (cfq_slab_setup() != 0)
		return -ENOMEM;

	elv_register(&iosched_cfq);
	blkio_policy_register(&blkio_policy_cfq);

	return 0;
}
4260

4261
/*
 * Module exit: unregister the blkio policy and the elevator, wait until
 * no io contexts remain, then destroy the ida and the slab caches.
 */
static void __exit cfq_exit(void)
{
	DECLARE_COMPLETION_ONSTACK(ioc_drained);

	blkio_policy_unregister(&blkio_policy_cfq);
	elv_unregister(&iosched_cfq);

	/*
	 * Publish the completion before sampling the io context count, so
	 * the update of ioc_gone is visible before ioc_count is read.
	 *
	 * NOTE(review): smp_wmb() orders stores against stores; confirm it
	 * is sufficient for the store-then-load ordering described here.
	 */
	ioc_gone = &ioc_drained;
	smp_wmb();

	/*
	 * Block while any io context is still counted.  Waiting here also
	 * keeps us from entering cfq_slab_kill() with RCU callbacks pending.
	 */
	if (elv_ioc_count_read(cfq_ioc_count) != 0)
		wait_for_completion(&ioc_drained);

	ida_destroy(&cic_index_ida);
	cfq_slab_kill();
}
4279

4280
module_init(cfq_init);
4281
module_exit(cfq_exit);
4282

4283
MODULE_AUTHOR("Jens Axboe");
4284
MODULE_LICENSE("GPL");
4285
MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
4286

4287
Product

Resources

Company