CoCalc -- blk-throttle.c

GitHub Repository: awilliam/linux-vfio
Path: blob/master/block/blk-throttle.c
¹⁷³⁷² views
1
/*
2
 * Interface for controlling IO bandwidth on a request queue
3
 *
4
 * Copyright (C) 2010 Vivek Goyal <[email protected]>
5
 */
6

7
#include <linux/module.h>
8
#include <linux/slab.h>
9
#include <linux/blkdev.h>
10
#include <linux/bio.h>
11
#include <linux/blktrace_api.h>
12
#include "blk-cgroup.h"
13

14
/* Max dispatch from a group in 1 round */
15
static int throtl_grp_quantum = 8;
16

17
/* Total max dispatch from all groups in one round */
18
static int throtl_quantum = 32;
19

20
/* Throttling is performed over 100ms slice and after that slice is renewed */
21
static unsigned long throtl_slice = HZ/10;	/* 100 ms */
22

23
/* A workqueue to queue throttle related work */
24
static struct workqueue_struct *kthrotld_workqueue;
25
static void throtl_schedule_delayed_work(struct throtl_data *td,
26
				unsigned long delay);
27

28
struct throtl_rb_root {
29
	struct rb_root rb;
30
	struct rb_node *left;
31
	unsigned int count;
32
	unsigned long min_disptime;
33
};
34

35
/* Compound-literal value of an empty service tree */
#define THROTL_RB_ROOT	(struct throtl_rb_root) {	\
		.rb = RB_ROOT,				\
		.left = NULL,				\
		.count = 0,				\
		.min_disptime = 0			\
	}
37

38
/* Map a service-tree rb_node back to the throtl_grp that embeds it */
#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
39

40
struct throtl_grp {
41
	/* List of throtl groups on the request queue*/
42
	struct hlist_node tg_node;
43

44
	/* active throtl group service_tree member */
45
	struct rb_node rb_node;
46

47
	/*
48
	 * Dispatch time in jiffies. This is the estimated time when group
49
	 * will unthrottle and is ready to dispatch more bio. It is used as
50
	 * key to sort active groups in service tree.
51
	 */
52
	unsigned long disptime;
53

54
	struct blkio_group blkg;
55
	atomic_t ref;
56
	unsigned int flags;
57

58
	/* Two lists for READ and WRITE */
59
	struct bio_list bio_lists[2];
60

61
	/* Number of queued bios on READ and WRITE lists */
62
	unsigned int nr_queued[2];
63

64
	/* bytes per second rate limits */
65
	uint64_t bps[2];
66

67
	/* IOPS limits */
68
	unsigned int iops[2];
69

70
	/* Number of bytes disptached in current slice */
71
	uint64_t bytes_disp[2];
72
	/* Number of bio's dispatched in current slice */
73
	unsigned int io_disp[2];
74

75
	/* When did we start a new slice */
76
	unsigned long slice_start[2];
77
	unsigned long slice_end[2];
78

79
	/* Some throttle limits got updated for the group */
80
	int limits_changed;
81

82
	struct rcu_head rcu_head;
83
};
84

85
struct throtl_data
86
{
87
	/* List of throtl groups */
88
	struct hlist_head tg_list;
89

90
	/* service tree for active throtl groups */
91
	struct throtl_rb_root tg_service_tree;
92

93
	struct throtl_grp *root_tg;
94
	struct request_queue *queue;
95

96
	/* Total Number of queued bios on READ and WRITE lists */
97
	unsigned int nr_queued[2];
98

99
	/*
100
	 * number of total undestroyed groups
101
	 */
102
	unsigned int nr_undestroyed_grps;
103

104
	/* Work for dispatching throttled bios */
105
	struct delayed_work throtl_work;
106

107
	int limits_changed;
108
};
109

110
/* Bit numbers used in throtl_grp->flags */
enum tg_state_flags {
	/* Group is on the round-robin busy list (the service tree) */
	THROTL_TG_FLAG_on_rr = 0,
};
113

114
#define THROTL_TG_FNS(name)						\
115
static inline void throtl_mark_tg_##name(struct throtl_grp *tg)		\
116
{									\
117
	(tg)->flags |= (1 << THROTL_TG_FLAG_##name);			\
118
}									\
119
static inline void throtl_clear_tg_##name(struct throtl_grp *tg)	\
120
{									\
121
	(tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);			\
122
}									\
123
static inline int throtl_tg_##name(const struct throtl_grp *tg)		\
124
{									\
125
	return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;	\
126
}
127

128
THROTL_TG_FNS(on_rr);
129

130
/*
 * Emit a blktrace message on td's queue, prefixed with "throtl" and the
 * path of the group's blkio_group.
 *
 * Expands to a single expression without a trailing semicolon (like
 * throtl_log()), so "throtl_log_tg(...);" is exactly one statement and is
 * safe as the body of an unbraced if/else.
 */
#define throtl_log_tg(td, tg, fmt, args...)				\
	blk_add_trace_msg((td)->queue, "throtl %s " fmt,		\
				blkg_path(&(tg)->blkg), ##args)
133

134
/* Emit a blktrace message on td's queue, prefixed with "throtl " */
#define throtl_log(td, fmt, args...)					\
	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
136

137
static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
138
{
139
	if (blkg)
140
		return container_of(blkg, struct throtl_grp, blkg);
141

142
	return NULL;
143
}
144

145
static inline int total_nr_queued(struct throtl_data *td)
146
{
147
	return (td->nr_queued[0] + td->nr_queued[1]);
148
}
149

150
static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
151
{
152
	atomic_inc(&tg->ref);
153
	return tg;
154
}
155

156
static void throtl_free_tg(struct rcu_head *head)
157
{
158
	struct throtl_grp *tg;
159

160
	tg = container_of(head, struct throtl_grp, rcu_head);
161
	free_percpu(tg->blkg.stats_cpu);
162
	kfree(tg);
163
}
164

165
static void throtl_put_tg(struct throtl_grp *tg)
166
{
167
	BUG_ON(atomic_read(&tg->ref) <= 0);
168
	if (!atomic_dec_and_test(&tg->ref))
169
		return;
170

171
	/*
172
	 * A group is freed in rcu manner. But having an rcu lock does not
173
	 * mean that one can access all the fields of blkg and assume these
174
	 * are valid. For example, don't try to follow throtl_data and
175
	 * request queue links.
176
	 *
177
	 * Having a reference to blkg under an rcu allows acess to only
178
	 * values local to groups like group stats and group rate limits
179
	 */
180
	call_rcu(&tg->rcu_head, throtl_free_tg);
181
}
182

183
static void throtl_init_group(struct throtl_grp *tg)
184
{
185
	INIT_HLIST_NODE(&tg->tg_node);
186
	RB_CLEAR_NODE(&tg->rb_node);
187
	bio_list_init(&tg->bio_lists[0]);
188
	bio_list_init(&tg->bio_lists[1]);
189
	tg->limits_changed = false;
190

191
	/* Practically unlimited BW */
192
	tg->bps[0] = tg->bps[1] = -1;
193
	tg->iops[0] = tg->iops[1] = -1;
194

195
	/*
196
	 * Take the initial reference that will be released on destroy
197
	 * This can be thought of a joint reference by cgroup and
198
	 * request queue which will be dropped by either request queue
199
	 * exit or cgroup deletion path depending on who is exiting first.
200
	 */
201
	atomic_set(&tg->ref, 1);
202
}
203

204
/* Should be called with rcu read lock held (needed for blkcg) */
205
static void
206
throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
207
{
208
	hlist_add_head(&tg->tg_node, &td->tg_list);
209
	td->nr_undestroyed_grps++;
210
}
211

212
static void
213
__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
214
{
215
	struct backing_dev_info *bdi = &td->queue->backing_dev_info;
216
	unsigned int major, minor;
217

218
	if (!tg || tg->blkg.dev)
219
		return;
220

221
	/*
222
	 * Fill in device details for a group which might not have been
223
	 * filled at group creation time as queue was being instantiated
224
	 * and driver had not attached a device yet
225
	 */
226
	if (bdi->dev && dev_name(bdi->dev)) {
227
		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
228
		tg->blkg.dev = MKDEV(major, minor);
229
	}
230
}
231

232
/*
233
 * Should be called with without queue lock held. Here queue lock will be
234
 * taken rarely. It will be taken only once during life time of a group
235
 * if need be
236
 */
237
static void
238
throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
239
{
240
	if (!tg || tg->blkg.dev)
241
		return;
242

243
	spin_lock_irq(td->queue->queue_lock);
244
	__throtl_tg_fill_dev_details(td, tg);
245
	spin_unlock_irq(td->queue->queue_lock);
246
}
247

248
static void throtl_init_add_tg_lists(struct throtl_data *td,
249
			struct throtl_grp *tg, struct blkio_cgroup *blkcg)
250
{
251
	__throtl_tg_fill_dev_details(td, tg);
252

253
	/* Add group onto cgroup list */
254
	blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
255
				tg->blkg.dev, BLKIO_POLICY_THROTL);
256

257
	tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
258
	tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
259
	tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
260
	tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
261

262
	throtl_add_group_to_td_list(td, tg);
263
}
264

265
/* Should be called without queue lock and outside of rcu period */
266
static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
267
{
268
	struct throtl_grp *tg = NULL;
269
	int ret;
270

271
	tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
272
	if (!tg)
273
		return NULL;
274

275
	ret = blkio_alloc_blkg_stats(&tg->blkg);
276

277
	if (ret) {
278
		kfree(tg);
279
		return NULL;
280
	}
281

282
	throtl_init_group(tg);
283
	return tg;
284
}
285

286
static struct
287
throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
288
{
289
	struct throtl_grp *tg = NULL;
290
	void *key = td;
291

292
	/*
293
	 * This is the common case when there are no blkio cgroups.
294
 	 * Avoid lookup in this case
295
 	 */
296
	if (blkcg == &blkio_root_cgroup)
297
		tg = td->root_tg;
298
	else
299
		tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
300

301
	__throtl_tg_fill_dev_details(td, tg);
302
	return tg;
303
}
304

305
/*
306
 * This function returns with queue lock unlocked in case of error, like
307
 * request queue is no more
308
 */
309
static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
310
{
311
	struct throtl_grp *tg = NULL, *__tg = NULL;
312
	struct blkio_cgroup *blkcg;
313
	struct request_queue *q = td->queue;
314

315
	rcu_read_lock();
316
	blkcg = task_blkio_cgroup(current);
317
	tg = throtl_find_tg(td, blkcg);
318
	if (tg) {
319
		rcu_read_unlock();
320
		return tg;
321
	}
322

323
	/*
324
	 * Need to allocate a group. Allocation of group also needs allocation
325
	 * of per cpu stats which in-turn takes a mutex() and can block. Hence
326
	 * we need to drop rcu lock and queue_lock before we call alloc
327
	 *
328
	 * Take the request queue reference to make sure queue does not
329
	 * go away once we return from allocation.
330
	 */
331
	blk_get_queue(q);
332
	rcu_read_unlock();
333
	spin_unlock_irq(q->queue_lock);
334

335
	tg = throtl_alloc_tg(td);
336
	/*
337
	 * We might have slept in group allocation. Make sure queue is not
338
	 * dead
339
	 */
340
	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
341
		blk_put_queue(q);
342
		if (tg)
343
			kfree(tg);
344

345
		return ERR_PTR(-ENODEV);
346
	}
347
	blk_put_queue(q);
348

349
	/* Group allocated and queue is still alive. take the lock */
350
	spin_lock_irq(q->queue_lock);
351

352
	/*
353
	 * Initialize the new group. After sleeping, read the blkcg again.
354
	 */
355
	rcu_read_lock();
356
	blkcg = task_blkio_cgroup(current);
357

358
	/*
359
	 * If some other thread already allocated the group while we were
360
	 * not holding queue lock, free up the group
361
	 */
362
	__tg = throtl_find_tg(td, blkcg);
363

364
	if (__tg) {
365
		kfree(tg);
366
		rcu_read_unlock();
367
		return __tg;
368
	}
369

370
	/* Group allocation failed. Account the IO to root group */
371
	if (!tg) {
372
		tg = td->root_tg;
373
		return tg;
374
	}
375

376
	throtl_init_add_tg_lists(td, tg, blkcg);
377
	rcu_read_unlock();
378
	return tg;
379
}
380

381
static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
382
{
383
	/* Service tree is empty */
384
	if (!root->count)
385
		return NULL;
386

387
	if (!root->left)
388
		root->left = rb_first(&root->rb);
389

390
	if (root->left)
391
		return rb_entry_tg(root->left);
392

393
	return NULL;
394
}
395

396
/* Remove @n from @root and mark the node as cleared */
static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);

	RB_CLEAR_NODE(n);
}
401

402
static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
403
{
404
	if (root->left == n)
405
		root->left = NULL;
406
	rb_erase_init(n, &root->rb);
407
	--root->count;
408
}
409

410
static void update_min_dispatch_time(struct throtl_rb_root *st)
411
{
412
	struct throtl_grp *tg;
413

414
	tg = throtl_rb_first(st);
415
	if (!tg)
416
		return;
417

418
	st->min_disptime = tg->disptime;
419
}
420

421
static void
422
tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
423
{
424
	struct rb_node **node = &st->rb.rb_node;
425
	struct rb_node *parent = NULL;
426
	struct throtl_grp *__tg;
427
	unsigned long key = tg->disptime;
428
	int left = 1;
429

430
	while (*node != NULL) {
431
		parent = *node;
432
		__tg = rb_entry_tg(parent);
433

434
		if (time_before(key, __tg->disptime))
435
			node = &parent->rb_left;
436
		else {
437
			node = &parent->rb_right;
438
			left = 0;
439
		}
440
	}
441

442
	if (left)
443
		st->left = &tg->rb_node;
444

445
	rb_link_node(&tg->rb_node, parent, node);
446
	rb_insert_color(&tg->rb_node, &st->rb);
447
}
448

449
static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
450
{
451
	struct throtl_rb_root *st = &td->tg_service_tree;
452

453
	tg_service_tree_add(st, tg);
454
	throtl_mark_tg_on_rr(tg);
455
	st->count++;
456
}
457

458
/* Put @tg on the service tree unless it is already there */
static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (throtl_tg_on_rr(tg))
		return;

	__throtl_enqueue_tg(td, tg);
}
463

464
static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
465
{
466
	throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
467
	throtl_clear_tg_on_rr(tg);
468
}
469

470
/* Take @tg off the service tree if it is currently on it */
static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (!throtl_tg_on_rr(tg))
		return;

	__throtl_dequeue_tg(td, tg);
}
475

476
static void throtl_schedule_next_dispatch(struct throtl_data *td)
477
{
478
	struct throtl_rb_root *st = &td->tg_service_tree;
479

480
	/*
481
	 * If there are more bios pending, schedule more work.
482
	 */
483
	if (!total_nr_queued(td))
484
		return;
485

486
	BUG_ON(!st->count);
487

488
	update_min_dispatch_time(st);
489

490
	if (time_before_eq(st->min_disptime, jiffies))
491
		throtl_schedule_delayed_work(td, 0);
492
	else
493
		throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
494
}
495

496
static inline void
497
throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
498
{
499
	tg->bytes_disp[rw] = 0;
500
	tg->io_disp[rw] = 0;
501
	tg->slice_start[rw] = jiffies;
502
	tg->slice_end[rw] = jiffies + throtl_slice;
503
	throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
504
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
505
			tg->slice_end[rw], jiffies);
506
}
507

508
/*
 * Set the end of the @rw slice to @jiffy_end rounded up to a multiple of
 * throtl_slice. Unlike throtl_extend_slice() this does not log.
 */
static inline void throtl_set_slice_end(struct throtl_data *td,
		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
	unsigned long rounded_end = roundup(jiffy_end, throtl_slice);

	tg->slice_end[rw] = rounded_end;
}
513

514
/*
 * Set the end of the @rw slice to @jiffy_end rounded up to a multiple of
 * throtl_slice, and log the resulting slice.
 */
static inline void throtl_extend_slice(struct throtl_data *td,
		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
	unsigned long rounded_end = roundup(jiffy_end, throtl_slice);

	tg->slice_end[rw] = rounded_end;
	throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}
522

523
/* Determine if previously allocated or extended slice is complete or not */
524
static bool
525
throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
526
{
527
	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
528
		return 0;
529

530
	return 1;
531
}
532

533
/* Trim the used slices and adjust slice start accordingly */
534
static inline void
535
throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
536
{
537
	unsigned long nr_slices, time_elapsed, io_trim;
538
	u64 bytes_trim, tmp;
539

540
	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
541

542
	/*
543
	 * If bps are unlimited (-1), then time slice don't get
544
	 * renewed. Don't try to trim the slice if slice is used. A new
545
	 * slice will start when appropriate.
546
	 */
547
	if (throtl_slice_used(td, tg, rw))
548
		return;
549

550
	/*
551
	 * A bio has been dispatched. Also adjust slice_end. It might happen
552
	 * that initially cgroup limit was very low resulting in high
553
	 * slice_end, but later limit was bumped up and bio was dispached
554
	 * sooner, then we need to reduce slice_end. A high bogus slice_end
555
	 * is bad because it does not allow new slice to start.
556
	 */
557

558
	throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);
559

560
	time_elapsed = jiffies - tg->slice_start[rw];
561

562
	nr_slices = time_elapsed / throtl_slice;
563

564
	if (!nr_slices)
565
		return;
566
	tmp = tg->bps[rw] * throtl_slice * nr_slices;
567
	do_div(tmp, HZ);
568
	bytes_trim = tmp;
569

570
	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
571

572
	if (!bytes_trim && !io_trim)
573
		return;
574

575
	if (tg->bytes_disp[rw] >= bytes_trim)
576
		tg->bytes_disp[rw] -= bytes_trim;
577
	else
578
		tg->bytes_disp[rw] = 0;
579

580
	if (tg->io_disp[rw] >= io_trim)
581
		tg->io_disp[rw] -= io_trim;
582
	else
583
		tg->io_disp[rw] = 0;
584

585
	tg->slice_start[rw] += nr_slices * throtl_slice;
586

587
	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
588
			" start=%lu end=%lu jiffies=%lu",
589
			rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
590
			tg->slice_start[rw], tg->slice_end[rw], jiffies);
591
}
592

593
static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
594
		struct bio *bio, unsigned long *wait)
595
{
596
	bool rw = bio_data_dir(bio);
597
	unsigned int io_allowed;
598
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
599
	u64 tmp;
600

601
	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
602

603
	/* Slice has just started. Consider one slice interval */
604
	if (!jiffy_elapsed)
605
		jiffy_elapsed_rnd = throtl_slice;
606

607
	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
608

609
	/*
610
	 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
611
	 * 1 then at max jiffy elapsed should be equivalent of 1 second as we
612
	 * will allow dispatch after 1 second and after that slice should
613
	 * have been trimmed.
614
	 */
615

616
	tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
617
	do_div(tmp, HZ);
618

619
	if (tmp > UINT_MAX)
620
		io_allowed = UINT_MAX;
621
	else
622
		io_allowed = tmp;
623

624
	if (tg->io_disp[rw] + 1 <= io_allowed) {
625
		if (wait)
626
			*wait = 0;
627
		return 1;
628
	}
629

630
	/* Calc approx time to dispatch */
631
	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
632

633
	if (jiffy_wait > jiffy_elapsed)
634
		jiffy_wait = jiffy_wait - jiffy_elapsed;
635
	else
636
		jiffy_wait = 1;
637

638
	if (wait)
639
		*wait = jiffy_wait;
640
	return 0;
641
}
642

643
static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
644
		struct bio *bio, unsigned long *wait)
645
{
646
	bool rw = bio_data_dir(bio);
647
	u64 bytes_allowed, extra_bytes, tmp;
648
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
649

650
	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
651

652
	/* Slice has just started. Consider one slice interval */
653
	if (!jiffy_elapsed)
654
		jiffy_elapsed_rnd = throtl_slice;
655

656
	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
657

658
	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
659
	do_div(tmp, HZ);
660
	bytes_allowed = tmp;
661

662
	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
663
		if (wait)
664
			*wait = 0;
665
		return 1;
666
	}
667

668
	/* Calc approx time to dispatch */
669
	extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
670
	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
671

672
	if (!jiffy_wait)
673
		jiffy_wait = 1;
674

675
	/*
676
	 * This wait time is without taking into consideration the rounding
677
	 * up we did. Add that time also.
678
	 */
679
	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
680
	if (wait)
681
		*wait = jiffy_wait;
682
	return 0;
683
}
684

685
/* True when @tg has neither a bps nor an iops limit for direction @rw */
static bool tg_no_rule_group(struct throtl_grp *tg, bool rw)
{
	return tg->bps[rw] == -1 && tg->iops[rw] == -1;
}
690

691
/*
692
 * Returns whether one can dispatch a bio or not. Also returns approx number
693
 * of jiffies to wait before this bio is with-in IO rate and can be dispatched
694
 */
695
static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
696
				struct bio *bio, unsigned long *wait)
697
{
698
	bool rw = bio_data_dir(bio);
699
	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
700

701
	/*
702
 	 * Currently whole state machine of group depends on first bio
703
	 * queued in the group bio list. So one should not be calling
704
	 * this function with a different bio if there are other bios
705
	 * queued.
706
	 */
707
	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
708

709
	/* If tg->bps = -1, then BW is unlimited */
710
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
711
		if (wait)
712
			*wait = 0;
713
		return 1;
714
	}
715

716
	/*
717
	 * If previous slice expired, start a new one otherwise renew/extend
718
	 * existing slice to make sure it is at least throtl_slice interval
719
	 * long since now.
720
	 */
721
	if (throtl_slice_used(td, tg, rw))
722
		throtl_start_new_slice(td, tg, rw);
723
	else {
724
		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
725
			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
726
	}
727

728
	if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
729
	    && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
730
		if (wait)
731
			*wait = 0;
732
		return 1;
733
	}
734

735
	max_wait = max(bps_wait, iops_wait);
736

737
	if (wait)
738
		*wait = max_wait;
739

740
	if (time_before(tg->slice_end[rw], jiffies + max_wait))
741
		throtl_extend_slice(td, tg, rw, jiffies + max_wait);
742

743
	return 0;
744
}
745

746
static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
747
{
748
	bool rw = bio_data_dir(bio);
749
	bool sync = bio->bi_rw & REQ_SYNC;
750

751
	/* Charge the bio to the group */
752
	tg->bytes_disp[rw] += bio->bi_size;
753
	tg->io_disp[rw]++;
754

755
	blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
756
}
757

758
static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
759
			struct bio *bio)
760
{
761
	bool rw = bio_data_dir(bio);
762

763
	bio_list_add(&tg->bio_lists[rw], bio);
764
	/* Take a bio reference on tg */
765
	throtl_ref_get_tg(tg);
766
	tg->nr_queued[rw]++;
767
	td->nr_queued[rw]++;
768
	throtl_enqueue_tg(td, tg);
769
}
770

771
static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
772
{
773
	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
774
	struct bio *bio;
775

776
	if ((bio = bio_list_peek(&tg->bio_lists[READ])))
777
		tg_may_dispatch(td, tg, bio, &read_wait);
778

779
	if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
780
		tg_may_dispatch(td, tg, bio, &write_wait);
781

782
	min_wait = min(read_wait, write_wait);
783
	disptime = jiffies + min_wait;
784

785
	/* Update dispatch time */
786
	throtl_dequeue_tg(td, tg);
787
	tg->disptime = disptime;
788
	throtl_enqueue_tg(td, tg);
789
}
790

791
static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
792
				bool rw, struct bio_list *bl)
793
{
794
	struct bio *bio;
795

796
	bio = bio_list_pop(&tg->bio_lists[rw]);
797
	tg->nr_queued[rw]--;
798
	/* Drop bio reference on tg */
799
	throtl_put_tg(tg);
800

801
	BUG_ON(td->nr_queued[rw] <= 0);
802
	td->nr_queued[rw]--;
803

804
	throtl_charge_bio(tg, bio);
805
	bio_list_add(bl, bio);
806
	bio->bi_rw |= REQ_THROTTLED;
807

808
	throtl_trim_slice(td, tg, rw);
809
}
810

811
static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
812
				struct bio_list *bl)
813
{
814
	unsigned int nr_reads = 0, nr_writes = 0;
815
	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
816
	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
817
	struct bio *bio;
818

819
	/* Try to dispatch 75% READS and 25% WRITES */
820

821
	while ((bio = bio_list_peek(&tg->bio_lists[READ]))
822
		&& tg_may_dispatch(td, tg, bio, NULL)) {
823

824
		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
825
		nr_reads++;
826

827
		if (nr_reads >= max_nr_reads)
828
			break;
829
	}
830

831
	while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
832
		&& tg_may_dispatch(td, tg, bio, NULL)) {
833

834
		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
835
		nr_writes++;
836

837
		if (nr_writes >= max_nr_writes)
838
			break;
839
	}
840

841
	return nr_reads + nr_writes;
842
}
843

844
static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
845
{
846
	unsigned int nr_disp = 0;
847
	struct throtl_grp *tg;
848
	struct throtl_rb_root *st = &td->tg_service_tree;
849

850
	while (1) {
851
		tg = throtl_rb_first(st);
852

853
		if (!tg)
854
			break;
855

856
		if (time_before(jiffies, tg->disptime))
857
			break;
858

859
		throtl_dequeue_tg(td, tg);
860

861
		nr_disp += throtl_dispatch_tg(td, tg, bl);
862

863
		if (tg->nr_queued[0] || tg->nr_queued[1]) {
864
			tg_update_disptime(td, tg);
865
			throtl_enqueue_tg(td, tg);
866
		}
867

868
		if (nr_disp >= throtl_quantum)
869
			break;
870
	}
871

872
	return nr_disp;
873
}
874

875
static void throtl_process_limit_change(struct throtl_data *td)
876
{
877
	struct throtl_grp *tg;
878
	struct hlist_node *pos, *n;
879

880
	if (!td->limits_changed)
881
		return;
882

883
	xchg(&td->limits_changed, false);
884

885
	throtl_log(td, "limits changed");
886

887
	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
888
		if (!tg->limits_changed)
889
			continue;
890

891
		if (!xchg(&tg->limits_changed, false))
892
			continue;
893

894
		throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
895
			" riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
896
			tg->iops[READ], tg->iops[WRITE]);
897

898
		/*
899
		 * Restart the slices for both READ and WRITES. It
900
		 * might happen that a group's limit are dropped
901
		 * suddenly and we don't want to account recently
902
		 * dispatched IO with new low rate
903
		 */
904
		throtl_start_new_slice(td, tg, 0);
905
		throtl_start_new_slice(td, tg, 1);
906

907
		if (throtl_tg_on_rr(tg))
908
			tg_update_disptime(td, tg);
909
	}
910
}
911

912
/* Dispatch throttled bios. Should be called without queue lock held. */
913
static int throtl_dispatch(struct request_queue *q)
914
{
915
	struct throtl_data *td = q->td;
916
	unsigned int nr_disp = 0;
917
	struct bio_list bio_list_on_stack;
918
	struct bio *bio;
919
	struct blk_plug plug;
920

921
	spin_lock_irq(q->queue_lock);
922

923
	throtl_process_limit_change(td);
924

925
	if (!total_nr_queued(td))
926
		goto out;
927

928
	bio_list_init(&bio_list_on_stack);
929

930
	throtl_log(td, "dispatch nr_queued=%d read=%u write=%u",
931
			total_nr_queued(td), td->nr_queued[READ],
932
			td->nr_queued[WRITE]);
933

934
	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
935

936
	if (nr_disp)
937
		throtl_log(td, "bios disp=%u", nr_disp);
938

939
	throtl_schedule_next_dispatch(td);
940
out:
941
	spin_unlock_irq(q->queue_lock);
942

943
	/*
944
	 * If we dispatched some requests, unplug the queue to make sure
945
	 * immediate dispatch
946
	 */
947
	if (nr_disp) {
948
		blk_start_plug(&plug);
949
		while((bio = bio_list_pop(&bio_list_on_stack)))
950
			generic_make_request(bio);
951
		blk_finish_plug(&plug);
952
	}
953
	return nr_disp;
954
}
955

956
void blk_throtl_work(struct work_struct *work)
957
{
958
	struct throtl_data *td = container_of(work, struct throtl_data,
959
					throtl_work.work);
960
	struct request_queue *q = td->queue;
961

962
	throtl_dispatch(q);
963
}
964

965
/* Call with queue lock held */
966
static void
967
throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
968
{
969

970
	struct delayed_work *dwork = &td->throtl_work;
971

972
	/* schedule work if limits changed even if no bio is queued */
973
	if (total_nr_queued(td) > 0 || td->limits_changed) {
974
		/*
975
		 * We might have a work scheduled to be executed in future.
976
		 * Cancel that and schedule a new one.
977
		 */
978
		__cancel_delayed_work(dwork);
979
		queue_delayed_work(kthrotld_workqueue, dwork, delay);
980
		throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
981
				delay, jiffies);
982
	}
983
}
984

985
static void
986
throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
987
{
988
	/* Something wrong if we are trying to remove same group twice */
989
	BUG_ON(hlist_unhashed(&tg->tg_node));
990

991
	hlist_del_init(&tg->tg_node);
992

993
	/*
994
	 * Put the reference taken at the time of creation so that when all
995
	 * queues are gone, group can be destroyed.
996
	 */
997
	throtl_put_tg(tg);
998
	td->nr_undestroyed_grps--;
999
}
1000

1001
static void throtl_release_tgs(struct throtl_data *td)
1002
{
1003
	struct hlist_node *pos, *n;
1004
	struct throtl_grp *tg;
1005

1006
	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
1007
		/*
1008
		 * If cgroup removal path got to blk_group first and removed
1009
		 * it from cgroup list, then it will take care of destroying
1010
		 * cfqg also.
1011
		 */
1012
		if (!blkiocg_del_blkio_group(&tg->blkg))
1013
			throtl_destroy_tg(td, tg);
1014
	}
1015
}
1016

1017
/* Free the memory of @td itself */
static void throtl_td_free(struct throtl_data *td)
{
	kfree(td);
}
1021

1022
/*
1023
 * Blk cgroup controller notification saying that blkio_group object is being
1024
 * delinked as associated cgroup object is going away. That also means that
1025
 * no new IO will come in this group. So get rid of this group as soon as
1026
 * any pending IO in the group is finished.
1027
 *
1028
 * This function is called under rcu_read_lock(). key is the rcu protected
1029
 * pointer. That means "key" is a valid throtl_data pointer as long as we are
1030
 * rcu read lock.
1031
 *
1032
 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
1033
 * it should not be NULL as even if queue was going away, cgroup deltion
1034
 * path got to it first.
1035
 */
1036
void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
1037
{
1038
	unsigned long flags;
1039
	struct throtl_data *td = key;
1040

1041
	spin_lock_irqsave(td->queue->queue_lock, flags);
1042
	throtl_destroy_tg(td, tg_of_blkg(blkg));
1043
	spin_unlock_irqrestore(td->queue->queue_lock, flags);
1044
}
1045

1046
static void throtl_update_blkio_group_common(struct throtl_data *td,
1047
				struct throtl_grp *tg)
1048
{
1049
	xchg(&tg->limits_changed, true);
1050
	xchg(&td->limits_changed, true);
1051
	/* Schedule a work now to process the limit change */
1052
	throtl_schedule_delayed_work(td, 0);
1053
}
1054

1055
/*
1056
 * For all update functions, key should be a valid pointer because these
1057
 * update functions are called under blkcg_lock, that means, blkg is
1058
 * valid and in turn key is valid. queue exit path can not race because
1059
 * of blkcg_lock
1060
 *
1061
 * Can not take queue lock in update functions as queue lock under blkcg_lock
1062
 * is not allowed. Under other paths we take blkcg_lock under queue_lock.
1063
 */
1064
static void throtl_update_blkio_group_read_bps(void *key,
1065
				struct blkio_group *blkg, u64 read_bps)
1066
{
1067
	struct throtl_data *td = key;
1068
	struct throtl_grp *tg = tg_of_blkg(blkg);
1069

1070
	tg->bps[READ] = read_bps;
1071
	throtl_update_blkio_group_common(td, tg);
1072
}
1073

1074
/* Set the WRITE bytes-per-second limit of the group behind @blkg */
static void throtl_update_blkio_group_write_bps(void *key,
				struct blkio_group *blkg, u64 write_bps)
{
	struct throtl_grp *tg = tg_of_blkg(blkg);

	tg->bps[WRITE] = write_bps;
	throtl_update_blkio_group_common(key, tg);
}
1083

1084
static void throtl_update_blkio_group_read_iops(void *key,
1085
			struct blkio_group *blkg, unsigned int read_iops)
1086
{
1087
	struct throtl_data *td = key;
1088
	struct throtl_grp *tg = tg_of_blkg(blkg);
1089

1090
	tg->iops[READ] = read_iops;
1091
	throtl_update_blkio_group_common(td, tg);
1092
}
1093

1094
static void throtl_update_blkio_group_write_iops(void *key,
1095
			struct blkio_group *blkg, unsigned int write_iops)
1096
{
1097
	struct throtl_data *td = key;
1098
	struct throtl_grp *tg = tg_of_blkg(blkg);
1099

1100
	tg->iops[WRITE] = write_iops;
1101
	throtl_update_blkio_group_common(td, tg);
1102
}
1103

1104
static void throtl_shutdown_wq(struct request_queue *q)
1105
{
1106
	struct throtl_data *td = q->td;
1107

1108
	cancel_delayed_work_sync(&td->throtl_work);
1109
}
1110

1111
static struct blkio_policy_type blkio_policy_throtl = {
1112
	.ops = {
1113
		.blkio_unlink_group_fn = throtl_unlink_blkio_group,
1114
		.blkio_update_group_read_bps_fn =
1115
					throtl_update_blkio_group_read_bps,
1116
		.blkio_update_group_write_bps_fn =
1117
					throtl_update_blkio_group_write_bps,
1118
		.blkio_update_group_read_iops_fn =
1119
					throtl_update_blkio_group_read_iops,
1120
		.blkio_update_group_write_iops_fn =
1121
					throtl_update_blkio_group_write_iops,
1122
	},
1123
	.plid = BLKIO_POLICY_THROTL,
1124
};
1125

1126
int blk_throtl_bio(struct request_queue *q, struct bio **biop)
1127
{
1128
	struct throtl_data *td = q->td;
1129
	struct throtl_grp *tg;
1130
	struct bio *bio = *biop;
1131
	bool rw = bio_data_dir(bio), update_disptime = true;
1132
	struct blkio_cgroup *blkcg;
1133

1134
	if (bio->bi_rw & REQ_THROTTLED) {
1135
		bio->bi_rw &= ~REQ_THROTTLED;
1136
		return 0;
1137
	}
1138

1139
	/*
1140
	 * A throtl_grp pointer retrieved under rcu can be used to access
1141
	 * basic fields like stats and io rates. If a group has no rules,
1142
	 * just update the dispatch stats in lockless manner and return.
1143
	 */
1144

1145
	rcu_read_lock();
1146
	blkcg = task_blkio_cgroup(current);
1147
	tg = throtl_find_tg(td, blkcg);
1148
	if (tg) {
1149
		throtl_tg_fill_dev_details(td, tg);
1150

1151
		if (tg_no_rule_group(tg, rw)) {
1152
			blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
1153
					rw, bio->bi_rw & REQ_SYNC);
1154
			rcu_read_unlock();
1155
			return 0;
1156
		}
1157
	}
1158
	rcu_read_unlock();
1159

1160
	/*
1161
	 * Either group has not been allocated yet or it is not an unlimited
1162
	 * IO group
1163
	 */
1164

1165
	spin_lock_irq(q->queue_lock);
1166
	tg = throtl_get_tg(td);
1167

1168
	if (IS_ERR(tg)) {
1169
		if (PTR_ERR(tg)	== -ENODEV) {
1170
			/*
1171
			 * Queue is gone. No queue lock held here.
1172
			 */
1173
			return -ENODEV;
1174
		}
1175
	}
1176

1177
	if (tg->nr_queued[rw]) {
1178
		/*
1179
		 * There is already another bio queued in same dir. No
1180
		 * need to update dispatch time.
1181
		 */
1182
		update_disptime = false;
1183
		goto queue_bio;
1184

1185
	}
1186

1187
	/* Bio is with-in rate limit of group */
1188
	if (tg_may_dispatch(td, tg, bio, NULL)) {
1189
		throtl_charge_bio(tg, bio);
1190

1191
		/*
1192
		 * We need to trim slice even when bios are not being queued
1193
		 * otherwise it might happen that a bio is not queued for
1194
		 * a long time and slice keeps on extending and trim is not
1195
		 * called for a long time. Now if limits are reduced suddenly
1196
		 * we take into account all the IO dispatched so far at new
1197
		 * low rate and * newly queued IO gets a really long dispatch
1198
		 * time.
1199
		 *
1200
		 * So keep on trimming slice even if bio is not queued.
1201
		 */
1202
		throtl_trim_slice(td, tg, rw);
1203
		goto out;
1204
	}
1205

1206
queue_bio:
1207
	throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
1208
			" iodisp=%u iops=%u queued=%d/%d",
1209
			rw == READ ? 'R' : 'W',
1210
			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
1211
			tg->io_disp[rw], tg->iops[rw],
1212
			tg->nr_queued[READ], tg->nr_queued[WRITE]);
1213

1214
	throtl_add_bio_tg(q->td, tg, bio);
1215
	*biop = NULL;
1216

1217
	if (update_disptime) {
1218
		tg_update_disptime(td, tg);
1219
		throtl_schedule_next_dispatch(td);
1220
	}
1221

1222
out:
1223
	spin_unlock_irq(q->queue_lock);
1224
	return 0;
1225
}
1226

1227
int blk_throtl_init(struct request_queue *q)
1228
{
1229
	struct throtl_data *td;
1230
	struct throtl_grp *tg;
1231

1232
	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1233
	if (!td)
1234
		return -ENOMEM;
1235

1236
	INIT_HLIST_HEAD(&td->tg_list);
1237
	td->tg_service_tree = THROTL_RB_ROOT;
1238
	td->limits_changed = false;
1239
	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1240

1241
	/* alloc and Init root group. */
1242
	td->queue = q;
1243
	tg = throtl_alloc_tg(td);
1244

1245
	if (!tg) {
1246
		kfree(td);
1247
		return -ENOMEM;
1248
	}
1249

1250
	td->root_tg = tg;
1251

1252
	rcu_read_lock();
1253
	throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
1254
	rcu_read_unlock();
1255

1256
	/* Attach throtl data to request queue */
1257
	q->td = td;
1258
	return 0;
1259
}
1260

1261
void blk_throtl_exit(struct request_queue *q)
1262
{
1263
	struct throtl_data *td = q->td;
1264
	bool wait = false;
1265

1266
	BUG_ON(!td);
1267

1268
	throtl_shutdown_wq(q);
1269

1270
	spin_lock_irq(q->queue_lock);
1271
	throtl_release_tgs(td);
1272

1273
	/* If there are other groups */
1274
	if (td->nr_undestroyed_grps > 0)
1275
		wait = true;
1276

1277
	spin_unlock_irq(q->queue_lock);
1278

1279
	/*
1280
	 * Wait for tg->blkg->key accessors to exit their grace periods.
1281
	 * Do this wait only if there are other undestroyed groups out
1282
	 * there (other than root group). This can happen if cgroup deletion
1283
	 * path claimed the responsibility of cleaning up a group before
1284
	 * queue cleanup code get to the group.
1285
	 *
1286
	 * Do not call synchronize_rcu() unconditionally as there are drivers
1287
	 * which create/delete request queue hundreds of times during scan/boot
1288
	 * and synchronize_rcu() can take significant time and slow down boot.
1289
	 */
1290
	if (wait)
1291
		synchronize_rcu();
1292

1293
	/*
1294
	 * Just being safe to make sure after previous flush if some body did
1295
	 * update limits through cgroup and another work got queued, cancel
1296
	 * it.
1297
	 */
1298
	throtl_shutdown_wq(q);
1299
	throtl_td_free(td);
1300
}
1301

1302
/*
 * Init hook: create the "kthrotld" workqueue and register the throttle
 * blkio policy. Panics if the workqueue cannot be created; otherwise
 * returns 0.
 */
static int __init throtl_init(void)
{
	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
	if (kthrotld_workqueue == NULL)
		panic("Failed to create kthrotld\n");

	blkio_policy_register(&blkio_policy_throtl);

	return 0;
}
1311

1312
/* Register throtl_init() as this file's initialization function. */
module_init(throtl_init);
1313

1314
Product

Resources

Company