Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/mm/backing-dev.c
26135 views
1
// SPDX-License-Identifier: GPL-2.0-only
2
3
#include <linux/blkdev.h>
4
#include <linux/wait.h>
5
#include <linux/rbtree.h>
6
#include <linux/kthread.h>
7
#include <linux/backing-dev.h>
8
#include <linux/blk-cgroup.h>
9
#include <linux/freezer.h>
10
#include <linux/fs.h>
11
#include <linux/pagemap.h>
12
#include <linux/mm.h>
13
#include <linux/sched/mm.h>
14
#include <linux/sched.h>
15
#include <linux/module.h>
16
#include <linux/writeback.h>
17
#include <linux/device.h>
18
#include <trace/events/writeback.h>
19
#include "internal.h"
20
21
struct backing_dev_info noop_backing_dev_info;
22
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
23
24
static const char *bdi_unknown_name = "(unknown)";
25
26
/*
27
* bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
28
* reader side locking.
29
*/
30
DEFINE_SPINLOCK(bdi_lock);
31
static u64 bdi_id_cursor;
32
static struct rb_root bdi_tree = RB_ROOT;
33
LIST_HEAD(bdi_list);
34
35
/* bdi_wq serves all asynchronous writeback tasks */
36
struct workqueue_struct *bdi_wq;
37
38
#ifdef CONFIG_DEBUG_FS
39
#include <linux/debugfs.h>
40
#include <linux/seq_file.h>
41
42
/*
 * Snapshot of writeback counters shown in the debugfs files.  Filled in by
 * collect_wb_stats(), either for a single wb or summed over all wbs of a bdi.
 */
struct wb_stats {
	unsigned long nr_dirty;		/* inodes on b_dirty */
	unsigned long nr_io;		/* inodes on b_io */
	unsigned long nr_more_io;	/* inodes on b_more_io */
	unsigned long nr_dirty_time;	/* b_dirty_time inodes with I_DIRTY_TIME */
	unsigned long nr_writeback;	/* wb_stat(WB_WRITEBACK) */
	unsigned long nr_reclaimable;	/* wb_stat(WB_RECLAIMABLE) */
	unsigned long nr_dirtied;	/* wb_stat(WB_DIRTIED) */
	unsigned long nr_written;	/* wb_stat(WB_WRITTEN) */
	unsigned long dirty_thresh;	/* input: threshold fed to wb_calc_thresh() */
	unsigned long wb_thresh;	/* accumulated wb_calc_thresh() result */
};
54
55
static struct dentry *bdi_debug_root;
56
57
static void bdi_debug_init(void)
58
{
59
bdi_debug_root = debugfs_create_dir("bdi", NULL);
60
}
61
62
static void collect_wb_stats(struct wb_stats *stats,
63
struct bdi_writeback *wb)
64
{
65
struct inode *inode;
66
67
spin_lock(&wb->list_lock);
68
list_for_each_entry(inode, &wb->b_dirty, i_io_list)
69
stats->nr_dirty++;
70
list_for_each_entry(inode, &wb->b_io, i_io_list)
71
stats->nr_io++;
72
list_for_each_entry(inode, &wb->b_more_io, i_io_list)
73
stats->nr_more_io++;
74
list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
75
if (inode->i_state & I_DIRTY_TIME)
76
stats->nr_dirty_time++;
77
spin_unlock(&wb->list_lock);
78
79
stats->nr_writeback += wb_stat(wb, WB_WRITEBACK);
80
stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE);
81
stats->nr_dirtied += wb_stat(wb, WB_DIRTIED);
82
stats->nr_written += wb_stat(wb, WB_WRITTEN);
83
stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh);
84
}
85
86
#ifdef CONFIG_CGROUP_WRITEBACK
87
static void bdi_collect_stats(struct backing_dev_info *bdi,
88
struct wb_stats *stats)
89
{
90
struct bdi_writeback *wb;
91
92
rcu_read_lock();
93
list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
94
if (!wb_tryget(wb))
95
continue;
96
97
collect_wb_stats(stats, wb);
98
wb_put(wb);
99
}
100
rcu_read_unlock();
101
}
102
#else
103
static void bdi_collect_stats(struct backing_dev_info *bdi,
104
struct wb_stats *stats)
105
{
106
collect_wb_stats(stats, &bdi->wb);
107
}
108
#endif
109
110
static int bdi_debug_stats_show(struct seq_file *m, void *v)
111
{
112
struct backing_dev_info *bdi = m->private;
113
unsigned long background_thresh;
114
unsigned long dirty_thresh;
115
struct wb_stats stats;
116
unsigned long tot_bw;
117
118
global_dirty_limits(&background_thresh, &dirty_thresh);
119
120
memset(&stats, 0, sizeof(stats));
121
stats.dirty_thresh = dirty_thresh;
122
bdi_collect_stats(bdi, &stats);
123
tot_bw = atomic_long_read(&bdi->tot_write_bandwidth);
124
125
seq_printf(m,
126
"BdiWriteback: %10lu kB\n"
127
"BdiReclaimable: %10lu kB\n"
128
"BdiDirtyThresh: %10lu kB\n"
129
"DirtyThresh: %10lu kB\n"
130
"BackgroundThresh: %10lu kB\n"
131
"BdiDirtied: %10lu kB\n"
132
"BdiWritten: %10lu kB\n"
133
"BdiWriteBandwidth: %10lu kBps\n"
134
"b_dirty: %10lu\n"
135
"b_io: %10lu\n"
136
"b_more_io: %10lu\n"
137
"b_dirty_time: %10lu\n"
138
"bdi_list: %10u\n"
139
"state: %10lx\n",
140
K(stats.nr_writeback),
141
K(stats.nr_reclaimable),
142
K(stats.wb_thresh),
143
K(dirty_thresh),
144
K(background_thresh),
145
K(stats.nr_dirtied),
146
K(stats.nr_written),
147
K(tot_bw),
148
stats.nr_dirty,
149
stats.nr_io,
150
stats.nr_more_io,
151
stats.nr_dirty_time,
152
!list_empty(&bdi->bdi_list), bdi->wb.state);
153
154
return 0;
155
}
156
DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);
157
158
static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb,
159
struct wb_stats *stats)
160
{
161
162
seq_printf(m,
163
"WbCgIno: %10lu\n"
164
"WbWriteback: %10lu kB\n"
165
"WbReclaimable: %10lu kB\n"
166
"WbDirtyThresh: %10lu kB\n"
167
"WbDirtied: %10lu kB\n"
168
"WbWritten: %10lu kB\n"
169
"WbWriteBandwidth: %10lu kBps\n"
170
"b_dirty: %10lu\n"
171
"b_io: %10lu\n"
172
"b_more_io: %10lu\n"
173
"b_dirty_time: %10lu\n"
174
"state: %10lx\n\n",
175
#ifdef CONFIG_CGROUP_WRITEBACK
176
cgroup_ino(wb->memcg_css->cgroup),
177
#else
178
1ul,
179
#endif
180
K(stats->nr_writeback),
181
K(stats->nr_reclaimable),
182
K(stats->wb_thresh),
183
K(stats->nr_dirtied),
184
K(stats->nr_written),
185
K(wb->avg_write_bandwidth),
186
stats->nr_dirty,
187
stats->nr_io,
188
stats->nr_more_io,
189
stats->nr_dirty_time,
190
wb->state);
191
}
192
193
static int cgwb_debug_stats_show(struct seq_file *m, void *v)
194
{
195
struct backing_dev_info *bdi = m->private;
196
unsigned long background_thresh;
197
unsigned long dirty_thresh;
198
struct bdi_writeback *wb;
199
200
global_dirty_limits(&background_thresh, &dirty_thresh);
201
202
rcu_read_lock();
203
list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
204
struct wb_stats stats = { .dirty_thresh = dirty_thresh };
205
206
if (!wb_tryget(wb))
207
continue;
208
209
collect_wb_stats(&stats, wb);
210
211
/*
212
* Calculate thresh of wb in writeback cgroup which is min of
213
* thresh in global domain and thresh in cgroup domain. Drop
214
* rcu lock because cgwb_calc_thresh may sleep in
215
* cgroup_rstat_flush. We can do so here because we have a ref.
216
*/
217
if (mem_cgroup_wb_domain(wb)) {
218
rcu_read_unlock();
219
stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
220
rcu_read_lock();
221
}
222
223
wb_stats_show(m, wb, &stats);
224
225
wb_put(wb);
226
}
227
rcu_read_unlock();
228
229
return 0;
230
}
231
DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats);
232
233
static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
234
{
235
bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
236
237
debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
238
&bdi_debug_stats_fops);
239
debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi,
240
&cgwb_debug_stats_fops);
241
}
242
243
static void bdi_debug_unregister(struct backing_dev_info *bdi)
244
{
245
debugfs_remove_recursive(bdi->debug_dir);
246
}
247
#else /* CONFIG_DEBUG_FS */
248
/* No-op replacements used when debugfs support is not configured. */
static inline void bdi_debug_init(void) { }

static inline void bdi_debug_register(struct backing_dev_info *bdi,
				      const char *name) { }

static inline void bdi_debug_unregister(struct backing_dev_info *bdi) { }
258
#endif /* CONFIG_DEBUG_FS */
259
260
/*
 * sysfs store handler for "read_ahead_kb".
 *
 * Parses @buf as a base-10 unsigned long and stores it in bdi->ra_pages
 * after shifting right by (PAGE_SHIFT - 10), i.e. converting kB to pages.
 * Returns @count on success or the kstrtoul() error.
 */
static ssize_t read_ahead_kb_store(struct device *dev,
				   struct device_attribute *attr,
				   const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned long kb;
	ssize_t err = kstrtoul(buf, 10, &kb);

	if (err < 0)
		return err;

	bdi->ra_pages = kb >> (PAGE_SHIFT - 10);
	return count;
}
276
277
#define BDI_SHOW(name, expr) \
278
static ssize_t name##_show(struct device *dev, \
279
struct device_attribute *attr, char *buf) \
280
{ \
281
struct backing_dev_info *bdi = dev_get_drvdata(dev); \
282
\
283
return sysfs_emit(buf, "%lld\n", (long long)expr); \
284
} \
285
static DEVICE_ATTR_RW(name);
286
287
BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
288
289
static ssize_t min_ratio_store(struct device *dev,
290
struct device_attribute *attr, const char *buf, size_t count)
291
{
292
struct backing_dev_info *bdi = dev_get_drvdata(dev);
293
unsigned int ratio;
294
ssize_t ret;
295
296
ret = kstrtouint(buf, 10, &ratio);
297
if (ret < 0)
298
return ret;
299
300
ret = bdi_set_min_ratio(bdi, ratio);
301
if (!ret)
302
ret = count;
303
304
return ret;
305
}
306
BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE)
307
308
static ssize_t min_ratio_fine_store(struct device *dev,
309
struct device_attribute *attr, const char *buf, size_t count)
310
{
311
struct backing_dev_info *bdi = dev_get_drvdata(dev);
312
unsigned int ratio;
313
ssize_t ret;
314
315
ret = kstrtouint(buf, 10, &ratio);
316
if (ret < 0)
317
return ret;
318
319
ret = bdi_set_min_ratio_no_scale(bdi, ratio);
320
if (!ret)
321
ret = count;
322
323
return ret;
324
}
325
BDI_SHOW(min_ratio_fine, bdi->min_ratio)
326
327
static ssize_t max_ratio_store(struct device *dev,
328
struct device_attribute *attr, const char *buf, size_t count)
329
{
330
struct backing_dev_info *bdi = dev_get_drvdata(dev);
331
unsigned int ratio;
332
ssize_t ret;
333
334
ret = kstrtouint(buf, 10, &ratio);
335
if (ret < 0)
336
return ret;
337
338
ret = bdi_set_max_ratio(bdi, ratio);
339
if (!ret)
340
ret = count;
341
342
return ret;
343
}
344
BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE)
345
346
static ssize_t max_ratio_fine_store(struct device *dev,
347
struct device_attribute *attr, const char *buf, size_t count)
348
{
349
struct backing_dev_info *bdi = dev_get_drvdata(dev);
350
unsigned int ratio;
351
ssize_t ret;
352
353
ret = kstrtouint(buf, 10, &ratio);
354
if (ret < 0)
355
return ret;
356
357
ret = bdi_set_max_ratio_no_scale(bdi, ratio);
358
if (!ret)
359
ret = count;
360
361
return ret;
362
}
363
BDI_SHOW(max_ratio_fine, bdi->max_ratio)
364
365
static ssize_t min_bytes_show(struct device *dev,
366
struct device_attribute *attr,
367
char *buf)
368
{
369
struct backing_dev_info *bdi = dev_get_drvdata(dev);
370
371
return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi));
372
}
373
374
/*
 * sysfs store handler for "min_bytes".
 *
 * Parses @buf as a base-10 u64 and passes it to bdi_set_min_bytes().
 * Returns @count on success, otherwise the error from parsing or from the
 * setter.
 */
static ssize_t min_bytes_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	u64 val;
	ssize_t err;

	err = kstrtoull(buf, 10, &val);
	if (err < 0)
		return err;

	err = bdi_set_min_bytes(bdi, val);
	if (err)
		return err;

	return count;
}
static DEVICE_ATTR_RW(min_bytes);
392
393
static ssize_t max_bytes_show(struct device *dev,
394
struct device_attribute *attr,
395
char *buf)
396
{
397
struct backing_dev_info *bdi = dev_get_drvdata(dev);
398
399
return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi));
400
}
401
402
/*
 * sysfs store handler for "max_bytes".
 *
 * Parses @buf as a base-10 u64 and passes it to bdi_set_max_bytes().
 * Returns @count on success, otherwise the error from parsing or from the
 * setter.
 */
static ssize_t max_bytes_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	u64 val;
	ssize_t err;

	err = kstrtoull(buf, 10, &val);
	if (err < 0)
		return err;

	err = bdi_set_max_bytes(bdi, val);
	if (err)
		return err;

	return count;
}
static DEVICE_ATTR_RW(max_bytes);
420
421
/*
 * sysfs show handler for the retired "stable_pages_required" attribute.
 * Warns once per device that the attribute is gone and always reports 0.
 */
static ssize_t stable_pages_required_show(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	const int value = 0;

	dev_warn_once(dev,
		"the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
	return sysfs_emit(buf, "%d\n", value);
}
static DEVICE_ATTR_RO(stable_pages_required);
430
431
/*
 * sysfs store handler for "strict_limit".
 *
 * Parses @buf as a base-10 unsigned int and passes it to
 * bdi_set_strict_limit().  Returns @count on success, otherwise the error
 * from parsing or from the setter.
 */
static ssize_t strict_limit_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int val;
	ssize_t err;

	err = kstrtouint(buf, 10, &val);
	if (err < 0)
		return err;

	err = bdi_set_strict_limit(bdi, val);
	if (err)
		return err;

	return count;
}
448
449
static ssize_t strict_limit_show(struct device *dev,
450
struct device_attribute *attr, char *buf)
451
{
452
struct backing_dev_info *bdi = dev_get_drvdata(dev);
453
454
return sysfs_emit(buf, "%d\n",
455
!!(bdi->capabilities & BDI_CAP_STRICTLIMIT));
456
}
457
static DEVICE_ATTR_RW(strict_limit);
458
459
static struct attribute *bdi_dev_attrs[] = {
460
&dev_attr_read_ahead_kb.attr,
461
&dev_attr_min_ratio.attr,
462
&dev_attr_min_ratio_fine.attr,
463
&dev_attr_max_ratio.attr,
464
&dev_attr_max_ratio_fine.attr,
465
&dev_attr_min_bytes.attr,
466
&dev_attr_max_bytes.attr,
467
&dev_attr_stable_pages_required.attr,
468
&dev_attr_strict_limit.attr,
469
NULL,
470
};
471
ATTRIBUTE_GROUPS(bdi_dev);
472
473
static const struct class bdi_class = {
474
.name = "bdi",
475
.dev_groups = bdi_dev_groups,
476
};
477
478
static __init int bdi_class_init(void)
479
{
480
int ret;
481
482
ret = class_register(&bdi_class);
483
if (ret)
484
return ret;
485
486
bdi_debug_init();
487
488
return 0;
489
}
490
postcore_initcall(bdi_class_init);
491
492
/*
 * Allocate the "writeback" workqueue used for bdi_wq.
 * Returns 0 on success, -ENOMEM if the workqueue could not be allocated.
 */
static int __init default_bdi_init(void)
{
	bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
				 WQ_SYSFS, 0);

	return bdi_wq ? 0 : -ENOMEM;
}
subsys_initcall(default_bdi_init);
501
502
static void wb_update_bandwidth_workfn(struct work_struct *work)
503
{
504
struct bdi_writeback *wb = container_of(to_delayed_work(work),
505
struct bdi_writeback, bw_dwork);
506
507
wb_update_bandwidth(wb);
508
}
509
510
/*
 * Write bandwidth assumed for a freshly initialised wb: 100 MB/s,
 * expressed in pages (1 MB == 1 << (20 - PAGE_SHIFT) pages).
 */
#define INIT_BW (100 << (20 - PAGE_SHIFT))
514
515
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
516
gfp_t gfp)
517
{
518
int err;
519
520
memset(wb, 0, sizeof(*wb));
521
522
wb->bdi = bdi;
523
wb->last_old_flush = jiffies;
524
INIT_LIST_HEAD(&wb->b_dirty);
525
INIT_LIST_HEAD(&wb->b_io);
526
INIT_LIST_HEAD(&wb->b_more_io);
527
INIT_LIST_HEAD(&wb->b_dirty_time);
528
spin_lock_init(&wb->list_lock);
529
530
atomic_set(&wb->writeback_inodes, 0);
531
wb->bw_time_stamp = jiffies;
532
wb->balanced_dirty_ratelimit = INIT_BW;
533
wb->dirty_ratelimit = INIT_BW;
534
wb->write_bandwidth = INIT_BW;
535
wb->avg_write_bandwidth = INIT_BW;
536
537
spin_lock_init(&wb->work_lock);
538
INIT_LIST_HEAD(&wb->work_list);
539
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
540
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
541
542
err = fprop_local_init_percpu(&wb->completions, gfp);
543
if (err)
544
return err;
545
546
err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS);
547
if (err)
548
fprop_local_destroy_percpu(&wb->completions);
549
550
return err;
551
}
552
553
static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);
554
555
/*
556
* Remove bdi from the global list and shutdown any threads we have running
557
*/
558
static void wb_shutdown(struct bdi_writeback *wb)
559
{
560
/* Make sure nobody queues further work */
561
spin_lock_irq(&wb->work_lock);
562
if (!test_and_clear_bit(WB_registered, &wb->state)) {
563
spin_unlock_irq(&wb->work_lock);
564
return;
565
}
566
spin_unlock_irq(&wb->work_lock);
567
568
cgwb_remove_from_bdi_list(wb);
569
/*
570
* Drain work list and shutdown the delayed_work. !WB_registered
571
* tells wb_workfn() that @wb is dying and its work_list needs to
572
* be drained no matter what.
573
*/
574
mod_delayed_work(bdi_wq, &wb->dwork, 0);
575
flush_delayed_work(&wb->dwork);
576
WARN_ON(!list_empty(&wb->work_list));
577
flush_delayed_work(&wb->bw_dwork);
578
}
579
580
static void wb_exit(struct bdi_writeback *wb)
581
{
582
WARN_ON(delayed_work_pending(&wb->dwork));
583
percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS);
584
fprop_local_destroy_percpu(&wb->completions);
585
}
586
587
#ifdef CONFIG_CGROUP_WRITEBACK
588
589
#include <linux/memcontrol.h>
590
591
/*
 * cgwb_lock guards bdi->cgwb_tree, blkcg->cgwb_list, memcg->cgwb_list and
 * offline_cgwbs.  Lookups in bdi->cgwb_tree may additionally rely on RCU.
 */
static DEFINE_SPINLOCK(cgwb_lock);

/* Workqueue on which cgwb release work items run. */
static struct workqueue_struct *cgwb_release_wq;

/* Killed cgwbs, linked through offline_node until they are released. */
static LIST_HEAD(offline_cgwbs);
static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);
601
602
static void cgwb_free_rcu(struct rcu_head *rcu_head)
603
{
604
struct bdi_writeback *wb = container_of(rcu_head,
605
struct bdi_writeback, rcu);
606
607
percpu_ref_exit(&wb->refcnt);
608
kfree(wb);
609
}
610
611
static void cgwb_release_workfn(struct work_struct *work)
612
{
613
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
614
release_work);
615
struct backing_dev_info *bdi = wb->bdi;
616
617
mutex_lock(&wb->bdi->cgwb_release_mutex);
618
wb_shutdown(wb);
619
620
css_put(wb->memcg_css);
621
css_put(wb->blkcg_css);
622
mutex_unlock(&wb->bdi->cgwb_release_mutex);
623
624
/* triggers blkg destruction if no online users left */
625
blkcg_unpin_online(wb->blkcg_css);
626
627
fprop_local_destroy_percpu(&wb->memcg_completions);
628
629
spin_lock_irq(&cgwb_lock);
630
list_del(&wb->offline_node);
631
spin_unlock_irq(&cgwb_lock);
632
633
wb_exit(wb);
634
bdi_put(bdi);
635
WARN_ON_ONCE(!list_empty(&wb->b_attached));
636
call_rcu(&wb->rcu, cgwb_free_rcu);
637
}
638
639
static void cgwb_release(struct percpu_ref *refcnt)
640
{
641
struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
642
refcnt);
643
queue_work(cgwb_release_wq, &wb->release_work);
644
}
645
646
static void cgwb_kill(struct bdi_writeback *wb)
647
{
648
lockdep_assert_held(&cgwb_lock);
649
650
WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
651
list_del(&wb->memcg_node);
652
list_del(&wb->blkcg_node);
653
list_add(&wb->offline_node, &offline_cgwbs);
654
percpu_ref_kill(&wb->refcnt);
655
}
656
657
static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
658
{
659
spin_lock_irq(&cgwb_lock);
660
list_del_rcu(&wb->bdi_node);
661
spin_unlock_irq(&cgwb_lock);
662
}
663
664
/*
 * Create the wb for @memcg_css on @bdi unless a matching one already exists.
 *
 * An existing wb whose blkcg no longer matches is killed and replaced.
 * Returns 0 when a usable wb exists in the tree afterwards (including when
 * another caller inserted one concurrently), -ENOMEM or another
 * initialisation error on allocation failure, and -ENODEV when the bdi is
 * no longer registered or one of the cgroups has gone offline.
 * The caller is expected to look the wb up again after a 0 return.
 */
static int cgwb_create(struct backing_dev_info *bdi,
		       struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(memcg_css);
	struct cgroup_subsys_state *blkcg_css;
	struct list_head *memcg_list, *blkcg_list;
	struct bdi_writeback *wb;
	unsigned long irqflags;
	int ret = 0;

	blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
	memcg_list = &memcg->cgwb_list;
	blkcg_list = blkcg_get_cgwb_list(blkcg_css);

	/* look up again under lock and discard on blkcg mismatch */
	spin_lock_irqsave(&cgwb_lock, irqflags);
	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
	if (wb && wb->blkcg_css != blkcg_css) {
		cgwb_kill(wb);
		wb = NULL;
	}
	spin_unlock_irqrestore(&cgwb_lock, irqflags);
	if (wb)
		goto out_put;

	/* need to create a new one */
	wb = kmalloc(sizeof(*wb), gfp);
	if (!wb) {
		ret = -ENOMEM;
		goto out_put;
	}

	ret = wb_init(wb, bdi, gfp);
	if (ret)
		goto err_free;

	ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
	if (ret)
		goto err_wb_exit;

	ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
	if (ret)
		goto err_ref_exit;

	wb->memcg_css = memcg_css;
	wb->blkcg_css = blkcg_css;
	INIT_LIST_HEAD(&wb->b_attached);
	INIT_WORK(&wb->release_work, cgwb_release_workfn);
	set_bit(WB_registered, &wb->state);
	bdi_get(bdi);

	/*
	 * The root wb determines the registered state of the whole bdi, and
	 * the ->next pointers of the memcg and blkcg cgwb lists tell whether
	 * those cgroups are still online.  Don't link @wb if any is dead.
	 * See wb_memcg_offline() and wb_blkcg_offline().
	 */
	ret = -ENODEV;
	spin_lock_irqsave(&cgwb_lock, irqflags);
	if (test_bit(WB_registered, &bdi->wb.state) &&
	    blkcg_list->next && memcg_list->next) {
		/* we might have raced another instance of this function */
		ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
		if (!ret) {
			list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
			list_add(&wb->memcg_node, memcg_list);
			list_add(&wb->blkcg_node, blkcg_list);
			blkcg_pin_online(blkcg_css);
			css_get(memcg_css);
			css_get(blkcg_css);
		}
	}
	spin_unlock_irqrestore(&cgwb_lock, irqflags);
	if (!ret)
		goto out_put;

	/* losing the insertion race is not an error for the caller */
	if (ret == -EEXIST)
		ret = 0;

	/* unwind in reverse order of construction */
	bdi_put(bdi);
	fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit:
	percpu_ref_exit(&wb->refcnt);
err_wb_exit:
	wb_exit(wb);
err_free:
	kfree(wb);
out_put:
	css_put(blkcg_css);
	return ret;
}
758
759
/**
760
* wb_get_lookup - get wb for a given memcg
761
* @bdi: target bdi
762
* @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
763
*
764
* Try to get the wb for @memcg_css on @bdi. The returned wb has its
765
* refcount incremented.
766
*
767
* This function uses css_get() on @memcg_css and thus expects its refcnt
768
* to be positive on invocation. IOW, rcu_read_lock() protection on
769
* @memcg_css isn't enough. try_get it before calling this function.
770
*
771
* A wb is keyed by its associated memcg. As blkcg implicitly enables
772
* memcg on the default hierarchy, memcg association is guaranteed to be
773
* more specific (equal or descendant to the associated blkcg) and thus can
774
* identify both the memcg and blkcg associations.
775
*
776
* Because the blkcg associated with a memcg may change as blkcg is enabled
777
* and disabled closer to root in the hierarchy, each wb keeps track of
778
* both the memcg and blkcg associated with it and verifies the blkcg on
779
* each lookup. On mismatch, the existing wb is discarded and a new one is
780
* created.
781
*/
782
struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
783
struct cgroup_subsys_state *memcg_css)
784
{
785
struct bdi_writeback *wb;
786
787
if (!memcg_css->parent)
788
return &bdi->wb;
789
790
rcu_read_lock();
791
wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
792
if (wb) {
793
struct cgroup_subsys_state *blkcg_css;
794
795
/* see whether the blkcg association has changed */
796
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
797
if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
798
wb = NULL;
799
css_put(blkcg_css);
800
}
801
rcu_read_unlock();
802
803
return wb;
804
}
805
806
/**
807
* wb_get_create - get wb for a given memcg, create if necessary
808
* @bdi: target bdi
809
* @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
810
* @gfp: allocation mask to use
811
*
812
* Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
813
* create one. See wb_get_lookup() for more details.
814
*/
815
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
816
struct cgroup_subsys_state *memcg_css,
817
gfp_t gfp)
818
{
819
struct bdi_writeback *wb;
820
821
might_alloc(gfp);
822
823
do {
824
wb = wb_get_lookup(bdi, memcg_css);
825
} while (!wb && !cgwb_create(bdi, memcg_css, gfp));
826
827
return wb;
828
}
829
830
static int cgwb_bdi_init(struct backing_dev_info *bdi)
831
{
832
int ret;
833
834
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
835
mutex_init(&bdi->cgwb_release_mutex);
836
init_rwsem(&bdi->wb_switch_rwsem);
837
838
ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);
839
if (!ret) {
840
bdi->wb.memcg_css = &root_mem_cgroup->css;
841
bdi->wb.blkcg_css = blkcg_root_css;
842
}
843
return ret;
844
}
845
846
static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
847
{
848
struct radix_tree_iter iter;
849
void **slot;
850
struct bdi_writeback *wb;
851
852
WARN_ON(test_bit(WB_registered, &bdi->wb.state));
853
854
spin_lock_irq(&cgwb_lock);
855
radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
856
cgwb_kill(*slot);
857
spin_unlock_irq(&cgwb_lock);
858
859
mutex_lock(&bdi->cgwb_release_mutex);
860
spin_lock_irq(&cgwb_lock);
861
while (!list_empty(&bdi->wb_list)) {
862
wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
863
bdi_node);
864
spin_unlock_irq(&cgwb_lock);
865
wb_shutdown(wb);
866
spin_lock_irq(&cgwb_lock);
867
}
868
spin_unlock_irq(&cgwb_lock);
869
mutex_unlock(&bdi->cgwb_release_mutex);
870
}
871
872
/*
873
* cleanup_offline_cgwbs_workfn - try to release dying cgwbs
874
*
875
* Try to release dying cgwbs by switching attached inodes to the nearest
876
* living ancestor's writeback. Processed wbs are placed at the end
877
* of the list to guarantee the forward progress.
878
*/
879
static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
880
{
881
struct bdi_writeback *wb;
882
LIST_HEAD(processed);
883
884
spin_lock_irq(&cgwb_lock);
885
886
while (!list_empty(&offline_cgwbs)) {
887
wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
888
offline_node);
889
list_move(&wb->offline_node, &processed);
890
891
/*
892
* If wb is dirty, cleaning up the writeback by switching
893
* attached inodes will result in an effective removal of any
894
* bandwidth restrictions, which isn't the goal. Instead,
895
* it can be postponed until the next time, when all io
896
* will be likely completed. If in the meantime some inodes
897
* will get re-dirtied, they should be eventually switched to
898
* a new cgwb.
899
*/
900
if (wb_has_dirty_io(wb))
901
continue;
902
903
if (!wb_tryget(wb))
904
continue;
905
906
spin_unlock_irq(&cgwb_lock);
907
while (cleanup_offline_cgwb(wb))
908
cond_resched();
909
spin_lock_irq(&cgwb_lock);
910
911
wb_put(wb);
912
}
913
914
if (!list_empty(&processed))
915
list_splice_tail(&processed, &offline_cgwbs);
916
917
spin_unlock_irq(&cgwb_lock);
918
}
919
920
/**
921
* wb_memcg_offline - kill all wb's associated with a memcg being offlined
922
* @memcg: memcg being offlined
923
*
924
* Also prevents creation of any new wb's associated with @memcg.
925
*/
926
void wb_memcg_offline(struct mem_cgroup *memcg)
927
{
928
struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
929
struct bdi_writeback *wb, *next;
930
931
spin_lock_irq(&cgwb_lock);
932
list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
933
cgwb_kill(wb);
934
memcg_cgwb_list->next = NULL; /* prevent new wb's */
935
spin_unlock_irq(&cgwb_lock);
936
937
queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
938
}
939
940
/**
941
* wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
942
* @css: blkcg being offlined
943
*
944
* Also prevents creation of any new wb's associated with @blkcg.
945
*/
946
void wb_blkcg_offline(struct cgroup_subsys_state *css)
947
{
948
struct bdi_writeback *wb, *next;
949
struct list_head *list = blkcg_get_cgwb_list(css);
950
951
spin_lock_irq(&cgwb_lock);
952
list_for_each_entry_safe(wb, next, list, blkcg_node)
953
cgwb_kill(wb);
954
list->next = NULL; /* prevent new wb's */
955
spin_unlock_irq(&cgwb_lock);
956
}
957
958
static void cgwb_bdi_register(struct backing_dev_info *bdi)
959
{
960
spin_lock_irq(&cgwb_lock);
961
list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
962
spin_unlock_irq(&cgwb_lock);
963
}
964
965
/*
 * Allocate the workqueue used for cgwb release work.
 * Returns 0 on success, -ENOMEM if the allocation failed.
 */
static int __init cgwb_init(void)
{
	/*
	 * There can be many concurrent release work items overwhelming
	 * system_wq.  Put them in a separate wq and limit concurrency.
	 * There's no point in executing many of these in parallel.
	 */
	cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);

	return cgwb_release_wq ? 0 : -ENOMEM;
}
subsys_initcall(cgwb_init);
979
980
#else /* CONFIG_CGROUP_WRITEBACK */
981
982
static int cgwb_bdi_init(struct backing_dev_info *bdi)
983
{
984
return wb_init(&bdi->wb, bdi, GFP_KERNEL);
985
}
986
987
static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }
988
989
static void cgwb_bdi_register(struct backing_dev_info *bdi)
990
{
991
list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
992
}
993
994
static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
995
{
996
list_del_rcu(&wb->bdi_node);
997
}
998
999
#endif /* CONFIG_CGROUP_WRITEBACK */
1000
1001
int bdi_init(struct backing_dev_info *bdi)
1002
{
1003
bdi->dev = NULL;
1004
1005
kref_init(&bdi->refcnt);
1006
bdi->min_ratio = 0;
1007
bdi->max_ratio = 100 * BDI_RATIO_SCALE;
1008
bdi->max_prop_frac = FPROP_FRAC_BASE;
1009
INIT_LIST_HEAD(&bdi->bdi_list);
1010
INIT_LIST_HEAD(&bdi->wb_list);
1011
init_waitqueue_head(&bdi->wb_waitq);
1012
bdi->last_bdp_sleep = jiffies;
1013
1014
return cgwb_bdi_init(bdi);
1015
}
1016
1017
struct backing_dev_info *bdi_alloc(int node_id)
1018
{
1019
struct backing_dev_info *bdi;
1020
1021
bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
1022
if (!bdi)
1023
return NULL;
1024
1025
if (bdi_init(bdi)) {
1026
kfree(bdi);
1027
return NULL;
1028
}
1029
bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
1030
bdi->ra_pages = VM_READAHEAD_PAGES;
1031
bdi->io_pages = VM_READAHEAD_PAGES;
1032
timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
1033
return bdi;
1034
}
1035
EXPORT_SYMBOL(bdi_alloc);
1036
1037
/*
 * Descend bdi_tree looking for @id.  Caller must hold bdi_lock.
 *
 * Returns the link slot for @id: it points at the matching node when one
 * exists, and at the empty (NULL) slot where a node with @id would be
 * linked otherwise.  If @parentp is non-NULL it receives the last node
 * visited (NULL for an empty tree).
 */
static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
{
	struct rb_node **link = &bdi_tree.rb_node;
	struct rb_node *last = NULL;

	lockdep_assert_held(&bdi_lock);

	while (*link) {
		struct backing_dev_info *cur;

		last = *link;
		cur = rb_entry(last, struct backing_dev_info, rb_node);

		if (id < cur->id)
			link = &last->rb_left;
		else if (id > cur->id)
			link = &last->rb_right;
		else
			break;	/* exact match */
	}

	if (parentp)
		*parentp = last;
	return link;
}
1061
1062
/**
1063
* bdi_get_by_id - lookup and get bdi from its id
1064
* @id: bdi id to lookup
1065
*
1066
* Find bdi matching @id and get it. Returns NULL if the matching bdi
1067
* doesn't exist or is already unregistered.
1068
*/
1069
struct backing_dev_info *bdi_get_by_id(u64 id)
1070
{
1071
struct backing_dev_info *bdi = NULL;
1072
struct rb_node **p;
1073
1074
spin_lock_bh(&bdi_lock);
1075
p = bdi_lookup_rb_node(id, NULL);
1076
if (*p) {
1077
bdi = rb_entry(*p, struct backing_dev_info, rb_node);
1078
bdi_get(bdi);
1079
}
1080
spin_unlock_bh(&bdi_lock);
1081
1082
return bdi;
1083
}
1084
1085
/*
 * bdi_register_va - register @bdi under a printf-style name
 * @bdi: bdi to register
 * @fmt: format string for the device name
 * @args: arguments for @fmt
 *
 * Formats the name into bdi->dev_name, creates a device of class "bdi"
 * with that name, links the embedded wb, registers the debugfs entries,
 * marks the wb registered and finally assigns a fresh id and inserts the
 * bdi into bdi_tree and bdi_list.
 *
 * Returns 0 on success or if @bdi already has a device, otherwise the
 * error from device_create().
 */
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
	struct device *dev;
	struct rb_node *parent, **p;

	if (bdi->dev)	/* The driver needs to use separate queues per device */
		return 0;

	vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
	/*
	 * dev_name is already fully formatted.  Pass it as a "%s" argument
	 * rather than as the format itself so that a '%' ending up in the
	 * name is not interpreted a second time.
	 */
	dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, "%s",
			    bdi->dev_name);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	cgwb_bdi_register(bdi);
	bdi->dev = dev;

	bdi_debug_register(bdi, dev_name(dev));
	set_bit(WB_registered, &bdi->wb.state);

	spin_lock_bh(&bdi_lock);

	bdi->id = ++bdi_id_cursor;

	/* the id is new, so the lookup yields the empty slot to link into */
	p = bdi_lookup_rb_node(bdi->id, &parent);
	rb_link_node(&bdi->rb_node, parent, p);
	rb_insert_color(&bdi->rb_node, &bdi_tree);

	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);

	spin_unlock_bh(&bdi_lock);

	trace_writeback_bdi_register(bdi);
	return 0;
}
1119
1120
int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
1121
{
1122
va_list args;
1123
int ret;
1124
1125
va_start(args, fmt);
1126
ret = bdi_register_va(bdi, fmt, args);
1127
va_end(args);
1128
return ret;
1129
}
1130
EXPORT_SYMBOL(bdi_register);
1131
1132
void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
1133
{
1134
WARN_ON_ONCE(bdi->owner);
1135
bdi->owner = owner;
1136
get_device(owner);
1137
}
1138
1139
/*
1140
* Remove bdi from bdi_list, and ensure that it is no longer visible
1141
*/
1142
static void bdi_remove_from_list(struct backing_dev_info *bdi)
1143
{
1144
spin_lock_bh(&bdi_lock);
1145
rb_erase(&bdi->rb_node, &bdi_tree);
1146
list_del_rcu(&bdi->bdi_list);
1147
spin_unlock_bh(&bdi_lock);
1148
1149
synchronize_rcu_expedited();
1150
}
1151
1152
void bdi_unregister(struct backing_dev_info *bdi)
1153
{
1154
timer_delete_sync(&bdi->laptop_mode_wb_timer);
1155
1156
/* make sure nobody finds us on the bdi_list anymore */
1157
bdi_remove_from_list(bdi);
1158
wb_shutdown(&bdi->wb);
1159
cgwb_bdi_unregister(bdi);
1160
1161
/*
1162
* If this BDI's min ratio has been set, use bdi_set_min_ratio() to
1163
* update the global bdi_min_ratio.
1164
*/
1165
if (bdi->min_ratio)
1166
bdi_set_min_ratio(bdi, 0);
1167
1168
if (bdi->dev) {
1169
bdi_debug_unregister(bdi);
1170
device_unregister(bdi->dev);
1171
bdi->dev = NULL;
1172
}
1173
1174
if (bdi->owner) {
1175
put_device(bdi->owner);
1176
bdi->owner = NULL;
1177
}
1178
}
1179
EXPORT_SYMBOL(bdi_unregister);
1180
1181
static void release_bdi(struct kref *ref)
1182
{
1183
struct backing_dev_info *bdi =
1184
container_of(ref, struct backing_dev_info, refcnt);
1185
1186
WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state));
1187
WARN_ON_ONCE(bdi->dev);
1188
wb_exit(&bdi->wb);
1189
kfree(bdi);
1190
}
1191
1192
void bdi_put(struct backing_dev_info *bdi)
1193
{
1194
kref_put(&bdi->refcnt, release_bdi);
1195
}
1196
EXPORT_SYMBOL(bdi_put);
1197
1198
struct backing_dev_info *inode_to_bdi(struct inode *inode)
1199
{
1200
struct super_block *sb;
1201
1202
if (!inode)
1203
return &noop_backing_dev_info;
1204
1205
sb = inode->i_sb;
1206
#ifdef CONFIG_BLOCK
1207
if (sb_is_blkdev_sb(sb))
1208
return I_BDEV(inode)->bd_disk->bdi;
1209
#endif
1210
return sb->s_bdi;
1211
}
1212
EXPORT_SYMBOL(inode_to_bdi);
1213
1214
const char *bdi_dev_name(struct backing_dev_info *bdi)
1215
{
1216
if (!bdi || !bdi->dev)
1217
return bdi_unknown_name;
1218
return bdi->dev_name;
1219
}
1220
EXPORT_SYMBOL_GPL(bdi_dev_name);
1221
1222