GitHub Repository: awilliam/linux-vfio
Path: blob/master/fs/buffer.c
/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
 */

/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
 *
 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <[email protected]>
 */

#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
#include <linux/cleancache.h>

static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);

#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)

inline void
init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
{
        bh->b_end_io = handler;
        bh->b_private = private;
}
EXPORT_SYMBOL(init_buffer);

static int sleep_on_buffer(void *word)
{
        io_schedule();
        return 0;
}

void __lock_buffer(struct buffer_head *bh)
{
        wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
                                                        TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);

void unlock_buffer(struct buffer_head *bh)
{
        clear_bit_unlock(BH_Lock, &bh->b_state);
        smp_mb__after_clear_bit();
        wake_up_bit(&bh->b_state, BH_Lock);
}
EXPORT_SYMBOL(unlock_buffer);

/*
 * Block until a buffer comes unlocked.  This doesn't stop it
 * from becoming locked again - you have to lock it yourself
 * if you want to preserve its state.
 */
void __wait_on_buffer(struct buffer_head *bh)
{
        wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__wait_on_buffer);
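
/*
 * Editorial example (not part of the original file): the usual caller
 * pattern for the locking helpers above. lock_buffer() and
 * wait_on_buffer() are the wrappers from <linux/buffer_head.h> that
 * fall back to __lock_buffer()/__wait_on_buffer() on contention.
 * The function name is hypothetical; kept under #if 0 as a sketch.
 */
#if 0   /* illustrative sketch only */
static void example_update_buffer(struct buffer_head *bh, const void *src)
{
        lock_buffer(bh);                /* serialize against in-flight I/O */
        memcpy(bh->b_data, src, bh->b_size);
        mark_buffer_dirty(bh);          /* queue for writeback */
        unlock_buffer(bh);              /* wakes waiters via wake_up_bit() */
}
#endif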

static void
__clear_page_buffers(struct page *page)
{
        ClearPagePrivate(page);
        set_page_private(page, 0);
        page_cache_release(page);
}


static int quiet_error(struct buffer_head *bh)
{
        if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
                return 0;
        return 1;
}


static void buffer_io_error(struct buffer_head *bh)
{
        char b[BDEVNAME_SIZE];
        printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
                        bdevname(bh->b_bdev, b),
                        (unsigned long long)bh->b_blocknr);
}

/*
 * End-of-IO handler helper function which does not touch the bh after
 * unlocking it.
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                /* This happens, due to failed READA attempts. */
                clear_buffer_uptodate(bh);
        }
        unlock_buffer(bh);
}

/*
 * Default synchronous end-of-IO handler.  Just mark it up-to-date and
 * unlock the buffer.  This is what ll_rw_block uses too.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
        __end_buffer_read_notouch(bh, uptodate);
        put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);

void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
        char b[BDEVNAME_SIZE];

        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                if (!quiet_error(bh)) {
                        buffer_io_error(bh);
                        printk(KERN_WARNING "lost page write due to "
                                        "I/O error on %s\n",
                                       bdevname(bh->b_bdev, b));
                }
                set_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
        }
        unlock_buffer(bh);
        put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);
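
/*
 * Editorial example (not part of the original file): submitting one
 * buffer synchronously with end_buffer_write_sync() as the completion
 * handler, roughly what sync_dirty_buffer() does. Assumes the caller
 * holds a reference on bh; names are illustrative.
 */
#if 0   /* illustrative sketch only */
static int example_write_buffer_sync(struct buffer_head *bh)
{
        lock_buffer(bh);
        if (test_clear_buffer_dirty(bh)) {
                get_bh(bh);                     /* dropped by the end_io handler */
                bh->b_end_io = end_buffer_write_sync;
                submit_bh(WRITE, bh);
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh))
                        return -EIO;
        } else {
                unlock_buffer(bh);
        }
        return 0;
}
#endif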

/*
 * Various filesystems appear to want __find_get_block to be non-blocking.
 * But it's the page lock which protects the buffers.  To get around this,
 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 * private_lock.
 *
 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 * may be quite high.  This code could TryLock the page, and if that
 * succeeds, there is no need to take private_lock.  (But if
 * private_lock is contended then so is mapping->tree_lock).
 */
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
{
        struct inode *bd_inode = bdev->bd_inode;
        struct address_space *bd_mapping = bd_inode->i_mapping;
        struct buffer_head *ret = NULL;
        pgoff_t index;
        struct buffer_head *bh;
        struct buffer_head *head;
        struct page *page;
        int all_mapped = 1;

        index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
        page = find_get_page(bd_mapping, index);
        if (!page)
                goto out;

        spin_lock(&bd_mapping->private_lock);
        if (!page_has_buffers(page))
                goto out_unlock;
        head = page_buffers(page);
        bh = head;
        do {
                if (!buffer_mapped(bh))
                        all_mapped = 0;
                else if (bh->b_blocknr == block) {
                        ret = bh;
                        get_bh(bh);
                        goto out_unlock;
                }
                bh = bh->b_this_page;
        } while (bh != head);

        /* we might be here because some of the buffers on this page are
         * not mapped.  This is due to various races between
         * file io on the block device and getblk.  It gets dealt with
         * elsewhere, don't buffer_error if we had some unmapped buffers
         */
        if (all_mapped) {
                printk("__find_get_block_slow() failed. "
                        "block=%llu, b_blocknr=%llu\n",
                        (unsigned long long)block,
                        (unsigned long long)bh->b_blocknr);
                printk("b_state=0x%08lx, b_size=%zu\n",
                        bh->b_state, bh->b_size);
                printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
        }
out_unlock:
        spin_unlock(&bd_mapping->private_lock);
        page_cache_release(page);
out:
        return ret;
}

/* If invalidate_buffers() will trash dirty buffers, it means some kind
   of fs corruption is going on.  Trashing dirty data always implies losing
   information that was supposed to be just stored on the physical layer
   by the user.

   Thus invalidate_buffers in general usage is not allowed to trash
   dirty buffers.  For example, ioctl(BLKFLSBUF) expects dirty data to
   be preserved.  These buffers are simply skipped.

   We also skip buffers which are still in use.  For example this can
   happen if a userspace program is reading the block device.

   NOTE: in the case where the user removed a removable-media disk even if
   there's still dirty data not synced on disk (due to a bug in the device
   driver or to a user error), by not destroying the dirty buffers we could
   generate corruption also on the next media inserted, thus a parameter is
   necessary to handle this case in the safest way possible (trying
   not to corrupt the newly inserted disk with data belonging to
   the old, now-corrupted disk).  Also for the ramdisk, the natural thing
   to do in order to release the ramdisk memory is to destroy dirty buffers.

   These are two special cases.  Normal usage implies that the device driver
   issues a sync on the device (without waiting for I/O completion) and
   then an invalidate_buffers call that doesn't trash dirty buffers.

   For handling cache coherency with the blkdev pagecache the 'update' case
   has been introduced.  It is needed to re-read from disk any pinned
   buffer.  NOTE: re-reading from disk is destructive so we can do it only
   when we assume nobody is changing the buffercache under our I/O and when
   we think the disk contains more recent information than the buffercache.
   The update == 1 pass marks the buffers we need to update, the update == 2
   pass does the actual I/O. */
void invalidate_bdev(struct block_device *bdev)
{
        struct address_space *mapping = bdev->bd_inode->i_mapping;

        if (mapping->nrpages == 0)
                return;

        invalidate_bh_lrus();
        lru_add_drain_all();    /* make sure all lru add caches are flushed */
        invalidate_mapping_pages(mapping, 0, -1);
        /* 99% of the time, we don't need to flush the cleancache on the bdev.
         * But, for the strange corners, lets be cautious
         */
        cleancache_flush_inode(mapping);
}
EXPORT_SYMBOL(invalidate_bdev);
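
/*
 * Editorial example (not part of the original file): a driver-side
 * media-change path might drop all cached state for the device before
 * the new medium is used; hedged sketch, names are illustrative.
 */
#if 0   /* illustrative sketch only */
static void example_media_changed(struct block_device *bdev)
{
        /* discard clean cached pages and per-cpu bh references */
        invalidate_bdev(bdev);
}
#endif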

/*
 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
 */
static void free_more_memory(void)
{
        struct zone *zone;
        int nid;

        wakeup_flusher_threads(1024);
        yield();

        for_each_online_node(nid) {
                (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
                                                gfp_zone(GFP_NOFS), NULL,
                                                &zone);
                if (zone)
                        try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
                                                GFP_NOFS, NULL);
        }
}

/*
 * I/O completion handler for block_read_full_page() - pages
 * which come unlocked at the end of I/O.
 */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct page *page;
        int page_uptodate = 1;

        BUG_ON(!buffer_async_read(bh));

        page = bh->b_page;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                clear_buffer_uptodate(bh);
                if (!quiet_error(bh))
                        buffer_io_error(bh);
                SetPageError(page);
        }

        /*
         * Be _very_ careful from here on. Bad things can happen if
         * two buffer heads end IO at almost the same time and both
         * decide that the page is now completely done.
         */
        first = page_buffers(page);
        local_irq_save(flags);
        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
        clear_buffer_async_read(bh);
        unlock_buffer(bh);
        tmp = bh;
        do {
                if (!buffer_uptodate(tmp))
                        page_uptodate = 0;
                if (buffer_async_read(tmp)) {
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;
                }
                tmp = tmp->b_this_page;
        } while (tmp != bh);
        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
        local_irq_restore(flags);

        /*
         * If none of the buffers had errors and they are all
         * uptodate then we can set the page uptodate.
         */
        if (page_uptodate && !PageError(page))
                SetPageUptodate(page);
        unlock_page(page);
        return;

still_busy:
        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
        local_irq_restore(flags);
        return;
}

/*
 * Completion handler for block_write_full_page() - pages which are unlocked
 * during I/O, and which have PageWriteback cleared upon I/O completion.
 */
void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
        char b[BDEVNAME_SIZE];
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct page *page;

        BUG_ON(!buffer_async_write(bh));

        page = bh->b_page;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                if (!quiet_error(bh)) {
                        buffer_io_error(bh);
                        printk(KERN_WARNING "lost page write due to "
                                        "I/O error on %s\n",
                               bdevname(bh->b_bdev, b));
                }
                set_bit(AS_EIO, &page->mapping->flags);
                set_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
                SetPageError(page);
        }

        first = page_buffers(page);
        local_irq_save(flags);
        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);

        clear_buffer_async_write(bh);
        unlock_buffer(bh);
        tmp = bh->b_this_page;
        while (tmp != bh) {
                if (buffer_async_write(tmp)) {
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;
                }
                tmp = tmp->b_this_page;
        }
        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
        local_irq_restore(flags);
        end_page_writeback(page);
        return;

still_busy:
        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
        local_irq_restore(flags);
        return;
}
EXPORT_SYMBOL(end_buffer_async_write);

/*
 * If a page's buffers are under async read-in (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed.  This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone from starting new async I/O reads against
 * any of the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
        bh->b_end_io = end_buffer_async_read;
        set_buffer_async_read(bh);
}

static void mark_buffer_async_write_endio(struct buffer_head *bh,
                                          bh_end_io_t *handler)
{
        bh->b_end_io = handler;
        set_buffer_async_write(bh);
}

void mark_buffer_async_write(struct buffer_head *bh)
{
        mark_buffer_async_write_endio(bh, end_buffer_async_write);
}
EXPORT_SYMBOL(mark_buffer_async_write);


/*
 * fs/buffer.c contains helper functions for buffer-backed address space's
 * fsync functions.  A common requirement for buffer-based filesystems is
 * that certain data from the backing blockdev needs to be written out for
 * a successful fsync().  For example, ext2 indirect blocks need to be
 * written back and waited upon before fsync() returns.
 *
 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 * management of a list of dependent buffers at ->i_mapping->private_list.
 *
 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 * from their controlling inode's queue when they are being freed.  But
 * try_to_free_buffers() will be operating against the *blockdev* mapping
 * at the time, not against the S_ISREG file which depends on those buffers.
 * So the locking for private_list is via the private_lock in the address_space
 * which backs the buffers.  Which is different from the address_space
 * against which the buffers are listed.  So for a particular address_space,
 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 * mapping->private_list will always be protected by the backing blockdev's
 * ->private_lock.
 *
 * Which introduces a requirement: all buffers on an address_space's
 * ->private_list must be from the same address_space: the blockdev's.
 *
 * address_spaces which do not place buffers at ->private_list via these
 * utility functions are free to use private_lock and private_list for
 * whatever they want.  The only requirement is that list_empty(private_list)
 * be true at clear_inode() time.
 *
 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 * filesystems should do that.  invalidate_inode_buffers() should just go
 * BUG_ON(!list_empty).
 *
 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 * take an address_space, not an inode.  And it should be called
 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 * queued up.
 *
 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 * list if it is already on a list.  Because if the buffer is on a list,
 * it *must* already be on the right one.  If not, the filesystem is being
 * silly.  This will save a ton of locking.  But first we have to ensure
 * that buffers are taken *off* the old inode's list when they are freed
 * (presumably in truncate).  That requires careful auditing of all
 * filesystems (do it inside bforget()).  It could also be done by bringing
 * b_inode back.
 */

/*
 * The buffer's backing address_space's private_lock must be held
 */
static void __remove_assoc_queue(struct buffer_head *bh)
{
        list_del_init(&bh->b_assoc_buffers);
        WARN_ON(!bh->b_assoc_map);
        if (buffer_write_io_error(bh))
                set_bit(AS_EIO, &bh->b_assoc_map->flags);
        bh->b_assoc_map = NULL;
}

int inode_has_buffers(struct inode *inode)
{
        return !list_empty(&inode->i_data.private_list);
}

/*
 * osync is designed to support O_SYNC io.  It waits synchronously for
 * all already-submitted IO to complete, but does not queue any new
 * writes to the disk.
 *
 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 * you dirty the buffers, and then use osync_inode_buffers to wait for
 * completion.  Any other dirty buffers which are not yet queued for
 * write will not be flushed to disk by the osync.
 */
static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
{
        struct buffer_head *bh;
        struct list_head *p;
        int err = 0;

        spin_lock(lock);
repeat:
        list_for_each_prev(p, list) {
                bh = BH_ENTRY(p);
                if (buffer_locked(bh)) {
                        get_bh(bh);
                        spin_unlock(lock);
                        wait_on_buffer(bh);
                        if (!buffer_uptodate(bh))
                                err = -EIO;
                        brelse(bh);
                        spin_lock(lock);
                        goto repeat;
                }
        }
        spin_unlock(lock);
        return err;
}

static void do_thaw_one(struct super_block *sb, void *unused)
{
        char b[BDEVNAME_SIZE];
        while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
                printk(KERN_WARNING "Emergency Thaw on %s\n",
                       bdevname(sb->s_bdev, b));
}

static void do_thaw_all(struct work_struct *work)
{
        iterate_supers(do_thaw_one, NULL);
        kfree(work);
        printk(KERN_WARNING "Emergency Thaw complete\n");
}

/**
 * emergency_thaw_all -- forcibly thaw every frozen filesystem
 *
 * Used for emergency unfreeze of all filesystems via SysRq
 */
void emergency_thaw_all(void)
{
        struct work_struct *work;

        work = kmalloc(sizeof(*work), GFP_ATOMIC);
        if (work) {
                INIT_WORK(work, do_thaw_all);
                schedule_work(work);
        }
}

/**
 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 * @mapping: the mapping which wants those buffers written
 *
 * Starts I/O against the buffers at mapping->private_list, and waits upon
 * that I/O.
 *
 * Basically, this is a convenience function for fsync().
 * @mapping is a file or directory which needs those buffers to be written for
 * a successful fsync().
 */
int sync_mapping_buffers(struct address_space *mapping)
{
        struct address_space *buffer_mapping = mapping->assoc_mapping;

        if (buffer_mapping == NULL || list_empty(&mapping->private_list))
                return 0;

        return fsync_buffers_list(&buffer_mapping->private_lock,
                                        &mapping->private_list);
}
EXPORT_SYMBOL(sync_mapping_buffers);
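
/*
 * Editorial example (not part of the original file): a filesystem's
 * ->fsync() typically pairs data writeback with sync_mapping_buffers()
 * so queued metadata buffers (e.g. indirect blocks) reach disk too.
 * Names and error handling are illustrative.
 */
#if 0   /* illustrative sketch only */
static int example_fsync(struct file *file, loff_t start, loff_t end,
                         int datasync)
{
        struct inode *inode = file->f_mapping->host;
        int err, ret;

        ret = filemap_write_and_wait_range(inode->i_mapping, start, end);

        /* flush buffers queued on ->private_list by mark_buffer_dirty_inode() */
        err = sync_mapping_buffers(inode->i_mapping);
        if (!ret)
                ret = err;
        return ret;
}
#endif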

/*
 * Called when we've recently written block `bblock', and it is known that
 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 */
void write_boundary_block(struct block_device *bdev,
                        sector_t bblock, unsigned blocksize)
{
        struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
        if (bh) {
                if (buffer_dirty(bh))
                        ll_rw_block(WRITE, 1, &bh);
                put_bh(bh);
        }
}

void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
{
        struct address_space *mapping = inode->i_mapping;
        struct address_space *buffer_mapping = bh->b_page->mapping;

        mark_buffer_dirty(bh);
        if (!mapping->assoc_mapping) {
                mapping->assoc_mapping = buffer_mapping;
        } else {
                BUG_ON(mapping->assoc_mapping != buffer_mapping);
        }
        if (!bh->b_assoc_map) {
                spin_lock(&buffer_mapping->private_lock);
                list_move_tail(&bh->b_assoc_buffers,
                                &mapping->private_list);
                bh->b_assoc_map = mapping;
                spin_unlock(&buffer_mapping->private_lock);
        }
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);
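
/*
 * Editorial example (not part of the original file): after updating a
 * metadata block that a later fsync() must find, a filesystem queues it
 * against the owning inode. The indirect-block scenario and all names
 * are illustrative.
 */
#if 0   /* illustrative sketch only */
static void example_dirty_indirect(struct inode *inode,
                                   struct buffer_head *bh,
                                   __le32 *slot, u32 new_block)
{
        lock_buffer(bh);
        *slot = cpu_to_le32(new_block); /* update the on-disk pointer */
        unlock_buffer(bh);
        /* dirties bh and links it on inode->i_mapping->private_list */
        mark_buffer_dirty_inode(bh, inode);
}
#endif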

/*
 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
 * dirty.
 *
 * If warn is true, then emit a warning if the page is not uptodate and has
 * not been truncated.
 */
static void __set_page_dirty(struct page *page,
                struct address_space *mapping, int warn)
{
        spin_lock_irq(&mapping->tree_lock);
        if (page->mapping) {    /* Race with truncate? */
                WARN_ON_ONCE(warn && !PageUptodate(page));
                account_page_dirtied(page, mapping);
                radix_tree_tag_set(&mapping->page_tree,
                                page_index(page), PAGECACHE_TAG_DIRTY);
        }
        spin_unlock_irq(&mapping->tree_lock);
        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}

/*
 * Add a page to the dirty page list.
 *
 * It is a sad fact of life that this function is called from several places
 * deeply under spinlocking.  It may not sleep.
 *
 * If the page has buffers, the uptodate buffers are set dirty, to preserve
 * dirty-state coherency between the page and the buffers.  If the page does
 * not have buffers then when they are later attached they will all be set
 * dirty.
 *
 * The buffers are dirtied before the page is dirtied.  There's a small race
 * window in which a writepage caller may see the page cleanness but not the
 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 * before the buffers, a concurrent writepage caller could clear the page dirty
 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 * page on the dirty page list.
 *
 * We use private_lock to lock against try_to_free_buffers while using the
 * page's buffer list.  Also use this to protect against clean buffers being
 * added to the page after it was set dirty.
 *
 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 * address_space though.
 */
int __set_page_dirty_buffers(struct page *page)
{
        int newly_dirty;
        struct address_space *mapping = page_mapping(page);

        if (unlikely(!mapping))
                return !TestSetPageDirty(page);

        spin_lock(&mapping->private_lock);
        if (page_has_buffers(page)) {
                struct buffer_head *head = page_buffers(page);
                struct buffer_head *bh = head;

                do {
                        set_buffer_dirty(bh);
                        bh = bh->b_this_page;
                } while (bh != head);
        }
        newly_dirty = !TestSetPageDirty(page);
        spin_unlock(&mapping->private_lock);

        if (newly_dirty)
                __set_page_dirty(page, mapping, 1);
        return newly_dirty;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);
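
/*
 * Editorial note (not part of the original file):
 * __set_page_dirty_buffers() is the usual ->set_page_dirty for
 * buffer-backed mappings; a filesystem that supplies its own
 * address_space_operations can reference it directly. The other
 * methods named here are hypothetical.
 */
#if 0   /* illustrative sketch only */
static const struct address_space_operations example_aops = {
        .readpage       = example_readpage,     /* hypothetical */
        .writepage      = example_writepage,    /* hypothetical */
        .set_page_dirty = __set_page_dirty_buffers,
};
#endif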

/*
 * Write out and wait upon a list of buffers.
 *
 * We have conflicting pressures: we want to make sure that all
 * initially dirty buffers get waited on, but that any subsequently
 * dirtied buffers don't.  After all, we don't want fsync to last
 * forever if somebody is actively writing to the file.
 *
 * Do this in two main stages: first we copy dirty buffers to a
 * temporary inode list, queueing the writes as we go.  Then we clean
 * up, waiting for those writes to complete.
 *
 * During this second stage, any subsequent updates to the file may end
 * up refiling the buffer on the original inode's dirty list again, so
 * there is a chance we will end up with a buffer queued for write but
 * not yet completed on that list.  So, as a final cleanup we go through
 * the osync code to catch these locked, dirty buffers without requeuing
 * any newly dirty buffers for write.
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
        struct buffer_head *bh;
        struct list_head tmp;
        struct address_space *mapping;
        int err = 0, err2;
        struct blk_plug plug;

        INIT_LIST_HEAD(&tmp);
        blk_start_plug(&plug);

        spin_lock(lock);
        while (!list_empty(list)) {
                bh = BH_ENTRY(list->next);
                mapping = bh->b_assoc_map;
                __remove_assoc_queue(bh);
                /* Avoid race with mark_buffer_dirty_inode() which does
                 * a lockless check and we rely on seeing the dirty bit */
                smp_mb();
                if (buffer_dirty(bh) || buffer_locked(bh)) {
                        list_add(&bh->b_assoc_buffers, &tmp);
                        bh->b_assoc_map = mapping;
                        if (buffer_dirty(bh)) {
                                get_bh(bh);
                                spin_unlock(lock);
                                /*
                                 * Ensure any pending I/O completes so that
                                 * write_dirty_buffer() actually writes the
                                 * current contents - it is a noop if I/O is
                                 * still in flight on potentially older
                                 * contents.
                                 */
                                write_dirty_buffer(bh, WRITE_SYNC);

                                /*
                                 * Kick off IO for the previous mapping. Note
                                 * that we will not run the very last mapping,
                                 * wait_on_buffer() will do that for us
                                 * through sync_buffer().
                                 */
                                brelse(bh);
                                spin_lock(lock);
                        }
                }
        }

        spin_unlock(lock);
        blk_finish_plug(&plug);
        spin_lock(lock);

        while (!list_empty(&tmp)) {
                bh = BH_ENTRY(tmp.prev);
                get_bh(bh);
                mapping = bh->b_assoc_map;
                __remove_assoc_queue(bh);
                /* Avoid race with mark_buffer_dirty_inode() which does
                 * a lockless check and we rely on seeing the dirty bit */
                smp_mb();
                if (buffer_dirty(bh)) {
                        list_add(&bh->b_assoc_buffers,
                                 &mapping->private_list);
                        bh->b_assoc_map = mapping;
                }
                spin_unlock(lock);
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh))
                        err = -EIO;
                brelse(bh);
                spin_lock(lock);
        }

        spin_unlock(lock);
        err2 = osync_buffers_list(lock, list);
        if (err)
                return err;
        else
                return err2;
}

/*
 * Invalidate any and all dirty buffers on a given inode.  We are
 * probably unmounting the fs, but that doesn't mean we have already
 * done a sync().  Just drop the buffers from the inode list.
 *
 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 * assumes that all the buffers are against the blockdev.  Not true
 * for reiserfs.
 */
void invalidate_inode_buffers(struct inode *inode)
{
        if (inode_has_buffers(inode)) {
                struct address_space *mapping = &inode->i_data;
                struct list_head *list = &mapping->private_list;
                struct address_space *buffer_mapping = mapping->assoc_mapping;

                spin_lock(&buffer_mapping->private_lock);
                while (!list_empty(list))
                        __remove_assoc_queue(BH_ENTRY(list->next));
                spin_unlock(&buffer_mapping->private_lock);
        }
}
EXPORT_SYMBOL(invalidate_inode_buffers);

/*
 * Remove any clean buffers from the inode's buffer list.  This is called
 * when we're trying to free the inode itself.  Those buffers can pin it.
 *
 * Returns true if all buffers were removed.
 */
int remove_inode_buffers(struct inode *inode)
{
        int ret = 1;

        if (inode_has_buffers(inode)) {
                struct address_space *mapping = &inode->i_data;
                struct list_head *list = &mapping->private_list;
                struct address_space *buffer_mapping = mapping->assoc_mapping;

                spin_lock(&buffer_mapping->private_lock);
                while (!list_empty(list)) {
                        struct buffer_head *bh = BH_ENTRY(list->next);
                        if (buffer_dirty(bh)) {
                                ret = 0;
                                break;
                        }
                        __remove_assoc_queue(bh);
                }
                spin_unlock(&buffer_mapping->private_lock);
        }
        return ret;
}

/*
 * Create the appropriate buffers when given a page for a data area and
 * the size of each buffer.  Use the bh->b_this_page linked list to
 * follow the buffers created.  Return NULL if unable to create more
 * buffers.
 *
 * The retry flag is used to differentiate async IO (paging, swapping)
 * which may not fail from ordinary buffer allocations.
 */
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
                int retry)
{
        struct buffer_head *bh, *head;
        long offset;

try_again:
        head = NULL;
        offset = PAGE_SIZE;
        while ((offset -= size) >= 0) {
                bh = alloc_buffer_head(GFP_NOFS);
                if (!bh)
                        goto no_grow;

                bh->b_bdev = NULL;
                bh->b_this_page = head;
                bh->b_blocknr = -1;
                head = bh;

                bh->b_state = 0;
                atomic_set(&bh->b_count, 0);
                bh->b_size = size;

                /* Link the buffer to its page */
                set_bh_page(bh, page, offset);

                init_buffer(bh, NULL, NULL);
        }
        return head;
/*
 * In case anything failed, we just free everything we got.
 */
no_grow:
        if (head) {
                do {
                        bh = head;
                        head = head->b_this_page;
                        free_buffer_head(bh);
                } while (head);
        }

        /*
         * Return failure for non-async IO requests.  Async IO requests
         * are not allowed to fail, so we have to wait until buffer heads
         * become available.  But we don't want tasks sleeping with
         * partially complete buffers, so all were released above.
         */
        if (!retry)
                return NULL;

        /* We're _really_ low on memory. Now we just
         * wait for old buffer heads to become free due to
         * finishing IO.  Since this is an async request and
         * the reserve list is empty, we're sure there are
         * async buffer heads in use.
         */
        free_more_memory();
        goto try_again;
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);
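
/*
 * Editorial example (not part of the original file): a caller that
 * wants a page fully populated with buffer_heads, looping on memory
 * pressure. create_empty_buffers() below is the in-tree user of this
 * pattern; the wrapper name is hypothetical.
 */
#if 0   /* illustrative sketch only */
static struct buffer_head *example_make_buffers(struct page *page,
                                                unsigned long blocksize)
{
        /* retry != 0: alloc_page_buffers() loops until allocation succeeds */
        return alloc_page_buffers(page, blocksize, 1);
}
#endif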

static inline void
link_dev_buffers(struct page *page, struct buffer_head *head)
{
        struct buffer_head *bh, *tail;

        bh = head;
        do {
                tail = bh;
                bh = bh->b_this_page;
        } while (bh);
        tail->b_this_page = head;
        attach_page_buffers(page, head);
}

/*
 * Initialise the state of a blockdev page's buffers.
 */
static void
init_page_buffers(struct page *page, struct block_device *bdev,
                        sector_t block, int size)
{
        struct buffer_head *head = page_buffers(page);
        struct buffer_head *bh = head;
        int uptodate = PageUptodate(page);

        do {
                if (!buffer_mapped(bh)) {
                        init_buffer(bh, NULL, NULL);
                        bh->b_bdev = bdev;
                        bh->b_blocknr = block;
                        if (uptodate)
                                set_buffer_uptodate(bh);
                        set_buffer_mapped(bh);
                }
                block++;
                bh = bh->b_this_page;
        } while (bh != head);
}

/*
 * Create the page-cache page that contains the requested block.
 *
 * This is used purely for blockdev mappings.
 */
static struct page *
grow_dev_page(struct block_device *bdev, sector_t block,
                pgoff_t index, int size)
{
        struct inode *inode = bdev->bd_inode;
        struct page *page;
        struct buffer_head *bh;

        page = find_or_create_page(inode->i_mapping, index,
                (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
        if (!page)
                return NULL;

        BUG_ON(!PageLocked(page));

        if (page_has_buffers(page)) {
                bh = page_buffers(page);
                if (bh->b_size == size) {
                        init_page_buffers(page, bdev, block, size);
                        return page;
                }
                if (!try_to_free_buffers(page))
                        goto failed;
        }

        /*
         * Allocate some buffers for this page
         */
        bh = alloc_page_buffers(page, size, 0);
        if (!bh)
                goto failed;

        /*
         * Link the page to the buffers and initialise them.  Take the
         * lock to be atomic wrt __find_get_block(), which does not
         * run under the page lock.
         */
        spin_lock(&inode->i_mapping->private_lock);
        link_dev_buffers(page, bh);
        init_page_buffers(page, bdev, block, size);
        spin_unlock(&inode->i_mapping->private_lock);
        return page;

failed:
        BUG();
        unlock_page(page);
        page_cache_release(page);
        return NULL;
}

/*
 * Create buffers for the specified block device block's page.  If
 * that page was dirty, the buffers are set dirty also.
 */
static int
grow_buffers(struct block_device *bdev, sector_t block, int size)
{
        struct page *page;
        pgoff_t index;
        int sizebits;

        sizebits = -1;
        do {
                sizebits++;
        } while ((size << sizebits) < PAGE_SIZE);

        index = block >> sizebits;

        /*
         * Check for a block which wants to lie outside our maximum possible
         * pagecache index.  (this comparison is done using sector_t types).
         */
        if (unlikely(index != block >> sizebits)) {
                char b[BDEVNAME_SIZE];

                printk(KERN_ERR "%s: requested out-of-range block %llu for "
                        "device %s\n",
                        __func__, (unsigned long long)block,
                        bdevname(bdev, b));
                return -EIO;
        }
        block = index << sizebits;
        /* Create a page with the proper size buffers.. */
        page = grow_dev_page(bdev, block, index, size);
        if (!page)
                return 0;
        unlock_page(page);
        page_cache_release(page);
        return 1;
}

static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block, int size)
{
        /* Size must be multiple of hard sectorsize */
        if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
                        (size < 512 || size > PAGE_SIZE))) {
                printk(KERN_ERR "getblk(): invalid block size %d requested\n",
                                        size);
                printk(KERN_ERR "logical block size: %d\n",
                                        bdev_logical_block_size(bdev));

                dump_stack();
                return NULL;
        }

        for (;;) {
                struct buffer_head *bh;
                int ret;

                bh = __find_get_block(bdev, block, size);
                if (bh)
                        return bh;

                ret = grow_buffers(bdev, block, size);
                if (ret < 0)
                        return NULL;
                if (ret == 0)
                        free_more_memory();
        }
}

/*
 * The relationship between dirty buffers and dirty pages:
 *
 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
 * the page is tagged dirty in its radix tree.
 *
 * At all times, the dirtiness of the buffers represents the dirtiness of
 * subsections of the page.  If the page has buffers, the page dirty bit is
 * merely a hint about the true dirty state.
 *
 * When a page is set dirty in its entirety, all its buffers are marked dirty
 * (if the page has buffers).
 *
 * When a buffer is marked dirty, its page is dirtied, but the page's other
 * buffers are not.
 *
 * Also.  When blockdev buffers are explicitly read with bread(), they
 * individually become uptodate.  But their backing page remains not
 * uptodate - even if all of its buffers are uptodate.  A subsequent
 * block_read_full_page() against that page will discover all the uptodate
 * buffers, will set the page uptodate and will perform no I/O.
 */

/**
 * mark_buffer_dirty - mark a buffer_head as needing writeout
 * @bh: the buffer_head to mark dirty
 *
 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
 * backing page dirty, then tag the page as dirty in its address_space's radix
 * tree and then attach the address_space's inode to its superblock's dirty
 * inode list.
 *
 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
 * mapping->tree_lock and mapping->host->i_lock.
 */
void mark_buffer_dirty(struct buffer_head *bh)
{
        WARN_ON_ONCE(!buffer_uptodate(bh));

        /*
         * Very *carefully* optimize the it-is-already-dirty case.
         *
         * Don't let the final "is it dirty" escape to before we
         * perhaps modified the buffer.
         */
        if (buffer_dirty(bh)) {
                smp_mb();
                if (buffer_dirty(bh))
                        return;
        }

        if (!test_set_buffer_dirty(bh)) {
                struct page *page = bh->b_page;
                if (!TestSetPageDirty(page)) {
                        struct address_space *mapping = page_mapping(page);
                        if (mapping)
                                __set_page_dirty(page, mapping, 0);
                }
        }
}
EXPORT_SYMBOL(mark_buffer_dirty);
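
/*
 * Editorial example (not part of the original file): the canonical
 * read-modify-write cycle for a metadata block, combining __bread()
 * (defined below) with mark_buffer_dirty(). Names are illustrative.
 */
#if 0   /* illustrative sketch only */
static int example_update_block(struct block_device *bdev, sector_t blocknr,
                                unsigned size, const void *data, size_t len)
{
        struct buffer_head *bh = __bread(bdev, blocknr, size);

        if (!bh)
                return -EIO;            /* block was unreadable */
        lock_buffer(bh);
        memcpy(bh->b_data, data, len);
        unlock_buffer(bh);
        mark_buffer_dirty(bh);          /* writeback picks it up later */
        brelse(bh);
        return 0;
}
#endif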

/*
 * Decrement a buffer_head's reference count.  If all buffers against a page
 * have zero reference count, are clean and unlocked, and if the page is clean
 * and unlocked then try_to_free_buffers() may strip the buffers from the page
 * in preparation for freeing it (sometimes, rarely, buffers are removed from
 * a page but it ends up not being freed, and buffers may later be reattached).
 */
void __brelse(struct buffer_head *buf)
{
        if (atomic_read(&buf->b_count)) {
                put_bh(buf);
                return;
        }
        WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
}
EXPORT_SYMBOL(__brelse);

/*
 * bforget() is like brelse(), except it discards any
 * potentially dirty data.
 */
void __bforget(struct buffer_head *bh)
{
        clear_buffer_dirty(bh);
        if (bh->b_assoc_map) {
                struct address_space *buffer_mapping = bh->b_page->mapping;

                spin_lock(&buffer_mapping->private_lock);
                list_del_init(&bh->b_assoc_buffers);
                bh->b_assoc_map = NULL;
                spin_unlock(&buffer_mapping->private_lock);
        }
        __brelse(bh);
}
EXPORT_SYMBOL(__bforget);

static struct buffer_head *__bread_slow(struct buffer_head *bh)
{
        lock_buffer(bh);
        if (buffer_uptodate(bh)) {
                unlock_buffer(bh);
                return bh;
        } else {
                get_bh(bh);
                bh->b_end_io = end_buffer_read_sync;
                submit_bh(READ, bh);
                wait_on_buffer(bh);
                if (buffer_uptodate(bh))
                        return bh;
        }
        brelse(bh);
        return NULL;
}

/*
 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
 * refcount elevated by one when they're in an LRU.  A buffer can only appear
 * once in a particular CPU's LRU.  A single buffer can be present in multiple
 * CPU's LRUs at the same time.
 *
 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
 * sb_find_get_block().
 *
 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
 * a local interrupt disable for that.
 */

#define BH_LRU_SIZE     8

struct bh_lru {
        struct buffer_head *bhs[BH_LRU_SIZE];
};

static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

#ifdef CONFIG_SMP
#define bh_lru_lock()   local_irq_disable()
#define bh_lru_unlock() local_irq_enable()
#else
#define bh_lru_lock()   preempt_disable()
#define bh_lru_unlock() preempt_enable()
#endif

static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
        BUG_ON(irqs_disabled());
#endif
}

/*
 * The LRU management algorithm is dopey-but-simple.  Sorry.
 */
static void bh_lru_install(struct buffer_head *bh)
{
        struct buffer_head *evictee = NULL;

        check_irqs_on();
        bh_lru_lock();
        if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
                struct buffer_head *bhs[BH_LRU_SIZE];
                int in;
                int out = 0;

                get_bh(bh);
                bhs[out++] = bh;
                for (in = 0; in < BH_LRU_SIZE; in++) {
                        struct buffer_head *bh2 =
                                __this_cpu_read(bh_lrus.bhs[in]);

                        if (bh2 == bh) {
                                __brelse(bh2);
                        } else {
                                if (out >= BH_LRU_SIZE) {
                                        BUG_ON(evictee != NULL);
                                        evictee = bh2;
                                } else {
                                        bhs[out++] = bh2;
                                }
                        }
                }
                while (out < BH_LRU_SIZE)
                        bhs[out++] = NULL;
                memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
        }
        bh_lru_unlock();

        if (evictee)
                __brelse(evictee);
}

/*
 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
 */
static struct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *ret = NULL;
        unsigned int i;

        check_irqs_on();
        bh_lru_lock();
        for (i = 0; i < BH_LRU_SIZE; i++) {
                struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);

                if (bh && bh->b_bdev == bdev &&
                                bh->b_blocknr == block && bh->b_size == size) {
                        if (i) {
                                while (i) {
                                        __this_cpu_write(bh_lrus.bhs[i],
                                                __this_cpu_read(bh_lrus.bhs[i - 1]));
                                        i--;
                                }
                                __this_cpu_write(bh_lrus.bhs[0], bh);
                        }
                        get_bh(bh);
                        ret = bh;
                        break;
                }
        }
        bh_lru_unlock();
        return ret;
}

/*
 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
 * it in the LRU and mark it as accessed.  If it is not present then return
 * NULL
 */
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *bh = lookup_bh_lru(bdev, block, size);

        if (bh == NULL) {
                bh = __find_get_block_slow(bdev, block);
                if (bh)
                        bh_lru_install(bh);
        }
        if (bh)
                touch_buffer(bh);
        return bh;
}
EXPORT_SYMBOL(__find_get_block);

/*
 * __getblk will locate (and, if necessary, create) the buffer_head
 * which corresponds to the passed block_device, block and size.  The
 * returned buffer has its reference count incremented.
 *
 * __getblk() cannot fail - it just keeps trying.  If you pass it an
 * illegal block number, __getblk() will happily return a buffer_head
 * which represents the non-existent block.  Very weird.
 *
 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
 * attempt is failing.  FIXME, perhaps?
 */
struct buffer_head *
__getblk(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *bh = __find_get_block(bdev, block, size);

        might_sleep();
        if (bh == NULL)
                bh = __getblk_slow(bdev, block, size);
        return bh;
}
EXPORT_SYMBOL(__getblk);

/*
 * Do async read-ahead on a buffer..
 */
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *bh = __getblk(bdev, block, size);
        if (likely(bh)) {
                ll_rw_block(READA, 1, &bh);
                brelse(bh);
        }
}
EXPORT_SYMBOL(__breadahead);

/**
 * __bread() - reads a specified block and returns the bh
 * @bdev: the block_device to read from
 * @block: number of block
 * @size: size (in bytes) to read
 *
 * Reads a specified block, and returns buffer head that contains it.
 * It returns NULL if the block was unreadable.
 */
struct buffer_head *
__bread(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *bh = __getblk(bdev, block, size);

        if (likely(bh) && !buffer_uptodate(bh))
                bh = __bread_slow(bh);
        return bh;
}
EXPORT_SYMBOL(__bread);
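
/*
 * Editorial example (not part of the original file): typical read-only
 * use of __bread(), which sb_bread() wraps for filesystems. Reading
 * "block 1" of a 1KiB-block device is an illustrative stand-in for a
 * superblock read.
 */
#if 0   /* illustrative sketch only */
static int example_read_block(struct block_device *bdev, void *out, size_t len)
{
        struct buffer_head *bh = __bread(bdev, 1, 1024);

        if (!bh)
                return -EIO;
        memcpy(out, bh->b_data, min_t(size_t, len, bh->b_size));
        brelse(bh);                     /* drop the reference __bread() took */
        return 0;
}
#endif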

/*
 * invalidate_bh_lrus() is called rarely - but not only at unmount.
 * This doesn't race because it runs in each cpu either in irq
 * or with preempt disabled.
 */
static void invalidate_bh_lru(void *arg)
{
        struct bh_lru *b = &get_cpu_var(bh_lrus);
        int i;

        for (i = 0; i < BH_LRU_SIZE; i++) {
                brelse(b->bhs[i]);
                b->bhs[i] = NULL;
        }
        put_cpu_var(bh_lrus);
}

void invalidate_bh_lrus(void)
{
        on_each_cpu(invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);

void set_bh_page(struct buffer_head *bh,
                struct page *page, unsigned long offset)
{
        bh->b_page = page;
        BUG_ON(offset >= PAGE_SIZE);
        if (PageHighMem(page))
                /*
                 * This catches illegal uses and preserves the offset:
                 */
                bh->b_data = (char *)(0 + offset);
        else
                bh->b_data = page_address(page) + offset;
}
EXPORT_SYMBOL(set_bh_page);

/*
 * Called when truncating a buffer on a page completely.
 */
static void discard_buffer(struct buffer_head *bh)
{
        lock_buffer(bh);
        clear_buffer_dirty(bh);
        bh->b_bdev = NULL;
        clear_buffer_mapped(bh);
        clear_buffer_req(bh);
        clear_buffer_new(bh);
        clear_buffer_delay(bh);
        clear_buffer_unwritten(bh);
        unlock_buffer(bh);
}

/**
 * block_invalidatepage - invalidate part or all of a buffer-backed page
 *
 * @page: the page which is affected
 * @offset: the index of the truncation point
 *
 * block_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
 *
 * block_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point.  Because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void block_invalidatepage(struct page *page, unsigned long offset)
{
        struct buffer_head *head, *bh, *next;
        unsigned int curr_off = 0;

        BUG_ON(!PageLocked(page));
        if (!page_has_buffers(page))
                goto out;

        head = page_buffers(page);
        bh = head;
        do {
                unsigned int next_off = curr_off + bh->b_size;
                next = bh->b_this_page;

                /*
                 * is this block fully invalidated?
                 */
                if (offset <= curr_off)
                        discard_buffer(bh);
                curr_off = next_off;
                bh = next;
        } while (bh != head);

        /*
         * We release buffers only if the entire page is being invalidated.
         * The get_block cached value has been unconditionally invalidated,
         * so real IO is not possible anymore.
         */
        if (offset == 0)
                try_to_release_page(page, 0);
out:
        return;
}
EXPORT_SYMBOL(block_invalidatepage);
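
/*
 * Editorial example (not part of the original file): buffer-head based
 * filesystems usually point ->invalidatepage at block_invalidatepage()
 * or wrap it like this; the wrapper name is hypothetical.
 */
#if 0   /* illustrative sketch only */
static void example_invalidatepage(struct page *page, unsigned long offset)
{
        /* fs-private cleanup for the truncated range would go here */
        block_invalidatepage(page, offset);
}
#endif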

/*
 * We attach and possibly dirty the buffers atomically wrt
 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
 * is already excluded via the page lock.
 */
void create_empty_buffers(struct page *page,
                        unsigned long blocksize, unsigned long b_state)
{
        struct buffer_head *bh, *head, *tail;

        head = alloc_page_buffers(page, blocksize, 1);
        bh = head;
        do {
                bh->b_state |= b_state;
                tail = bh;
                bh = bh->b_this_page;
        } while (bh);
        tail->b_this_page = head;

        spin_lock(&page->mapping->private_lock);
        if (PageUptodate(page) || PageDirty(page)) {
                bh = head;
                do {
                        if (PageDirty(page))
                                set_buffer_dirty(bh);
                        if (PageUptodate(page))
                                set_buffer_uptodate(bh);
                        bh = bh->b_this_page;
                } while (bh != head);
        }
        attach_page_buffers(page, head);
        spin_unlock(&page->mapping->private_lock);
}
EXPORT_SYMBOL(create_empty_buffers);
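
/*
 * Editorial example (not part of the original file): the
 * attach-if-missing pattern used at the top of the readpage/writepage
 * helpers later in this file (__block_write_full_page,
 * __block_write_begin); the wrapper name is hypothetical.
 */
#if 0   /* illustrative sketch only */
static struct buffer_head *example_page_buffers(struct page *page,
                                                struct inode *inode)
{
        if (!page_has_buffers(page))
                create_empty_buffers(page, 1 << inode->i_blkbits, 0);
        return page_buffers(page);
}
#endif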

/*
 * We are taking a block for data and we don't want any output from any
 * buffer-cache aliases starting from return from that function and
 * until the moment when something will explicitly mark the buffer
 * dirty (hopefully that will not happen until we free that block ;-)
 * We don't even need to mark it not-uptodate - nobody can expect
 * anything from a newly allocated buffer anyway.  We used to use
 * unmap_buffer() for such invalidation, but that was wrong.  We definitely
 * don't want to mark the alias unmapped, for example - it would confuse
 * anyone who might pick it with bread() afterwards...
 *
 * Also..  Note that bforget() doesn't lock the buffer.  So there can
 * be writeout I/O going on against recently-freed buffers.  We don't
 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
 * only if we really need to.  That happens here.
 */
void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
{
        struct buffer_head *old_bh;

        might_sleep();

        old_bh = __find_get_block_slow(bdev, block);
        if (old_bh) {
                clear_buffer_dirty(old_bh);
                wait_on_buffer(old_bh);
                clear_buffer_req(old_bh);
                __brelse(old_bh);
        }
}
EXPORT_SYMBOL(unmap_underlying_metadata);

/*
 * NOTE! All mapped/uptodate combinations are valid:
 *
 *      Mapped  Uptodate        Meaning
 *
 *      No      No              "unknown" - must do get_block()
 *      No      Yes             "hole" - zero-filled
 *      Yes     No              "allocated" - allocated on disk, not read in
 *      Yes     Yes             "valid" - allocated and up-to-date in memory.
 *
 * "Dirty" is valid only with the last case (mapped+uptodate).
 */

/*
 * While block_write_full_page is writing back the dirty buffers under
 * the page lock, whoever dirtied the buffers may decide to clean them
 * again at any time.  We handle that by only looking at the buffer
 * state inside lock_buffer().
 *
 * If block_write_full_page() is called for regular writeback
 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 * locked buffer.  This only can happen if someone has written the buffer
 * directly, with submit_bh().  At the address_space level PageWriteback
 * prevents this contention from occurring.
 *
 * If block_write_full_page() is called with wbc->sync_mode ==
 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
 * causes the writes to be flagged as synchronous writes.
 */
static int __block_write_full_page(struct inode *inode, struct page *page,
                        get_block_t *get_block, struct writeback_control *wbc,
                        bh_end_io_t *handler)
{
        int err;
        sector_t block;
        sector_t last_block;
        struct buffer_head *bh, *head;
        const unsigned blocksize = 1 << inode->i_blkbits;
        int nr_underway = 0;
        int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
                        WRITE_SYNC : WRITE);

        BUG_ON(!PageLocked(page));

        last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;

        if (!page_has_buffers(page)) {
                create_empty_buffers(page, blocksize,
                                        (1 << BH_Dirty)|(1 << BH_Uptodate));
        }

        /*
         * Be very careful.  We have no exclusion from __set_page_dirty_buffers
         * here, and the (potentially unmapped) buffers may become dirty at
         * any time.  If a buffer becomes dirty here after we've inspected it
         * then we just miss that fact, and the page stays dirty.
         *
         * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
         * handle that here by just cleaning them.
         */

        block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
        head = page_buffers(page);
        bh = head;

        /*
         * Get all the dirty buffers mapped to disk addresses and
         * handle any aliases from the underlying blockdev's mapping.
         */
        do {
                if (block > last_block) {
                        /*
                         * mapped buffers outside i_size will occur, because
                         * this page can be outside i_size when there is a
                         * truncate in progress.
                         */
                        /*
                         * The buffer was zeroed by block_write_full_page()
                         */
                        clear_buffer_dirty(bh);
                        set_buffer_uptodate(bh);
                } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
                           buffer_dirty(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        err = get_block(inode, block, bh, 1);
                        if (err)
                                goto recover;
                        clear_buffer_delay(bh);
                        if (buffer_new(bh)) {
                                /* blockdev mappings never come here */
                                clear_buffer_new(bh);
                                unmap_underlying_metadata(bh->b_bdev,
                                                        bh->b_blocknr);
                        }
                }
                bh = bh->b_this_page;
                block++;
        } while (bh != head);

        do {
                if (!buffer_mapped(bh))
                        continue;
                /*
                 * If it's a fully non-blocking write attempt and we cannot
                 * lock the buffer then redirty the page.  Note that this can
                 * potentially cause a busy-wait loop from writeback threads
                 * and kswapd activity, but those code paths have their own
                 * higher-level throttling.
                 */
                if (wbc->sync_mode != WB_SYNC_NONE) {
                        lock_buffer(bh);
                } else if (!trylock_buffer(bh)) {
                        redirty_page_for_writepage(wbc, page);
                        continue;
                }
                if (test_clear_buffer_dirty(bh)) {
                        mark_buffer_async_write_endio(bh, handler);
                } else {
                        unlock_buffer(bh);
                }
        } while ((bh = bh->b_this_page) != head);

        /*
         * The page and its buffers are protected by PageWriteback(), so we can
         * drop the bh refcounts early.
         */
        BUG_ON(PageWriteback(page));
        set_page_writeback(page);

        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        submit_bh(write_op, bh);
                        nr_underway++;
                }
                bh = next;
        } while (bh != head);
        unlock_page(page);

        err = 0;
done:
        if (nr_underway == 0) {
                /*
                 * The page was marked dirty, but the buffers were
                 * clean.  Someone wrote them back by hand with
                 * ll_rw_block/submit_bh.  A rare case.
                 */
                end_page_writeback(page);

                /*
                 * The page and buffer_heads can be released at any time from
                 * here on.
                 */
        }
        return err;

recover:
        /*
         * ENOSPC, or some other error.  We may already have added some
         * blocks to the file, so we need to write these out to avoid
         * exposing stale data.
         * The page is currently locked and not marked for writeback
         */
        bh = head;
        /* Recovery: lock and submit the mapped buffers */
        do {
                if (buffer_mapped(bh) && buffer_dirty(bh) &&
                    !buffer_delay(bh)) {
                        lock_buffer(bh);
                        mark_buffer_async_write_endio(bh, handler);
                } else {
                        /*
                         * The buffer may have been set dirty during
                         * attachment to a dirty page.
                         */
                        clear_buffer_dirty(bh);
                }
        } while ((bh = bh->b_this_page) != head);
        SetPageError(page);
        BUG_ON(PageWriteback(page));
        mapping_set_error(page->mapping, err);
        set_page_writeback(page);
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        clear_buffer_dirty(bh);
                        submit_bh(write_op, bh);
                        nr_underway++;
                }
                bh = next;
        } while (bh != head);
        unlock_page(page);
        goto done;
}

/*
 * If a page has any new buffers, zero them out here, and mark them uptodate
 * and dirty so they'll be written out (in order to prevent uninitialised
 * block data from leaking).  And clear the new bit.
 */
void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
{
        unsigned int block_start, block_end;
        struct buffer_head *head, *bh;

        BUG_ON(!PageLocked(page));
        if (!page_has_buffers(page))
                return;

        bh = head = page_buffers(page);
        block_start = 0;
        do {
                block_end = block_start + bh->b_size;

                if (buffer_new(bh)) {
                        if (block_end > from && block_start < to) {
                                if (!PageUptodate(page)) {
                                        unsigned start, size;

                                        start = max(from, block_start);
                                        size = min(to, block_end) - start;

                                        zero_user(page, start, size);
                                        set_buffer_uptodate(bh);
                                }

                                clear_buffer_new(bh);
                                mark_buffer_dirty(bh);
                        }
                }

                block_start = block_end;
                bh = bh->b_this_page;
        } while (bh != head);
}
EXPORT_SYMBOL(page_zero_new_buffers);

int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                get_block_t *get_block)
{
        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
        unsigned to = from + len;
        struct inode *inode = page->mapping->host;
        unsigned block_start, block_end;
        sector_t block;
        int err = 0;
        unsigned blocksize, bbits;
        struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;

        BUG_ON(!PageLocked(page));
        BUG_ON(from > PAGE_CACHE_SIZE);
        BUG_ON(to > PAGE_CACHE_SIZE);
        BUG_ON(from > to);

        blocksize = 1 << inode->i_blkbits;
        if (!page_has_buffers(page))
                create_empty_buffers(page, blocksize, 0);
        head = page_buffers(page);

        bbits = inode->i_blkbits;
        block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);

        for (bh = head, block_start = 0; bh != head || !block_start;
            block++, block_start = block_end, bh = bh->b_this_page) {
                block_end = block_start + blocksize;
                if (block_end <= from || block_start >= to) {
                        if (PageUptodate(page)) {
                                if (!buffer_uptodate(bh))
                                        set_buffer_uptodate(bh);
                        }
                        continue;
                }
                if (buffer_new(bh))
                        clear_buffer_new(bh);
                if (!buffer_mapped(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        err = get_block(inode, block, bh, 1);
                        if (err)
                                break;
                        if (buffer_new(bh)) {
                                unmap_underlying_metadata(bh->b_bdev,
                                                        bh->b_blocknr);
                                if (PageUptodate(page)) {
                                        clear_buffer_new(bh);
                                        set_buffer_uptodate(bh);
                                        mark_buffer_dirty(bh);
                                        continue;
                                }
                                if (block_end > to || block_start < from)
                                        zero_user_segments(page,
                                                to, block_end,
                                                block_start, from);
                                continue;
                        }
                }
                if (PageUptodate(page)) {
                        if (!buffer_uptodate(bh))
                                set_buffer_uptodate(bh);
                        continue;
                }
                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                    !buffer_unwritten(bh) &&
                    (block_start < from || block_end > to)) {
                        ll_rw_block(READ, 1, &bh);
                        *wait_bh++ = bh;
                }
        }
        /*
         * If we issued read requests - let them complete.
         */
        while (wait_bh > wait) {
                wait_on_buffer(*--wait_bh);
                if (!buffer_uptodate(*wait_bh))
                        err = -EIO;
        }
        if (unlikely(err))
                page_zero_new_buffers(page, from, to);
        return err;
}
EXPORT_SYMBOL(__block_write_begin);
1910
1911
static int __block_commit_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to)
{
	unsigned block_start, block_end;
	int partial = 0;
	unsigned blocksize;
	struct buffer_head *bh, *head;

	blocksize = 1 << inode->i_blkbits;

	for (bh = head = page_buffers(page), block_start = 0;
	    bh != head || !block_start;
	    block_start = block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (!buffer_uptodate(bh))
				partial = 1;
		} else {
			set_buffer_uptodate(bh);
			mark_buffer_dirty(bh);
		}
		clear_buffer_new(bh);
	}

	/*
	 * If this is a partial write which happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' whether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (!partial)
		SetPageUptodate(page);
	return 0;
}

/*
 * block_write_begin takes care of the basic task of block allocation and
 * bringing partial write blocks uptodate first.
 *
 * The filesystem needs to handle block truncation upon failure.
 */
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
		unsigned flags, struct page **pagep, get_block_t *get_block)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *page;
	int status;

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;

	status = __block_write_begin(page, pos, len, get_block);
	if (unlikely(status)) {
		unlock_page(page);
		page_cache_release(page);
		page = NULL;
	}

	*pagep = page;
	return status;
}
EXPORT_SYMBOL(block_write_begin);

int block_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	unsigned start;

	start = pos & (PAGE_CACHE_SIZE - 1);

	if (unlikely(copied < len)) {
		/*
		 * The buffers that were written will now be uptodate, so we
		 * don't have to worry about a readpage reading them and
		 * overwriting a partial write. However if we have encountered
		 * a short write and only partially written into a buffer, it
		 * will not be marked uptodate, so a readpage might come in and
		 * destroy our partial write.
		 *
		 * Do the simplest thing, and just treat any short write to a
		 * non uptodate page as a zero-length write, and force the
		 * caller to redo the whole thing.
		 */
		if (!PageUptodate(page))
			copied = 0;

		page_zero_new_buffers(page, start+copied, start+len);
	}
	flush_dcache_page(page);

	/* This could be a short (even 0-length) commit */
	__block_commit_write(inode, page, start, start+copied);

	return copied;
}
EXPORT_SYMBOL(block_write_end);

int generic_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	int i_size_changed = 0;

	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	/*
	 * No need to use i_size_read() here, the i_size
	 * cannot change under us because we hold i_mutex.
	 *
	 * But it's important to update i_size while still holding page lock:
	 * page writeout could otherwise come in and zero beyond i_size.
	 */
	if (pos+copied > inode->i_size) {
		i_size_write(inode, pos+copied);
		i_size_changed = 1;
	}

	unlock_page(page);
	page_cache_release(page);

	/*
	 * Don't mark the inode dirty under page lock. First, it unnecessarily
	 * makes the holding time of page lock longer. Second, it forces lock
	 * ordering of page lock and transaction start for journaling
	 * filesystems.
	 */
	if (i_size_changed)
		mark_inode_dirty(inode);

	return copied;
}
EXPORT_SYMBOL(generic_write_end);

/*
 * block_is_partially_uptodate checks whether buffers within a page are
 * uptodate or not.
 *
 * Returns true if all buffers which correspond to a file portion
 * we want to read are uptodate.
 */
int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
					unsigned long from)
{
	struct inode *inode = page->mapping->host;
	unsigned block_start, block_end, blocksize;
	unsigned to;
	struct buffer_head *bh, *head;
	int ret = 1;

	if (!page_has_buffers(page))
		return 0;

	blocksize = 1 << inode->i_blkbits;
	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
	to = from + to;
	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
		return 0;

	head = page_buffers(page);
	bh = head;
	block_start = 0;
	do {
		block_end = block_start + blocksize;
		if (block_end > from && block_start < to) {
			if (!buffer_uptodate(bh)) {
				ret = 0;
				break;
			}
			if (block_end >= to)
				break;
		}
		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);

	return ret;
}
EXPORT_SYMBOL(block_is_partially_uptodate);

/*
 * Generic "read page" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
 * Reads the page asynchronously --- the unlock_buffer() and
 * set/clear_buffer_uptodate() functions propagate buffer state into the
 * page struct once IO has completed.
 */
int block_read_full_page(struct page *page, get_block_t *get_block)
{
	struct inode *inode = page->mapping->host;
	sector_t iblock, lblock;
	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
	unsigned int blocksize;
	int nr, i;
	int fully_mapped = 1;

	BUG_ON(!PageLocked(page));
	blocksize = 1 << inode->i_blkbits;
	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);
	head = page_buffers(page);

	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
	bh = head;
	nr = 0;
	i = 0;

	do {
		if (buffer_uptodate(bh))
			continue;

		if (!buffer_mapped(bh)) {
			int err = 0;

			fully_mapped = 0;
			if (iblock < lblock) {
				WARN_ON(bh->b_size != blocksize);
				err = get_block(inode, iblock, bh, 0);
				if (err)
					SetPageError(page);
			}
			if (!buffer_mapped(bh)) {
				zero_user(page, i * blocksize, blocksize);
				if (!err)
					set_buffer_uptodate(bh);
				continue;
			}
			/*
			 * get_block() might have updated the buffer
			 * synchronously
			 */
			if (buffer_uptodate(bh))
				continue;
		}
		arr[nr++] = bh;
	} while (i++, iblock++, (bh = bh->b_this_page) != head);

	if (fully_mapped)
		SetPageMappedToDisk(page);

	if (!nr) {
		/*
		 * All buffers are uptodate - we can set the page uptodate
		 * as well. But not if get_block() returned an error.
		 */
		if (!PageError(page))
			SetPageUptodate(page);
		unlock_page(page);
		return 0;
	}

	/* Stage two: lock the buffers */
	for (i = 0; i < nr; i++) {
		bh = arr[i];
		lock_buffer(bh);
		mark_buffer_async_read(bh);
	}

	/*
	 * Stage 3: start the IO.  Check for uptodateness
	 * inside the buffer lock in case another process reading
	 * the underlying blockdev brought it uptodate (the sct fix).
	 */
	for (i = 0; i < nr; i++) {
		bh = arr[i];
		if (buffer_uptodate(bh))
			end_buffer_async_read(bh, 1);
		else
			submit_bh(READ, bh);
	}
	return 0;
}
EXPORT_SYMBOL(block_read_full_page);

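/*
 * Illustrative sketch (not part of this file): the matching ->readpage
 * aop for the hypothetical "myfs" is typically a one-line wrapper:
 *
 *	static int myfs_readpage(struct file *file, struct page *page)
 *	{
 *		return block_read_full_page(page, myfs_get_block);
 *	}
 *
 * get_block is called with create == 0 here, so a hole simply stays
 * unmapped and the corresponding page range is zero-filled.
 */
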
/* utility function for filesystems that need to do work on expanding
 * truncates.  Uses filesystem pagecache writes to allow the filesystem to
 * deal with the hole.
 */
int generic_cont_expand_simple(struct inode *inode, loff_t size)
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page;
	void *fsdata;
	int err;

	err = inode_newsize_ok(inode, size);
	if (err)
		goto out;

	err = pagecache_write_begin(NULL, mapping, size, 0,
				AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
				&page, &fsdata);
	if (err)
		goto out;

	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
	BUG_ON(err > 0);

out:
	return err;
}
EXPORT_SYMBOL(generic_cont_expand_simple);

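/*
 * Illustrative sketch (not part of this file, and the ->setattr
 * fragment below is only an assumption about a typical caller):
 * filesystems that must not expose stale data on an expanding truncate
 * can call this helper before committing the new size:
 *
 *	if ((attr->ia_valid & ATTR_SIZE) &&
 *	    attr->ia_size > i_size_read(inode)) {
 *		error = generic_cont_expand_simple(inode, attr->ia_size);
 *		if (error)
 *			return error;
 *	}
 *
 * The zero-length pagecache write with AOP_FLAG_CONT_EXPAND lets the
 * filesystem's own write_begin path map or zero the block at new EOF.
 */
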
static int cont_expand_zero(struct file *file, struct address_space *mapping,
			loff_t pos, loff_t *bytes)
{
	struct inode *inode = mapping->host;
	unsigned blocksize = 1 << inode->i_blkbits;
	struct page *page;
	void *fsdata;
	pgoff_t index, curidx;
	loff_t curpos;
	unsigned zerofrom, offset, len;
	int err = 0;

	index = pos >> PAGE_CACHE_SHIFT;
	offset = pos & ~PAGE_CACHE_MASK;

	while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
		zerofrom = curpos & ~PAGE_CACHE_MASK;
		if (zerofrom & (blocksize-1)) {
			*bytes |= (blocksize-1);
			(*bytes)++;
		}
		len = PAGE_CACHE_SIZE - zerofrom;

		err = pagecache_write_begin(file, mapping, curpos, len,
						AOP_FLAG_UNINTERRUPTIBLE,
						&page, &fsdata);
		if (err)
			goto out;
		zero_user(page, zerofrom, len);
		err = pagecache_write_end(file, mapping, curpos, len, len,
						page, fsdata);
		if (err < 0)
			goto out;
		BUG_ON(err != len);
		err = 0;

		balance_dirty_pages_ratelimited(mapping);
	}

	/* page covers the boundary, find the boundary offset */
	if (index == curidx) {
		zerofrom = curpos & ~PAGE_CACHE_MASK;
		/* if we will expand the file, the last block will be filled */
		if (offset <= zerofrom) {
			goto out;
		}
		if (zerofrom & (blocksize-1)) {
			*bytes |= (blocksize-1);
			(*bytes)++;
		}
		len = offset - zerofrom;

		err = pagecache_write_begin(file, mapping, curpos, len,
						AOP_FLAG_UNINTERRUPTIBLE,
						&page, &fsdata);
		if (err)
			goto out;
		zero_user(page, zerofrom, len);
		err = pagecache_write_end(file, mapping, curpos, len, len,
						page, fsdata);
		if (err < 0)
			goto out;
		BUG_ON(err != len);
		err = 0;
	}
out:
	return err;
}

/*
 * For moronic filesystems that do not allow holes in files.
 * We may have to extend the file.
 */
int cont_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata,
			get_block_t *get_block, loff_t *bytes)
{
	struct inode *inode = mapping->host;
	unsigned blocksize = 1 << inode->i_blkbits;
	unsigned zerofrom;
	int err;

	err = cont_expand_zero(file, mapping, pos, bytes);
	if (err)
		return err;

	zerofrom = *bytes & ~PAGE_CACHE_MASK;
	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
		*bytes |= (blocksize-1);
		(*bytes)++;
	}

	return block_write_begin(mapping, pos, len, flags, pagep, get_block);
}
EXPORT_SYMBOL(cont_write_begin);

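/*
 * Illustrative sketch (not part of this file): a hole-less filesystem
 * wires cont_write_begin() up by passing a pointer to its private
 * "valid data length" field, in the style of fat's mmu_private.  The
 * "myfs" names and the mmu_private field are hypothetical here:
 *
 *	static int myfs_write_begin(struct file *file,
 *			struct address_space *mapping, loff_t pos,
 *			unsigned len, unsigned flags,
 *			struct page **pagep, void **fsdata)
 *	{
 *		return cont_write_begin(file, mapping, pos, len, flags,
 *					pagep, fsdata, myfs_get_block,
 *					&MYFS_I(mapping->host)->mmu_private);
 *	}
 *
 * Everything between the old *bytes and pos is zeroed via ordinary
 * pagecache writes before the real write is allowed to begin.
 */
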
int block_commit_write(struct page *page, unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;
	__block_commit_write(inode, page, from, to);
	return 0;
}
EXPORT_SYMBOL(block_commit_write);

/*
 * block_page_mkwrite() is not allowed to change the file size as it gets
 * called from a page fault handler when a page is first dirtied. Hence we must
 * be careful to check for EOF conditions here. We set the page up correctly
 * for a written page which means we get ENOSPC checking when writing into
 * holes and correct delalloc and unwritten extent mapping on filesystems that
 * support these features.
 *
 * We are not allowed to take the i_mutex here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF.  Because
 * truncate writes the inode size before removing pages, once we have the
 * page lock we can determine safely if the page is beyond EOF. If it is not
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
 *
 * Direct callers of this function should call vfs_check_frozen() so that page
 * fault does not busyloop until the fs is thawed.
 */
int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
			 get_block_t get_block)
{
	struct page *page = vmf->page;
	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
	unsigned long end;
	loff_t size;
	int ret;

	lock_page(page);
	size = i_size_read(inode);
	if ((page->mapping != inode->i_mapping) ||
	    (page_offset(page) > size)) {
		/* We overload EFAULT to mean page got truncated */
		ret = -EFAULT;
		goto out_unlock;
	}

	/* page is wholly or partially inside EOF */
	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
		end = size & ~PAGE_CACHE_MASK;
	else
		end = PAGE_CACHE_SIZE;

	ret = __block_write_begin(page, 0, end, get_block);
	if (!ret)
		ret = block_commit_write(page, 0, end);

	if (unlikely(ret < 0))
		goto out_unlock;
	/*
	 * Freezing in progress? We check after the page is marked dirty and
	 * with page lock held so if the test here fails, we are sure freezing
	 * code will wait during syncing until the page fault is done - at that
	 * point page will be dirty and unlocked so freezing code will write it
	 * and writeprotect it again.
	 */
	set_page_dirty(page);
	if (inode->i_sb->s_frozen != SB_UNFROZEN) {
		ret = -EAGAIN;
		goto out_unlock;
	}
	wait_on_page_writeback(page);
	return 0;
out_unlock:
	unlock_page(page);
	return ret;
}
EXPORT_SYMBOL(__block_page_mkwrite);

int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
		   get_block_t get_block)
{
	int ret;
	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;

	/*
	 * This check is racy but catches the common case. The check in
	 * __block_page_mkwrite() is reliable.
	 */
	vfs_check_frozen(sb, SB_FREEZE_WRITE);
	ret = __block_page_mkwrite(vma, vmf, get_block);
	return block_page_mkwrite_return(ret);
}
EXPORT_SYMBOL(block_page_mkwrite);

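/*
 * Illustrative sketch (not part of this file): a filesystem exposes
 * this through its vm_operations_struct, again with hypothetical
 * "myfs" names:
 *
 *	static int myfs_page_mkwrite(struct vm_area_struct *vma,
 *				     struct vm_fault *vmf)
 *	{
 *		return block_page_mkwrite(vma, vmf, myfs_get_block);
 *	}
 *
 *	static const struct vm_operations_struct myfs_vm_ops = {
 *		.fault		= filemap_fault,
 *		.page_mkwrite	= myfs_page_mkwrite,
 *	};
 *
 * block_page_mkwrite_return() translates the errno into the
 * VM_FAULT_* code that the fault handler expects.
 */
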
/*
 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
 * immediately, while under the page lock.  So it needs a special end_io
 * handler which does not touch the bh after unlocking it.
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
}

/*
 * Attach the singly-linked list of buffers created by nobh_write_begin, to
 * the page (converting it to circular linked list and taking care of page
 * dirty races).
 */
static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
{
	struct buffer_head *bh;

	BUG_ON(!PageLocked(page));

	spin_lock(&page->mapping->private_lock);
	bh = head;
	do {
		if (PageDirty(page))
			set_buffer_dirty(bh);
		if (!bh->b_this_page)
			bh->b_this_page = head;
		bh = bh->b_this_page;
	} while (bh != head);
	attach_page_buffers(page, head);
	spin_unlock(&page->mapping->private_lock);
}

/*
 * On entry, the page is fully not uptodate.
 * On exit the page is fully uptodate in the areas outside (from,to)
 * The filesystem needs to handle block truncation upon failure.
 */
int nobh_write_begin(struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata,
			get_block_t *get_block)
{
	struct inode *inode = mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocksize = 1 << blkbits;
	struct buffer_head *head, *bh;
	struct page *page;
	pgoff_t index;
	unsigned from, to;
	unsigned block_in_page;
	unsigned block_start, block_end;
	sector_t block_in_file;
	int nr_reads = 0;
	int ret = 0;
	int is_mapped_to_disk = 1;

	index = pos >> PAGE_CACHE_SHIFT;
	from = pos & (PAGE_CACHE_SIZE - 1);
	to = from + len;

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;
	*pagep = page;
	*fsdata = NULL;

	if (page_has_buffers(page)) {
		ret = __block_write_begin(page, pos, len, get_block);
		if (unlikely(ret))
			goto out_release;
		return ret;
	}

	if (PageMappedToDisk(page))
		return 0;

	/*
	 * Allocate buffers so that we can keep track of state, and potentially
	 * attach them to the page if an error occurs. In the common case of
	 * no error, they will just be freed again without ever being attached
	 * to the page (which is all OK, because we're under the page lock).
	 *
	 * Be careful: the buffer linked list is a NULL terminated one, rather
	 * than the circular one we're used to.
	 */
	head = alloc_page_buffers(page, blocksize, 0);
	if (!head) {
		ret = -ENOMEM;
		goto out_release;
	}

	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);

	/*
	 * We loop across all blocks in the page, whether or not they are
	 * part of the affected region.  This is so we can discover if the
	 * page is fully mapped-to-disk.
	 */
	for (block_start = 0, block_in_page = 0, bh = head;
	    block_start < PAGE_CACHE_SIZE;
	    block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
		int create;

		block_end = block_start + blocksize;
		bh->b_state = 0;
		create = 1;
		if (block_start >= to)
			create = 0;
		ret = get_block(inode, block_in_file + block_in_page,
					bh, create);
		if (ret)
			goto failed;
		if (!buffer_mapped(bh))
			is_mapped_to_disk = 0;
		if (buffer_new(bh))
			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
		if (PageUptodate(page)) {
			set_buffer_uptodate(bh);
			continue;
		}
		if (buffer_new(bh) || !buffer_mapped(bh)) {
			zero_user_segments(page, block_start, from,
							to, block_end);
			continue;
		}
		if (buffer_uptodate(bh))
			continue;	/* reiserfs does this */
		if (block_start < from || block_end > to) {
			lock_buffer(bh);
			bh->b_end_io = end_buffer_read_nobh;
			submit_bh(READ, bh);
			nr_reads++;
		}
	}

	if (nr_reads) {
		/*
		 * The page is locked, so these buffers are protected from
		 * any VM or truncate activity.  Hence we don't need to care
		 * for the buffer_head refcounts.
		 */
		for (bh = head; bh; bh = bh->b_this_page) {
			wait_on_buffer(bh);
			if (!buffer_uptodate(bh))
				ret = -EIO;
		}
		if (ret)
			goto failed;
	}

	if (is_mapped_to_disk)
		SetPageMappedToDisk(page);

	*fsdata = head; /* to be released by nobh_write_end */

	return 0;

failed:
	BUG_ON(!ret);
	/*
	 * Error recovery is a bit difficult. We need to zero out blocks that
	 * were newly allocated, and dirty them to ensure they get written out.
	 * Buffers need to be attached to the page at this point, otherwise
	 * the handling of potential IO errors during writeout would be hard
	 * (could try doing synchronous writeout, but what if that fails too?)
	 */
	attach_nobh_buffers(page, head);
	page_zero_new_buffers(page, from, to);

out_release:
	unlock_page(page);
	page_cache_release(page);
	*pagep = NULL;

	return ret;
}
EXPORT_SYMBOL(nobh_write_begin);

int nobh_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = page->mapping->host;
	struct buffer_head *head = fsdata;
	struct buffer_head *bh;
	BUG_ON(fsdata != NULL && page_has_buffers(page));

	if (unlikely(copied < len) && head)
		attach_nobh_buffers(page, head);
	if (page_has_buffers(page))
		return generic_write_end(file, mapping, pos, len,
					copied, page, fsdata);

	SetPageUptodate(page);
	set_page_dirty(page);
	if (pos+copied > inode->i_size) {
		i_size_write(inode, pos+copied);
		mark_inode_dirty(inode);
	}

	unlock_page(page);
	page_cache_release(page);

	while (head) {
		bh = head;
		head = head->b_this_page;
		free_buffer_head(bh);
	}

	return copied;
}
EXPORT_SYMBOL(nobh_write_end);

/*
 * nobh_writepage() - based on block_write_full_page() except
 * that it tries to operate without attaching bufferheads to
 * the page.
 */
int nobh_writepage(struct page *page, get_block_t *get_block,
			struct writeback_control *wbc)
{
	struct inode * const inode = page->mapping->host;
	loff_t i_size = i_size_read(inode);
	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
	unsigned offset;
	int ret;

	/* Is the page fully inside i_size? */
	if (page->index < end_index)
		goto out;

	/* Is the page fully outside i_size? (truncate in progress) */
	offset = i_size & (PAGE_CACHE_SIZE-1);
	if (page->index >= end_index+1 || !offset) {
		/*
		 * The page may have dirty, unmapped buffers.  For example,
		 * they may have been added in ext3_writepage().  Make them
		 * freeable here, so the page does not leak.
		 */
#if 0
		/* Not really sure about this  - do we need this ? */
		if (page->mapping->a_ops->invalidatepage)
			page->mapping->a_ops->invalidatepage(page, offset);
#endif
		unlock_page(page);
		return 0; /* don't care */
	}

	/*
	 * The page straddles i_size.  It must be zeroed out on each and every
	 * writepage invocation because it may be mmapped.  "A file is mapped
	 * in multiples of the page size.  For a file that is not a multiple of
	 * the page size, the remaining memory is zeroed when mapped, and
	 * writes to that region are not written out to the file."
	 */
	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
out:
	ret = mpage_writepage(page, get_block, wbc);
	if (ret == -EAGAIN)
		ret = __block_write_full_page(inode, page, get_block, wbc,
					      end_buffer_async_write);
	return ret;
}
EXPORT_SYMBOL(nobh_writepage);

int nobh_truncate_page(struct address_space *mapping,
			loff_t from, get_block_t *get_block)
{
	pgoff_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	unsigned blocksize;
	sector_t iblock;
	unsigned length, pos;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head map_bh;
	int err;

	blocksize = 1 << inode->i_blkbits;
	length = offset & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;

	length = blocksize - length;
	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);

	page = grab_cache_page(mapping, index);
	err = -ENOMEM;
	if (!page)
		goto out;

	if (page_has_buffers(page)) {
has_buffers:
		unlock_page(page);
		page_cache_release(page);
		return block_truncate_page(mapping, from, get_block);
	}

	/* Find the buffer that contains "offset" */
	pos = blocksize;
	while (offset >= pos) {
		iblock++;
		pos += blocksize;
	}

	map_bh.b_size = blocksize;
	map_bh.b_state = 0;
	err = get_block(inode, iblock, &map_bh, 0);
	if (err)
		goto unlock;
	/* unmapped? It's a hole - nothing to do */
	if (!buffer_mapped(&map_bh))
		goto unlock;

	/* Ok, it's mapped. Make sure it's up-to-date */
	if (!PageUptodate(page)) {
		err = mapping->a_ops->readpage(NULL, page);
		if (err) {
			page_cache_release(page);
			goto out;
		}
		lock_page(page);
		if (!PageUptodate(page)) {
			err = -EIO;
			goto unlock;
		}
		if (page_has_buffers(page))
			goto has_buffers;
	}
	zero_user(page, offset, length);
	set_page_dirty(page);
	err = 0;

unlock:
	unlock_page(page);
	page_cache_release(page);
out:
	return err;
}
EXPORT_SYMBOL(nobh_truncate_page);

int block_truncate_page(struct address_space *mapping,
			loff_t from, get_block_t *get_block)
{
	pgoff_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	unsigned blocksize;
	sector_t iblock;
	unsigned length, pos;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head *bh;
	int err;

	blocksize = 1 << inode->i_blkbits;
	length = offset & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;

	length = blocksize - length;
	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);

	page = grab_cache_page(mapping, index);
	err = -ENOMEM;
	if (!page)
		goto out;

	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);

	/* Find the buffer that contains "offset" */
	bh = page_buffers(page);
	pos = blocksize;
	while (offset >= pos) {
		bh = bh->b_this_page;
		iblock++;
		pos += blocksize;
	}

	err = 0;
	if (!buffer_mapped(bh)) {
		WARN_ON(bh->b_size != blocksize);
		err = get_block(inode, iblock, bh, 0);
		if (err)
			goto unlock;
		/* unmapped? It's a hole - nothing to do */
		if (!buffer_mapped(bh))
			goto unlock;
	}

	/* Ok, it's mapped. Make sure it's up-to-date */
	if (PageUptodate(page))
		set_buffer_uptodate(bh);

	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
		err = -EIO;
		ll_rw_block(READ, 1, &bh);
		wait_on_buffer(bh);
		/* Uhhuh. Read error. Complain and punt. */
		if (!buffer_uptodate(bh))
			goto unlock;
	}

	zero_user(page, offset, length);
	mark_buffer_dirty(bh);
	err = 0;

unlock:
	unlock_page(page);
	page_cache_release(page);
out:
	return err;
}
EXPORT_SYMBOL(block_truncate_page);

/*
 * The generic ->writepage function for buffer-backed address_spaces;
 * this form passes in the end_io handler used to finish the IO.
 */
int block_write_full_page_endio(struct page *page, get_block_t *get_block,
			struct writeback_control *wbc, bh_end_io_t *handler)
{
	struct inode * const inode = page->mapping->host;
	loff_t i_size = i_size_read(inode);
	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
	unsigned offset;

	/* Is the page fully inside i_size? */
	if (page->index < end_index)
		return __block_write_full_page(inode, page, get_block, wbc,
					       handler);

	/* Is the page fully outside i_size? (truncate in progress) */
	offset = i_size & (PAGE_CACHE_SIZE-1);
	if (page->index >= end_index+1 || !offset) {
		/*
		 * The page may have dirty, unmapped buffers.  For example,
		 * they may have been added in ext3_writepage().  Make them
		 * freeable here, so the page does not leak.
		 */
		do_invalidatepage(page, 0);
		unlock_page(page);
		return 0; /* don't care */
	}

	/*
	 * The page straddles i_size.  It must be zeroed out on each and every
	 * writepage invocation because it may be mmapped.  "A file is mapped
	 * in multiples of the page size.  For a file that is not a multiple of
	 * the page size, the remaining memory is zeroed when mapped, and
	 * writes to that region are not written out to the file."
	 */
	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
	return __block_write_full_page(inode, page, get_block, wbc, handler);
}
EXPORT_SYMBOL(block_write_full_page_endio);

/*
 * The generic ->writepage function for buffer-backed address_spaces
 */
int block_write_full_page(struct page *page, get_block_t *get_block,
			struct writeback_control *wbc)
{
	return block_write_full_page_endio(page, get_block, wbc,
					   end_buffer_async_write);
}
EXPORT_SYMBOL(block_write_full_page);

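/*
 * Illustrative sketch (not part of this file): the usual ->writepage
 * wrapper for the hypothetical "myfs":
 *
 *	static int myfs_writepage(struct page *page,
 *				  struct writeback_control *wbc)
 *	{
 *		return block_write_full_page(page, myfs_get_block, wbc);
 *	}
 *
 * Filesystems that need a custom I/O completion step call
 * block_write_full_page_endio() directly and supply their own
 * bh_end_io_t instead of end_buffer_async_write.
 */
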
sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
			    get_block_t *get_block)
{
	struct buffer_head tmp;
	struct inode *inode = mapping->host;
	tmp.b_state = 0;
	tmp.b_blocknr = 0;
	tmp.b_size = 1 << inode->i_blkbits;
	get_block(inode, block, &tmp, 0);
	return tmp.b_blocknr;
}
EXPORT_SYMBOL(generic_block_bmap);

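/*
 * Illustrative sketch (not part of this file): ->bmap is another
 * one-liner on top of this helper ("myfs" is hypothetical):
 *
 *	static sector_t myfs_bmap(struct address_space *mapping,
 *				  sector_t block)
 *	{
 *		return generic_block_bmap(mapping, block, myfs_get_block);
 *	}
 *
 * A hole maps to block 0 here, since get_block() is called with
 * create == 0 and b_blocknr was pre-initialised to 0.
 */
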
static void end_bio_bh_io_sync(struct bio *bio, int err)
{
	struct buffer_head *bh = bio->bi_private;

	if (err == -EOPNOTSUPP) {
		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
	}

	if (unlikely(test_bit(BIO_QUIET, &bio->bi_flags)))
		set_bit(BH_Quiet, &bh->b_state);

	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
	bio_put(bio);
}

int submit_bh(int rw, struct buffer_head *bh)
{
	struct bio *bio;
	int ret = 0;

	BUG_ON(!buffer_locked(bh));
	BUG_ON(!buffer_mapped(bh));
	BUG_ON(!bh->b_end_io);
	BUG_ON(buffer_delay(bh));
	BUG_ON(buffer_unwritten(bh));

	/*
	 * Only clear out a write error when rewriting
	 */
	if (test_set_buffer_req(bh) && (rw & WRITE))
		clear_buffer_write_io_error(bh);

	/*
	 * from here on down, it's all bio -- do the initial mapping,
	 * submit_bio -> generic_make_request may further map this bio around
	 */
	bio = bio_alloc(GFP_NOIO, 1);

	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	bio->bi_io_vec[0].bv_page = bh->b_page;
	bio->bi_io_vec[0].bv_len = bh->b_size;
	bio->bi_io_vec[0].bv_offset = bh_offset(bh);

	bio->bi_vcnt = 1;
	bio->bi_idx = 0;
	bio->bi_size = bh->b_size;

	bio->bi_end_io = end_bio_bh_io_sync;
	bio->bi_private = bh;

	bio_get(bio);
	submit_bio(rw, bio);

	if (bio_flagged(bio, BIO_EOPNOTSUPP))
		ret = -EOPNOTSUPP;

	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL(submit_bh);

/**
 * ll_rw_block: low-level access to block devices (DEPRECATED)
 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
 * @nr: number of &struct buffer_heads in the array
 * @bhs: array of pointers to &struct buffer_head
 *
 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
 * requests an I/O operation on them, either a %READ or a %WRITE.  The third
 * %READA option is described in the documentation for generic_make_request()
 * which ll_rw_block() calls.
 *
 * This function drops any buffer that it cannot get a lock on (with the
 * BH_Lock state bit), any buffer that appears to be clean when doing a write
 * request, and any buffer that appears to be up-to-date when doing a read
 * request.  Further it marks as clean buffers that are processed for
 * writing (the buffer cache won't assume that they are actually clean
 * until the buffer gets unlocked).
 *
 * ll_rw_block sets b_end_io to a simple completion handler that marks
 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
 * any waiters.
 *
 * All of the buffers must be for the same device, and must also be a
 * multiple of the current approved size for the device.
 */
void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
{
	int i;

	for (i = 0; i < nr; i++) {
		struct buffer_head *bh = bhs[i];

		if (!trylock_buffer(bh))
			continue;
		if (rw == WRITE) {
			if (test_clear_buffer_dirty(bh)) {
				bh->b_end_io = end_buffer_write_sync;
				get_bh(bh);
				submit_bh(WRITE, bh);
				continue;
			}
		} else {
			if (!buffer_uptodate(bh)) {
				bh->b_end_io = end_buffer_read_sync;
				get_bh(bh);
				submit_bh(rw, bh);
				continue;
			}
		}
		unlock_buffer(bh);
	}
}
EXPORT_SYMBOL(ll_rw_block);

void write_dirty_buffer(struct buffer_head *bh, int rw)
{
	lock_buffer(bh);
	if (!test_clear_buffer_dirty(bh)) {
		unlock_buffer(bh);
		return;
	}
	bh->b_end_io = end_buffer_write_sync;
	get_bh(bh);
	submit_bh(rw, bh);
}
EXPORT_SYMBOL(write_dirty_buffer);

/*
 * For a data-integrity writeout, we need to wait upon any in-progress I/O
 * and then start new I/O and then wait upon it.  The caller must have a ref on
 * the buffer_head.
 */
int __sync_dirty_buffer(struct buffer_head *bh, int rw)
{
	int ret = 0;

	WARN_ON(atomic_read(&bh->b_count) < 1);
	lock_buffer(bh);
	if (test_clear_buffer_dirty(bh)) {
		get_bh(bh);
		bh->b_end_io = end_buffer_write_sync;
		ret = submit_bh(rw, bh);
		wait_on_buffer(bh);
		if (!ret && !buffer_uptodate(bh))
			ret = -EIO;
	} else {
		unlock_buffer(bh);
	}
	return ret;
}
EXPORT_SYMBOL(__sync_dirty_buffer);

int sync_dirty_buffer(struct buffer_head *bh)
{
	return __sync_dirty_buffer(bh, WRITE_SYNC);
}
EXPORT_SYMBOL(sync_dirty_buffer);

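/*
 * Illustrative sketch (not part of this file): the classic pattern for
 * synchronously updating a single metadata block, in the style of a
 * simple filesystem's superblock writeback.  MYFS_SUPER_BLOCK_NR and
 * new_super are hypothetical names:
 *
 *	struct buffer_head *bh = sb_bread(sb, MYFS_SUPER_BLOCK_NR);
 *	if (!bh)
 *		return -EIO;
 *	memcpy(bh->b_data, new_super, sizeof(*new_super));
 *	mark_buffer_dirty(bh);
 *	err = sync_dirty_buffer(bh);
 *	brelse(bh);
 *
 * sync_dirty_buffer() takes the buffer lock itself and waits for the
 * write to complete, so the caller only needs to hold a reference.
 */
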
/*
 * try_to_free_buffers() checks if all the buffers on this particular page
 * are unused, and releases them if so.
 *
 * Exclusion against try_to_free_buffers may be obtained by either
 * locking the page or by holding its mapping's private_lock.
 *
 * If the page is dirty but all the buffers are clean then we need to
 * be sure to mark the page clean as well.  This is because the page
 * may be against a block device, and a later reattachment of buffers
 * to a dirty page will set *all* buffers dirty.  Which would corrupt
 * filesystem data on the same device.
 *
 * The same applies to regular filesystem pages: if all the buffers are
 * clean then we set the page clean and proceed.  To do that, we require
 * total exclusion from __set_page_dirty_buffers().  That is obtained with
 * private_lock.
 *
 * try_to_free_buffers() is non-blocking.
 */
static inline int buffer_busy(struct buffer_head *bh)
{
	return atomic_read(&bh->b_count) |
		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}

static int
drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
{
	struct buffer_head *head = page_buffers(page);
	struct buffer_head *bh;

	bh = head;
	do {
		if (buffer_write_io_error(bh) && page->mapping)
			set_bit(AS_EIO, &page->mapping->flags);
		if (buffer_busy(bh))
			goto failed;
		bh = bh->b_this_page;
	} while (bh != head);

	do {
		struct buffer_head *next = bh->b_this_page;

		if (bh->b_assoc_map)
			__remove_assoc_queue(bh);
		bh = next;
	} while (bh != head);
	*buffers_to_free = head;
	__clear_page_buffers(page);
	return 1;
failed:
	return 0;
}

int try_to_free_buffers(struct page *page)
{
	struct address_space * const mapping = page->mapping;
	struct buffer_head *buffers_to_free = NULL;
	int ret = 0;

	BUG_ON(!PageLocked(page));
	if (PageWriteback(page))
		return 0;

	if (mapping == NULL) {		/* can this still happen? */
		ret = drop_buffers(page, &buffers_to_free);
		goto out;
	}

	spin_lock(&mapping->private_lock);
	ret = drop_buffers(page, &buffers_to_free);

	/*
	 * If the filesystem writes its buffers by hand (eg ext3)
	 * then we can have clean buffers against a dirty page.  We
	 * clean the page here; otherwise the VM will never notice
	 * that the filesystem did any IO at all.
	 *
	 * Also, during truncate, discard_buffer will have marked all
	 * the page's buffers clean.  We discover that here and clean
	 * the page also.
	 *
	 * private_lock must be held over this entire operation in order
	 * to synchronise against __set_page_dirty_buffers and prevent the
	 * dirty bit from being lost.
	 */
	if (ret)
		cancel_dirty_page(page, PAGE_CACHE_SIZE);
	spin_unlock(&mapping->private_lock);
out:
	if (buffers_to_free) {
		struct buffer_head *bh = buffers_to_free;

		do {
			struct buffer_head *next = bh->b_this_page;
			free_buffer_head(bh);
			bh = next;
		} while (bh != buffers_to_free);
	}
	return ret;
}
EXPORT_SYMBOL(try_to_free_buffers);

/*
 * There are no bdflush tunables left.  But distributions are
 * still running obsolete flush daemons, so we terminate them here.
 *
 * Use of bdflush() is deprecated and will be removed in a future kernel.
 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
 */
SYSCALL_DEFINE2(bdflush, int, func, long, data)
{
	static int msg_count;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (msg_count < 5) {
		msg_count++;
		printk(KERN_INFO
			"warning: process `%s' used the obsolete bdflush"
			" system call\n", current->comm);
		printk(KERN_INFO "Fix your initscripts?\n");
	}

	if (func == 1)
		do_exit(0);
	return 0;
}

/*
 * Buffer-head allocation
 */
static struct kmem_cache *bh_cachep;

/*
 * Once the number of bh's in the machine exceeds this level, we start
 * stripping them in writeback.
 */
static int max_buffer_heads;

int buffer_heads_over_limit;

struct bh_accounting {
	int nr;			/* Number of live bh's */
	int ratelimit;		/* Limit cacheline bouncing */
};

static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};

static void recalc_bh_state(void)
{
	int i;
	int tot = 0;

	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
		return;
	__this_cpu_write(bh_accounting.ratelimit, 0);
	for_each_online_cpu(i)
		tot += per_cpu(bh_accounting, i).nr;
	buffer_heads_over_limit = (tot > max_buffer_heads);
}

struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
{
	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
	if (ret) {
		INIT_LIST_HEAD(&ret->b_assoc_buffers);
		preempt_disable();
		__this_cpu_inc(bh_accounting.nr);
		recalc_bh_state();
		preempt_enable();
	}
	return ret;
}
EXPORT_SYMBOL(alloc_buffer_head);

void free_buffer_head(struct buffer_head *bh)
{
	BUG_ON(!list_empty(&bh->b_assoc_buffers));
	kmem_cache_free(bh_cachep, bh);
	preempt_disable();
	__this_cpu_dec(bh_accounting.nr);
	recalc_bh_state();
	preempt_enable();
}
EXPORT_SYMBOL(free_buffer_head);

static void buffer_exit_cpu(int cpu)
{
	int i;
	struct bh_lru *b = &per_cpu(bh_lrus, cpu);

	for (i = 0; i < BH_LRU_SIZE; i++) {
		brelse(b->bhs[i]);
		b->bhs[i] = NULL;
	}
	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
	per_cpu(bh_accounting, cpu).nr = 0;
}

static int buffer_cpu_notify(struct notifier_block *self,
			      unsigned long action, void *hcpu)
{
	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
		buffer_exit_cpu((unsigned long)hcpu);
	return NOTIFY_OK;
}

/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date and false,
 * with the buffer locked, if not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	if (!buffer_uptodate(bh)) {
		lock_buffer(bh);
		if (!buffer_uptodate(bh))
			return 0;
		unlock_buffer(bh);
	}
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);

/**
 * bh_submit_read - Submit a locked buffer for reading
 * @bh: struct buffer_head
 *
 * Returns zero on success and -EIO on error.
 */
int bh_submit_read(struct buffer_head *bh)
{
	BUG_ON(!buffer_locked(bh));

	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 0;
	}

	get_bh(bh);
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(READ, bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return 0;
	return -EIO;
}
EXPORT_SYMBOL(bh_submit_read);

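/*
 * Illustrative sketch (not part of this file): the two helpers above
 * are designed to be used together.  The common idiom for reading a
 * metadata block is:
 *
 *	if (bh_uptodate_or_lock(bh))
 *		return 0;
 *	if (bh_submit_read(bh))
 *		return -EIO;
 *	return 0;
 *
 * bh_uptodate_or_lock() returns with the buffer locked only in the
 * not-uptodate case, which is exactly the state bh_submit_read()
 * requires on entry.
 */
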
void __init buffer_init(void)
{
	int nrpages;

	bh_cachep = kmem_cache_create("buffer_head",
			sizeof(struct buffer_head), 0,
				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
				SLAB_MEM_SPREAD),
				NULL);

	/*
	 * Limit the bh occupancy to 10% of ZONE_NORMAL
	 */
	nrpages = (nr_free_buffer_pages() * 10) / 100;
	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
	hotcpu_notifier(buffer_cpu_notify, 0);
}