/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
 */

/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
 *
 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <[email protected]>
 */

#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
#include <linux/cleancache.h>

static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);

#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)

inline void
init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
{
	bh->b_end_io = handler;
	bh->b_private = private;
}
EXPORT_SYMBOL(init_buffer);

static int sleep_on_buffer(void *word)
{
	io_schedule();
	return 0;
}

void __lock_buffer(struct buffer_head *bh)
{
	wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
							TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);

void unlock_buffer(struct buffer_head *bh)
{
	clear_bit_unlock(BH_Lock, &bh->b_state);
	smp_mb__after_clear_bit();
	wake_up_bit(&bh->b_state, BH_Lock);
}
EXPORT_SYMBOL(unlock_buffer);

/*
 * Block until a buffer comes unlocked.  This doesn't stop it
 * from becoming locked again - you have to lock it yourself
 * if you want to preserve its state.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
	wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__wait_on_buffer);

static void
__clear_page_buffers(struct page *page)
{
	ClearPagePrivate(page);
	set_page_private(page, 0);
	page_cache_release(page);
}


static int quiet_error(struct buffer_head *bh)
{
	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
		return 0;
	return 1;
}


static void buffer_io_error(struct buffer_head *bh)
{
	char b[BDEVNAME_SIZE];
	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
			bdevname(bh->b_bdev, b),
			(unsigned long long)bh->b_blocknr);
}

/*
 * End-of-IO handler helper function which does not touch the bh after
 * unlocking it.
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		/* This happens, due to failed READA attempts. */
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
}

/*
 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 * unlock the buffer.  This is what ll_rw_block uses too.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);

void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
	char b[BDEVNAME_SIZE];

	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		if (!quiet_error(bh)) {
			buffer_io_error(bh);
			printk(KERN_WARNING "lost page write due to "
					"I/O error on %s\n",
				       bdevname(bh->b_bdev, b));
		}
		set_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);

/*
 * Various filesystems appear to want __find_get_block to be non-blocking.
 * But it's the page lock which protects the buffers.  To get around this,
 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 * private_lock.
 *
 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 * may be quite high.  This code could TryLock the page, and if that
 * succeeds, there is no need to take private_lock. (But if
 * private_lock is contended then so is mapping->tree_lock).
 */
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
{
	struct inode *bd_inode = bdev->bd_inode;
	struct address_space *bd_mapping = bd_inode->i_mapping;
	struct buffer_head *ret = NULL;
	pgoff_t index;
	struct buffer_head *bh;
	struct buffer_head *head;
	struct page *page;
	int all_mapped = 1;

	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
	page = find_get_page(bd_mapping, index);
	if (!page)
		goto out;

	spin_lock(&bd_mapping->private_lock);
	if (!page_has_buffers(page))
		goto out_unlock;
	head = page_buffers(page);
	bh = head;
	do {
		if (!buffer_mapped(bh))
			all_mapped = 0;
		else if (bh->b_blocknr == block) {
			ret = bh;
			get_bh(bh);
			goto out_unlock;
		}
		bh = bh->b_this_page;
	} while (bh != head);

	/* we might be here because some of the buffers on this page are
	 * not mapped.  This is due to various races between
	 * file io on the block device and getblk.  It gets dealt with
	 * elsewhere, don't buffer_error if we had some unmapped buffers
	 */
	if (all_mapped) {
		printk("__find_get_block_slow() failed. "
			"block=%llu, b_blocknr=%llu\n",
			(unsigned long long)block,
			(unsigned long long)bh->b_blocknr);
		printk("b_state=0x%08lx, b_size=%zu\n",
			bh->b_state, bh->b_size);
		printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
	}
out_unlock:
	spin_unlock(&bd_mapping->private_lock);
	page_cache_release(page);
out:
	return ret;
}
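
/*
 * Example (illustrative sketch, not part of the upstream file): the typical
 * pattern for reading a single buffer synchronously on top of
 * end_buffer_read_sync().  It mirrors what __bread_slow() does further down;
 * "example_read_bh" is a hypothetical helper shown only to make the calling
 * convention explicit.
 */
static int __maybe_unused example_read_bh(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 0;
	}
	get_bh(bh);			/* reference dropped by end_buffer_read_sync() */
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(READ, bh);
	wait_on_buffer(bh);
	return buffer_uptodate(bh) ? 0 : -EIO;
}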

/* If invalidate_buffers() will trash dirty buffers, it means some kind
   of fs corruption is going on. Trashing dirty data always implies losing
   information that was supposed to be just stored on the physical layer
   by the user.

   Thus invalidate_buffers in general usage is not allowed to trash
   dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
   be preserved.  These buffers are simply skipped.

   We also skip buffers which are still in use.  For example this can
   happen if a userspace program is reading the block device.

   NOTE: In the case where the user removed a removable-media disk even if
   there's still dirty data not synced on disk (due to a bug in the device
   driver or to a user error), by not destroying the dirty buffers we could
   generate corruption also on the next media inserted, thus a parameter is
   necessary to handle this case in the safest way possible (trying not to
   corrupt the newly inserted disk with data belonging to the old, now
   corrupted, disk). Also for the ramdisk the natural thing to do in order
   to release the ramdisk memory is to destroy dirty buffers.

   These are two special cases. Normal usage implies that the device driver
   issues a sync on the device (without waiting for I/O completion) and
   then an invalidate_buffers call that doesn't trash dirty buffers.

   For handling cache coherency with the blkdev pagecache the 'update' case
   has been introduced. It is needed to re-read from disk any pinned
   buffer. NOTE: re-reading from disk is destructive so we can do it only
   when we assume nobody is changing the buffercache under our I/O and when
   we think the disk contains more recent information than the buffercache.
   The update == 1 pass marks the buffers we need to update, the update == 2
   pass does the actual I/O. */
void invalidate_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages == 0)
		return;

	invalidate_bh_lrus();
	lru_add_drain_all();	/* make sure all lru add caches are flushed */
	invalidate_mapping_pages(mapping, 0, -1);
	/* 99% of the time, we don't need to flush the cleancache on the bdev.
	 * But, for the strange corners, let's be cautious
	 */
	cleancache_flush_inode(mapping);
}
EXPORT_SYMBOL(invalidate_bdev);

/*
 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
 */
static void free_more_memory(void)
{
	struct zone *zone;
	int nid;

	wakeup_flusher_threads(1024);
	yield();

	for_each_online_node(nid) {
		(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
						gfp_zone(GFP_NOFS), NULL,
						&zone);
		if (zone)
			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
						GFP_NOFS, NULL);
	}
}

/*
 * I/O completion handler for block_read_full_page() - pages
 * which come unlocked at the end of I/O.
 */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct page *page;
	int page_uptodate = 1;

	BUG_ON(!buffer_async_read(bh));

	page = bh->b_page;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		clear_buffer_uptodate(bh);
		if (!quiet_error(bh))
			buffer_io_error(bh);
		SetPageError(page);
	}

	/*
	 * Be _very_ careful from here on.  Bad things can happen if
	 * two buffer heads end IO at almost the same time and both
	 * decide that the page is now completely done.
	 */
	first = page_buffers(page);
	local_irq_save(flags);
	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
	clear_buffer_async_read(bh);
	unlock_buffer(bh);
	tmp = bh;
	do {
		if (!buffer_uptodate(tmp))
			page_uptodate = 0;
		if (buffer_async_read(tmp)) {
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	} while (tmp != bh);
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);

	/*
	 * If none of the buffers had errors and they are all
	 * uptodate then we can set the page uptodate.
	 */
	if (page_uptodate && !PageError(page))
		SetPageUptodate(page);
	unlock_page(page);
	return;

still_busy:
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	return;
}

/*
 * Completion handler for block_write_full_page() - pages which are unlocked
 * during I/O, and which have PageWriteback cleared upon I/O completion.
 */
void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
	char b[BDEVNAME_SIZE];
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct page *page;

	BUG_ON(!buffer_async_write(bh));

	page = bh->b_page;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		if (!quiet_error(bh)) {
			buffer_io_error(bh);
			printk(KERN_WARNING "lost page write due to "
					"I/O error on %s\n",
			       bdevname(bh->b_bdev, b));
		}
		set_bit(AS_EIO, &page->mapping->flags);
		set_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
		SetPageError(page);
	}

	first = page_buffers(page);
	local_irq_save(flags);
	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);

	clear_buffer_async_write(bh);
	unlock_buffer(bh);
	tmp = bh->b_this_page;
	while (tmp != bh) {
		if (buffer_async_write(tmp)) {
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	}
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	end_page_writeback(page);
	return;

still_busy:
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	return;
}
EXPORT_SYMBOL(end_buffer_async_write);

/*
 * If a page's buffers are under async read-in (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed.  This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone from starting new async I/O reads against
 * any of the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
	bh->b_end_io = end_buffer_async_read;
	set_buffer_async_read(bh);
}

static void mark_buffer_async_write_endio(struct buffer_head *bh,
					  bh_end_io_t *handler)
{
	bh->b_end_io = handler;
	set_buffer_async_write(bh);
}

void mark_buffer_async_write(struct buffer_head *bh)
{
	mark_buffer_async_write_endio(bh, end_buffer_async_write);
}
EXPORT_SYMBOL(mark_buffer_async_write);


/*
 * fs/buffer.c contains helper functions for buffer-backed address space's
 * fsync functions.  A common requirement for buffer-based filesystems is
 * that certain data from the backing blockdev needs to be written out for
 * a successful fsync().  For example, ext2 indirect blocks need to be
 * written back and waited upon before fsync() returns.
 *
 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 * management of a list of dependent buffers at ->i_mapping->private_list.
 *
 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 * from their controlling inode's queue when they are being freed.  But
 * try_to_free_buffers() will be operating against the *blockdev* mapping
 * at the time, not against the S_ISREG file which depends on those buffers.
 * So the locking for private_list is via the private_lock in the address_space
 * which backs the buffers.  Which is different from the address_space
 * against which the buffers are listed.  So for a particular address_space,
 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 * mapping->private_list will always be protected by the backing blockdev's
 * ->private_lock.
 *
 * Which introduces a requirement: all buffers on an address_space's
 * ->private_list must be from the same address_space: the blockdev's.
 *
 * address_spaces which do not place buffers at ->private_list via these
 * utility functions are free to use private_lock and private_list for
 * whatever they want.  The only requirement is that list_empty(private_list)
 * be true at clear_inode() time.
 *
 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 * filesystems should do that.  invalidate_inode_buffers() should just go
 * BUG_ON(!list_empty).
 *
 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 * take an address_space, not an inode.  And it should be called
 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 * queued up.
 *
 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 * list if it is already on a list.  Because if the buffer is on a list,
 * it *must* already be on the right one.  If not, the filesystem is being
 * silly.  This will save a ton of locking.  But first we have to ensure
 * that buffers are taken *off* the old inode's list when they are freed
 * (presumably in truncate).  That requires careful auditing of all
 * filesystems (do it inside bforget()).  It could also be done by bringing
 * b_inode back.
 */

/*
 * The buffer's backing address_space's private_lock must be held
 */
static void __remove_assoc_queue(struct buffer_head *bh)
{
	list_del_init(&bh->b_assoc_buffers);
	WARN_ON(!bh->b_assoc_map);
	if (buffer_write_io_error(bh))
		set_bit(AS_EIO, &bh->b_assoc_map->flags);
	bh->b_assoc_map = NULL;
}

int inode_has_buffers(struct inode *inode)
{
	return !list_empty(&inode->i_data.private_list);
}

/*
 * osync is designed to support O_SYNC io.  It waits synchronously for
 * all already-submitted IO to complete, but does not queue any new
 * writes to the disk.
 *
 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 * you dirty the buffers, and then use osync_inode_buffers to wait for
 * completion.  Any other dirty buffers which are not yet queued for
 * write will not be flushed to disk by the osync.
 */
static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
{
	struct buffer_head *bh;
	struct list_head *p;
	int err = 0;

	spin_lock(lock);
repeat:
	list_for_each_prev(p, list) {
		bh = BH_ENTRY(p);
		if (buffer_locked(bh)) {
			get_bh(bh);
			spin_unlock(lock);
			wait_on_buffer(bh);
			if (!buffer_uptodate(bh))
				err = -EIO;
			brelse(bh);
			spin_lock(lock);
			goto repeat;
		}
	}
	spin_unlock(lock);
	return err;
}

static void do_thaw_one(struct super_block *sb, void *unused)
{
	char b[BDEVNAME_SIZE];
	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
		printk(KERN_WARNING "Emergency Thaw on %s\n",
		       bdevname(sb->s_bdev, b));
}

static void do_thaw_all(struct work_struct *work)
{
	iterate_supers(do_thaw_one, NULL);
	kfree(work);
	printk(KERN_WARNING "Emergency Thaw complete\n");
}

/**
 * emergency_thaw_all -- forcibly thaw every frozen filesystem
 *
 * Used for emergency unfreeze of all filesystems via SysRq
 */
void emergency_thaw_all(void)
{
	struct work_struct *work;

	work = kmalloc(sizeof(*work), GFP_ATOMIC);
	if (work) {
		INIT_WORK(work, do_thaw_all);
		schedule_work(work);
	}
}

/**
 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 * @mapping: the mapping which wants those buffers written
 *
 * Starts I/O against the buffers at mapping->private_list, and waits upon
 * that I/O.
 *
 * Basically, this is a convenience function for fsync().
 * @mapping is a file or directory which needs those buffers to be written for
 * a successful fsync().
 */
int sync_mapping_buffers(struct address_space *mapping)
{
	struct address_space *buffer_mapping = mapping->assoc_mapping;

	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
		return 0;

	return fsync_buffers_list(&buffer_mapping->private_lock,
					&mapping->private_list);
}
EXPORT_SYMBOL(sync_mapping_buffers);
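
/*
 * Example (illustrative sketch, not part of the upstream file): how a
 * filesystem typically pairs mark_buffer_dirty_inode() with
 * sync_mapping_buffers().  Metadata buffers (e.g. indirect blocks) are
 * queued on the inode's ->private_list as they are dirtied, and the fsync
 * path then writes out and waits on that list.  "example_fsync_metadata"
 * is hypothetical.
 */
static int __maybe_unused example_fsync_metadata(struct inode *inode,
						 struct buffer_head *metadata_bh)
{
	/* while modifying metadata: associate the dirty buffer with the inode */
	mark_buffer_dirty_inode(metadata_bh, inode);

	/* later, in ->fsync(): write and wait upon the associated buffers */
	return sync_mapping_buffers(inode->i_mapping);
}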

/*
 * Called when we've recently written block `bblock', and it is known that
 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 * dirty, schedule it for IO, so that indirects merge nicely with their data.
 */
void write_boundary_block(struct block_device *bdev,
			sector_t bblock, unsigned blocksize)
{
	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
	if (bh) {
		if (buffer_dirty(bh))
			ll_rw_block(WRITE, 1, &bh);
		put_bh(bh);
	}
}

void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
{
	struct address_space *mapping = inode->i_mapping;
	struct address_space *buffer_mapping = bh->b_page->mapping;

	mark_buffer_dirty(bh);
	if (!mapping->assoc_mapping) {
		mapping->assoc_mapping = buffer_mapping;
	} else {
		BUG_ON(mapping->assoc_mapping != buffer_mapping);
	}
	if (!bh->b_assoc_map) {
		spin_lock(&buffer_mapping->private_lock);
		list_move_tail(&bh->b_assoc_buffers,
				&mapping->private_list);
		bh->b_assoc_map = mapping;
		spin_unlock(&buffer_mapping->private_lock);
	}
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);

/*
 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
 * dirty.
 *
 * If warn is true, then emit a warning if the page is not uptodate and has
 * not been truncated.
 */
static void __set_page_dirty(struct page *page,
		struct address_space *mapping, int warn)
{
	spin_lock_irq(&mapping->tree_lock);
	if (page->mapping) {	/* Race with truncate? */
		WARN_ON_ONCE(warn && !PageUptodate(page));
		account_page_dirtied(page, mapping);
		radix_tree_tag_set(&mapping->page_tree,
				page_index(page), PAGECACHE_TAG_DIRTY);
	}
	spin_unlock_irq(&mapping->tree_lock);
	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}

/*
 * Add a page to the dirty page list.
 *
 * It is a sad fact of life that this function is called from several places
 * deeply under spinlocking.  It may not sleep.
 *
 * If the page has buffers, the uptodate buffers are set dirty, to preserve
 * dirty-state coherency between the page and the buffers.  If the page does
 * not have buffers then when they are later attached they will all be set
 * dirty.
 *
 * The buffers are dirtied before the page is dirtied.  There's a small race
 * window in which a writepage caller may see the page cleanness but not the
 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 * before the buffers, a concurrent writepage caller could clear the page dirty
 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 * page on the dirty page list.
 *
 * We use private_lock to lock against try_to_free_buffers while using the
 * page's buffer list.  Also use this to protect against clean buffers being
 * added to the page after it was set dirty.
 *
 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 * address_space though.
 */
int __set_page_dirty_buffers(struct page *page)
{
	int newly_dirty;
	struct address_space *mapping = page_mapping(page);

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			set_buffer_dirty(bh);
			bh = bh->b_this_page;
		} while (bh != head);
	}
	newly_dirty = !TestSetPageDirty(page);
	spin_unlock(&mapping->private_lock);

	if (newly_dirty)
		__set_page_dirty(page, mapping, 1);
	return newly_dirty;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);

/*
 * Write out and wait upon a list of buffers.
 *
 * We have conflicting pressures: we want to make sure that all
 * initially dirty buffers get waited on, but that any subsequently
 * dirtied buffers don't.  After all, we don't want fsync to last
 * forever if somebody is actively writing to the file.
 *
 * Do this in two main stages: first we copy dirty buffers to a
 * temporary inode list, queueing the writes as we go.  Then we clean
 * up, waiting for those writes to complete.
 *
 * During this second stage, any subsequent updates to the file may end
 * up refiling the buffer on the original inode's dirty list again, so
 * there is a chance we will end up with a buffer queued for write but
 * not yet completed on that list.  So, as a final cleanup we go through
 * the osync code to catch these locked, dirty buffers without requeuing
 * any newly dirty buffers for write.
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
	struct buffer_head *bh;
	struct list_head tmp;
	struct address_space *mapping;
	int err = 0, err2;
	struct blk_plug plug;

	INIT_LIST_HEAD(&tmp);
	blk_start_plug(&plug);

	spin_lock(lock);
	while (!list_empty(list)) {
		bh = BH_ENTRY(list->next);
		mapping = bh->b_assoc_map;
		__remove_assoc_queue(bh);
		/* Avoid race with mark_buffer_dirty_inode() which does
		 * a lockless check and we rely on seeing the dirty bit */
		smp_mb();
		if (buffer_dirty(bh) || buffer_locked(bh)) {
			list_add(&bh->b_assoc_buffers, &tmp);
			bh->b_assoc_map = mapping;
			if (buffer_dirty(bh)) {
				get_bh(bh);
				spin_unlock(lock);
				/*
				 * Ensure any pending I/O completes so that
				 * write_dirty_buffer() actually writes the
				 * current contents - it is a noop if I/O is
				 * still in flight on potentially older
				 * contents.
				 */
				write_dirty_buffer(bh, WRITE_SYNC);

				/*
				 * Kick off IO for the previous mapping. Note
				 * that we will not run the very last mapping,
				 * wait_on_buffer() will do that for us
				 * through sync_buffer().
				 */
				brelse(bh);
				spin_lock(lock);
			}
		}
	}

	spin_unlock(lock);
	blk_finish_plug(&plug);
	spin_lock(lock);

	while (!list_empty(&tmp)) {
		bh = BH_ENTRY(tmp.prev);
		get_bh(bh);
		mapping = bh->b_assoc_map;
		__remove_assoc_queue(bh);
		/* Avoid race with mark_buffer_dirty_inode() which does
		 * a lockless check and we rely on seeing the dirty bit */
		smp_mb();
		if (buffer_dirty(bh)) {
			list_add(&bh->b_assoc_buffers,
				 &mapping->private_list);
			bh->b_assoc_map = mapping;
		}
		spin_unlock(lock);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh))
			err = -EIO;
		brelse(bh);
		spin_lock(lock);
	}

	spin_unlock(lock);
	err2 = osync_buffers_list(lock, list);
	if (err)
		return err;
	else
		return err2;
}

/*
 * Invalidate any and all dirty buffers on a given inode.  We are
 * probably unmounting the fs, but that doesn't mean we have already
 * done a sync().  Just drop the buffers from the inode list.
 *
 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 * assumes that all the buffers are against the blockdev.  Not true
 * for reiserfs.
 */
void invalidate_inode_buffers(struct inode *inode)
{
	if (inode_has_buffers(inode)) {
		struct address_space *mapping = &inode->i_data;
		struct list_head *list = &mapping->private_list;
		struct address_space *buffer_mapping = mapping->assoc_mapping;

		spin_lock(&buffer_mapping->private_lock);
		while (!list_empty(list))
			__remove_assoc_queue(BH_ENTRY(list->next));
		spin_unlock(&buffer_mapping->private_lock);
	}
}
EXPORT_SYMBOL(invalidate_inode_buffers);

/*
 * Remove any clean buffers from the inode's buffer list.  This is called
 * when we're trying to free the inode itself.  Those buffers can pin it.
 *
 * Returns true if all buffers were removed.
 */
int remove_inode_buffers(struct inode *inode)
{
	int ret = 1;

	if (inode_has_buffers(inode)) {
		struct address_space *mapping = &inode->i_data;
		struct list_head *list = &mapping->private_list;
		struct address_space *buffer_mapping = mapping->assoc_mapping;

		spin_lock(&buffer_mapping->private_lock);
		while (!list_empty(list)) {
			struct buffer_head *bh = BH_ENTRY(list->next);
			if (buffer_dirty(bh)) {
				ret = 0;
				break;
			}
			__remove_assoc_queue(bh);
		}
		spin_unlock(&buffer_mapping->private_lock);
	}
	return ret;
}

/*
 * Create the appropriate buffers when given a page for data area and
 * the size of each buffer.. Use the bh->b_this_page linked list to
 * follow the buffers created.  Return NULL if unable to create more
 * buffers.
 *
 * The retry flag is used to differentiate async IO (paging, swapping)
 * which may not fail from ordinary buffer allocations.
 */
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
		int retry)
{
	struct buffer_head *bh, *head;
	long offset;

try_again:
	head = NULL;
	offset = PAGE_SIZE;
	while ((offset -= size) >= 0) {
		bh = alloc_buffer_head(GFP_NOFS);
		if (!bh)
			goto no_grow;

		bh->b_bdev = NULL;
		bh->b_this_page = head;
		bh->b_blocknr = -1;
		head = bh;

		bh->b_state = 0;
		atomic_set(&bh->b_count, 0);
		bh->b_size = size;

		/* Link the buffer to its page */
		set_bh_page(bh, page, offset);

		init_buffer(bh, NULL, NULL);
	}
	return head;
/*
 * In case anything failed, we just free everything we got.
 */
no_grow:
	if (head) {
		do {
			bh = head;
			head = head->b_this_page;
			free_buffer_head(bh);
		} while (head);
	}

	/*
	 * Return failure for non-async IO requests.  Async IO requests
	 * are not allowed to fail, so we have to wait until buffer heads
	 * become available.  But we don't want tasks sleeping with
	 * partially complete buffers, so all were released above.
	 */
	if (!retry)
		return NULL;

	/* We're _really_ low on memory. Now we just
	 * wait for old buffer heads to become free due to
	 * finishing IO.  Since this is an async request and
	 * the reserve list is empty, we're sure there are
	 * async buffer heads in use.
	 */
	free_more_memory();
	goto try_again;
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);

static inline void
link_dev_buffers(struct page *page, struct buffer_head *head)
{
	struct buffer_head *bh, *tail;

	bh = head;
	do {
		tail = bh;
		bh = bh->b_this_page;
	} while (bh);
	tail->b_this_page = head;
	attach_page_buffers(page, head);
}

/*
 * Initialise the state of a blockdev page's buffers.
 */
static void
init_page_buffers(struct page *page, struct block_device *bdev,
			sector_t block, int size)
{
	struct buffer_head *head = page_buffers(page);
	struct buffer_head *bh = head;
	int uptodate = PageUptodate(page);

	do {
		if (!buffer_mapped(bh)) {
			init_buffer(bh, NULL, NULL);
			bh->b_bdev = bdev;
			bh->b_blocknr = block;
			if (uptodate)
				set_buffer_uptodate(bh);
			set_buffer_mapped(bh);
		}
		block++;
		bh = bh->b_this_page;
	} while (bh != head);
}

/*
 * Create the page-cache page that contains the requested block.
 *
 * This is used purely for blockdev mappings.
 */
static struct page *
grow_dev_page(struct block_device *bdev, sector_t block,
		pgoff_t index, int size)
{
	struct inode *inode = bdev->bd_inode;
	struct page *page;
	struct buffer_head *bh;

	page = find_or_create_page(inode->i_mapping, index,
		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
	if (!page)
		return NULL;

	BUG_ON(!PageLocked(page));

	if (page_has_buffers(page)) {
		bh = page_buffers(page);
		if (bh->b_size == size) {
			init_page_buffers(page, bdev, block, size);
			return page;
		}
		if (!try_to_free_buffers(page))
			goto failed;
	}

	/*
	 * Allocate some buffers for this page
	 */
	bh = alloc_page_buffers(page, size, 0);
	if (!bh)
		goto failed;

	/*
	 * Link the page to the buffers and initialise them.  Take the
	 * lock to be atomic wrt __find_get_block(), which does not
	 * run under the page lock.
	 */
	spin_lock(&inode->i_mapping->private_lock);
	link_dev_buffers(page, bh);
	init_page_buffers(page, bdev, block, size);
	spin_unlock(&inode->i_mapping->private_lock);
	return page;

failed:
	BUG();
	unlock_page(page);
	page_cache_release(page);
	return NULL;
}

/*
 * Create buffers for the specified block device block's page.  If
 * that page was dirty, the buffers are set dirty also.
 */
static int
grow_buffers(struct block_device *bdev, sector_t block, int size)
{
	struct page *page;
	pgoff_t index;
	int sizebits;

	sizebits = -1;
	do {
		sizebits++;
	} while ((size << sizebits) < PAGE_SIZE);

	index = block >> sizebits;

	/*
	 * Check for a block which wants to lie outside our maximum possible
	 * pagecache index.  (this comparison is done using sector_t types).
	 */
	if (unlikely(index != block >> sizebits)) {
		char b[BDEVNAME_SIZE];

		printk(KERN_ERR "%s: requested out-of-range block %llu for "
			"device %s\n",
			__func__, (unsigned long long)block,
			bdevname(bdev, b));
		return -EIO;
	}
	block = index << sizebits;
	/* Create a page with the proper size buffers.. */
	page = grow_dev_page(bdev, block, index, size);
	if (!page)
		return 0;
	unlock_page(page);
	page_cache_release(page);
	return 1;
}

static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block, int size)
{
	/* Size must be multiple of hard sectorsize */
	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
			(size < 512 || size > PAGE_SIZE))) {
		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
					size);
		printk(KERN_ERR "logical block size: %d\n",
					bdev_logical_block_size(bdev));

		dump_stack();
		return NULL;
	}

	for (;;) {
		struct buffer_head * bh;
		int ret;

		bh = __find_get_block(bdev, block, size);
		if (bh)
			return bh;

		ret = grow_buffers(bdev, block, size);
		if (ret < 0)
			return NULL;
		if (ret == 0)
			free_more_memory();
	}
}

/*
 * The relationship between dirty buffers and dirty pages:
 *
 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
 * the page is tagged dirty in its radix tree.
 *
 * At all times, the dirtiness of the buffers represents the dirtiness of
 * subsections of the page.  If the page has buffers, the page dirty bit is
 * merely a hint about the true dirty state.
 *
 * When a page is set dirty in its entirety, all its buffers are marked dirty
 * (if the page has buffers).
 *
 * When a buffer is marked dirty, its page is dirtied, but the page's other
 * buffers are not.
 *
 * Also.  When blockdev buffers are explicitly read with bread(), they
 * individually become uptodate.  But their backing page remains not
 * uptodate - even if all of its buffers are uptodate.  A subsequent
 * block_read_full_page() against that page will discover all the uptodate
 * buffers, will set the page uptodate and will perform no I/O.
 */

/**
 * mark_buffer_dirty - mark a buffer_head as needing writeout
 * @bh: the buffer_head to mark dirty
 *
 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
 * backing page dirty, then tag the page as dirty in its address_space's radix
 * tree and then attach the address_space's inode to its superblock's dirty
 * inode list.
 *
 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
 * mapping->tree_lock and mapping->host->i_lock.
 */
void mark_buffer_dirty(struct buffer_head *bh)
{
	WARN_ON_ONCE(!buffer_uptodate(bh));

	/*
	 * Very *carefully* optimize the it-is-already-dirty case.
	 *
	 * Don't let the final "is it dirty" escape to before we
	 * perhaps modified the buffer.
	 */
	if (buffer_dirty(bh)) {
		smp_mb();
		if (buffer_dirty(bh))
			return;
	}

	if (!test_set_buffer_dirty(bh)) {
		struct page *page = bh->b_page;
		if (!TestSetPageDirty(page)) {
			struct address_space *mapping = page_mapping(page);
			if (mapping)
				__set_page_dirty(page, mapping, 0);
		}
	}
}
EXPORT_SYMBOL(mark_buffer_dirty);
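
/*
 * Example (illustrative sketch, not part of the upstream file): the classic
 * read-modify-write cycle for a metadata block, following the buffer
 * dirtying rules described above.  The superblock, block number and the
 * byte update are hypothetical.
 */
static int __maybe_unused example_update_block(struct super_block *sb,
					       sector_t blocknr, unsigned byte, u8 val)
{
	struct buffer_head *bh = sb_bread(sb, blocknr);

	if (!bh)
		return -EIO;
	lock_buffer(bh);
	((u8 *)bh->b_data)[byte] = val;	/* modify the in-memory copy */
	unlock_buffer(bh);
	mark_buffer_dirty(bh);		/* dirty the buffer and its backing page */
	brelse(bh);			/* writeback picks the buffer up later */
	return 0;
}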

/*
 * Decrement a buffer_head's reference count.  If all buffers against a page
 * have zero reference count, are clean and unlocked, and if the page is clean
 * and unlocked then try_to_free_buffers() may strip the buffers from the page
 * in preparation for freeing it (sometimes, rarely, buffers are removed from
 * a page but it ends up not being freed, and buffers may later be reattached).
 */
void __brelse(struct buffer_head * buf)
{
	if (atomic_read(&buf->b_count)) {
		put_bh(buf);
		return;
	}
	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
}
EXPORT_SYMBOL(__brelse);

/*
 * bforget() is like brelse(), except it discards any
 * potentially dirty data.
 */
void __bforget(struct buffer_head *bh)
{
	clear_buffer_dirty(bh);
	if (bh->b_assoc_map) {
		struct address_space *buffer_mapping = bh->b_page->mapping;

		spin_lock(&buffer_mapping->private_lock);
		list_del_init(&bh->b_assoc_buffers);
		bh->b_assoc_map = NULL;
		spin_unlock(&buffer_mapping->private_lock);
	}
	__brelse(bh);
}
EXPORT_SYMBOL(__bforget);

static struct buffer_head *__bread_slow(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return bh;
	} else {
		get_bh(bh);
		bh->b_end_io = end_buffer_read_sync;
		submit_bh(READ, bh);
		wait_on_buffer(bh);
		if (buffer_uptodate(bh))
			return bh;
	}
	brelse(bh);
	return NULL;
}

/*
 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
 * refcount elevated by one when they're in an LRU.  A buffer can only appear
 * once in a particular CPU's LRU.  A single buffer can be present in multiple
 * CPU's LRUs at the same time.
 *
 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
 * sb_find_get_block().
 *
 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
 * a local interrupt disable for that.
 */

#define BH_LRU_SIZE	8

struct bh_lru {
	struct buffer_head *bhs[BH_LRU_SIZE];
};

static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

#ifdef CONFIG_SMP
#define bh_lru_lock()	local_irq_disable()
#define bh_lru_unlock()	local_irq_enable()
#else
#define bh_lru_lock()	preempt_disable()
#define bh_lru_unlock()	preempt_enable()
#endif

static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
	BUG_ON(irqs_disabled());
#endif
}

/*
 * The LRU management algorithm is dopey-but-simple.  Sorry.
 */
static void bh_lru_install(struct buffer_head *bh)
{
	struct buffer_head *evictee = NULL;

	check_irqs_on();
	bh_lru_lock();
	if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
		struct buffer_head *bhs[BH_LRU_SIZE];
		int in;
		int out = 0;

		get_bh(bh);
		bhs[out++] = bh;
		for (in = 0; in < BH_LRU_SIZE; in++) {
			struct buffer_head *bh2 =
				__this_cpu_read(bh_lrus.bhs[in]);

			if (bh2 == bh) {
				__brelse(bh2);
			} else {
				if (out >= BH_LRU_SIZE) {
					BUG_ON(evictee != NULL);
					evictee = bh2;
				} else {
					bhs[out++] = bh2;
				}
			}
		}
		while (out < BH_LRU_SIZE)
			bhs[out++] = NULL;
		memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
	}
	bh_lru_unlock();

	if (evictee)
		__brelse(evictee);
}

/*
 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
 */
static struct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *ret = NULL;
	unsigned int i;

	check_irqs_on();
	bh_lru_lock();
	for (i = 0; i < BH_LRU_SIZE; i++) {
		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);

		if (bh && bh->b_bdev == bdev &&
				bh->b_blocknr == block && bh->b_size == size) {
			if (i) {
				while (i) {
					__this_cpu_write(bh_lrus.bhs[i],
						__this_cpu_read(bh_lrus.bhs[i - 1]));
					i--;
				}
				__this_cpu_write(bh_lrus.bhs[0], bh);
			}
			get_bh(bh);
			ret = bh;
			break;
		}
	}
	bh_lru_unlock();
	return ret;
}

/*
 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
 * it in the LRU and mark it as accessed.  If it is not present then return
 * NULL.
 */
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);

	if (bh == NULL) {
		bh = __find_get_block_slow(bdev, block);
		if (bh)
			bh_lru_install(bh);
	}
	if (bh)
		touch_buffer(bh);
	return bh;
}
EXPORT_SYMBOL(__find_get_block);

/*
 * __getblk will locate (and, if necessary, create) the buffer_head
 * which corresponds to the passed block_device, block and size. The
 * returned buffer has its reference count incremented.
 *
 * __getblk() cannot fail - it just keeps trying.  If you pass it an
 * illegal block number, __getblk() will happily return a buffer_head
 * which represents the non-existent block.  Very weird.
 *
 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
 * attempt is failing.  FIXME, perhaps?
 */
struct buffer_head *
__getblk(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *bh = __find_get_block(bdev, block, size);

	might_sleep();
	if (bh == NULL)
		bh = __getblk_slow(bdev, block, size);
	return bh;
}
EXPORT_SYMBOL(__getblk);

/*
 * Do async read-ahead on a buffer..
 */
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);
	if (likely(bh)) {
		ll_rw_block(READA, 1, &bh);
		brelse(bh);
	}
}
EXPORT_SYMBOL(__breadahead);

/**
 *  __bread() - reads a specified block and returns the bh
 *  @bdev: the block_device to read from
 *  @block: number of block
 *  @size: size (in bytes) to read
 *
 *  Reads a specified block, and returns the buffer head that contains it.
 *  It returns NULL if the block was unreadable.
 */
struct buffer_head *
__bread(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	if (likely(bh) && !buffer_uptodate(bh))
		bh = __bread_slow(bh);
	return bh;
}
EXPORT_SYMBOL(__bread);

/*
 * invalidate_bh_lrus() is called rarely - but not only at unmount.
 * This doesn't race because it runs in each cpu either in irq
 * or with preempt disabled.
 */
static void invalidate_bh_lru(void *arg)
{
	struct bh_lru *b = &get_cpu_var(bh_lrus);
	int i;

	for (i = 0; i < BH_LRU_SIZE; i++) {
		brelse(b->bhs[i]);
		b->bhs[i] = NULL;
	}
	put_cpu_var(bh_lrus);
}

void invalidate_bh_lrus(void)
{
	on_each_cpu(invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
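
/*
 * Example (illustrative sketch, not part of the upstream file): pairing
 * __breadahead() with __bread() to prefetch a run of blocks before the one
 * that is actually needed.  The device, start block and count are
 * hypothetical.
 */
static struct buffer_head * __maybe_unused
example_read_with_readahead(struct block_device *bdev, sector_t first,
			    int count, unsigned size)
{
	sector_t block;

	for (block = first + 1; block < first + count; block++)
		__breadahead(bdev, block, size);	/* async, reference dropped */
	return __bread(bdev, first, size);		/* synchronous read */
}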

void set_bh_page(struct buffer_head *bh,
		struct page *page, unsigned long offset)
{
	bh->b_page = page;
	BUG_ON(offset >= PAGE_SIZE);
	if (PageHighMem(page))
		/*
		 * This catches illegal uses and preserves the offset:
		 */
		bh->b_data = (char *)(0 + offset);
	else
		bh->b_data = page_address(page) + offset;
}
EXPORT_SYMBOL(set_bh_page);

/*
 * Called when truncating a buffer on a page completely.
 */
static void discard_buffer(struct buffer_head * bh)
{
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	bh->b_bdev = NULL;
	clear_buffer_mapped(bh);
	clear_buffer_req(bh);
	clear_buffer_new(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);
	unlock_buffer(bh);
}

/**
 * block_invalidatepage - invalidate part or all of a buffer-backed page
 *
 * @page: the page which is affected
 * @offset: the index of the truncation point
 *
 * block_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
 *
 * block_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point.  Because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void block_invalidatepage(struct page *page, unsigned long offset)
{
	struct buffer_head *head, *bh, *next;
	unsigned int curr_off = 0;

	BUG_ON(!PageLocked(page));
	if (!page_has_buffers(page))
		goto out;

	head = page_buffers(page);
	bh = head;
	do {
		unsigned int next_off = curr_off + bh->b_size;
		next = bh->b_this_page;

		/*
		 * is this block fully invalidated?
		 */
		if (offset <= curr_off)
			discard_buffer(bh);
		curr_off = next_off;
		bh = next;
	} while (bh != head);

	/*
	 * We release buffers only if the entire page is being invalidated.
	 * The get_block cached value has been unconditionally invalidated,
	 * so real IO is not possible anymore.
	 */
	if (offset == 0)
		try_to_release_page(page, 0);
out:
	return;
}
EXPORT_SYMBOL(block_invalidatepage);

/*
 * We attach and possibly dirty the buffers atomically wrt
 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
 * is already excluded via the page lock.
 */
void create_empty_buffers(struct page *page,
			unsigned long blocksize, unsigned long b_state)
{
	struct buffer_head *bh, *head, *tail;

	head = alloc_page_buffers(page, blocksize, 1);
	bh = head;
	do {
		bh->b_state |= b_state;
		tail = bh;
		bh = bh->b_this_page;
	} while (bh);
	tail->b_this_page = head;

	spin_lock(&page->mapping->private_lock);
	if (PageUptodate(page) || PageDirty(page)) {
		bh = head;
		do {
			if (PageDirty(page))
				set_buffer_dirty(bh);
			if (PageUptodate(page))
				set_buffer_uptodate(bh);
			bh = bh->b_this_page;
		} while (bh != head);
	}
	attach_page_buffers(page, head);
	spin_unlock(&page->mapping->private_lock);
}
EXPORT_SYMBOL(create_empty_buffers);

/*
 * We are taking a block for data and we don't want any output from any
 * buffer-cache aliases starting from return from that function and
 * until the moment when something will explicitly mark the buffer
 * dirty (hopefully that will not happen until we will free that block ;-)
 * We don't even need to mark it not-uptodate - nobody can expect
 * anything from a newly allocated buffer anyway.  We used to use
 * unmap_buffer() for such invalidation, but that was wrong. We definitely
 * don't want to mark the alias unmapped, for example - it would confuse
 * anyone who might pick it with bread() afterwards...
 *
 * Also..  Note that bforget() doesn't lock the buffer.  So there can
 * be writeout I/O going on against recently-freed buffers.  We don't
 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
 * only if we really need to.  That happens here.
 */
void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
{
	struct buffer_head *old_bh;

	might_sleep();

	old_bh = __find_get_block_slow(bdev, block);
	if (old_bh) {
		clear_buffer_dirty(old_bh);
		wait_on_buffer(old_bh);
		clear_buffer_req(old_bh);
		__brelse(old_bh);
	}
}
EXPORT_SYMBOL(unmap_underlying_metadata);

/*
 * NOTE! All mapped/uptodate combinations are valid:
 *
 *	Mapped	Uptodate	Meaning
 *
 *	No	No		"unknown" - must do get_block()
 *	No	Yes		"hole" - zero-filled
 *	Yes	No		"allocated" - allocated on disk, not read in
 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
 *
 * "Dirty" is valid only with the last case (mapped+uptodate).
 */

/*
 * While block_write_full_page is writing back the dirty buffers under
 * the page lock, whoever dirtied the buffers may decide to clean them
 * again at any time.  We handle that by only looking at the buffer
 * state inside lock_buffer().
 *
 * If block_write_full_page() is called for regular writeback
 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 * locked buffer.   This only can happen if someone has written the buffer
 * directly, with submit_bh().  At the address_space level PageWriteback
 * prevents this contention from occurring.
 *
 * If block_write_full_page() is called with wbc->sync_mode ==
 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
 * causes the writes to be flagged as synchronous writes.
 */
static int __block_write_full_page(struct inode *inode, struct page *page,
			get_block_t *get_block, struct writeback_control *wbc,
			bh_end_io_t *handler)
{
	int err;
	sector_t block;
	sector_t last_block;
	struct buffer_head *bh, *head;
	const unsigned blocksize = 1 << inode->i_blkbits;
	int nr_underway = 0;
	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
			WRITE_SYNC : WRITE);

	BUG_ON(!PageLocked(page));

	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;

	if (!page_has_buffers(page)) {
		create_empty_buffers(page, blocksize,
					(1 << BH_Dirty)|(1 << BH_Uptodate));
	}

	/*
	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
	 * here, and the (potentially unmapped) buffers may become dirty at
	 * any time.  If a buffer becomes dirty here after we've inspected it
	 * then we just miss that fact, and the page stays dirty.
	 *
	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
	 * handle that here by just cleaning them.
	 */

	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
	head = page_buffers(page);
	bh = head;

	/*
	 * Get all the dirty buffers mapped to disk addresses and
	 * handle any aliases from the underlying blockdev's mapping.
	 */
	do {
		if (block > last_block) {
			/*
			 * mapped buffers outside i_size will occur, because
			 * this page can be outside i_size when there is a
			 * truncate in progress.
			 */
			/*
			 * The buffer was zeroed by block_write_full_page()
			 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
			   buffer_dirty(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = get_block(inode, block, bh, 1);
			if (err)
				goto recover;
			clear_buffer_delay(bh);
			if (buffer_new(bh)) {
				/* blockdev mappings never come here */
				clear_buffer_new(bh);
				unmap_underlying_metadata(bh->b_bdev,
							bh->b_blocknr);
			}
		}
		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	do {
		if (!buffer_mapped(bh))
			continue;
		/*
		 * If it's a fully non-blocking write attempt and we cannot
		 * lock the buffer then redirty the page.  Note that this can
		 * potentially cause a busy-wait loop from writeback threads
		 * and kswapd activity, but those code paths have their own
		 * higher-level throttling.
		 */
		if (wbc->sync_mode != WB_SYNC_NONE) {
			lock_buffer(bh);
		} else if (!trylock_buffer(bh)) {
			redirty_page_for_writepage(wbc, page);
			continue;
		}
		if (test_clear_buffer_dirty(bh)) {
			mark_buffer_async_write_endio(bh, handler);
		} else {
			unlock_buffer(bh);
		}
	} while ((bh = bh->b_this_page) != head);

	/*
	 * The page and its buffers are protected by PageWriteback(), so we can
	 * drop the bh refcounts early.
	 */
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);

	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			submit_bh(write_op, bh);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	unlock_page(page);

	err = 0;
done:
	if (nr_underway == 0) {
		/*
		 * The page was marked dirty, but the buffers were
		 * clean.  Someone wrote them back by hand with
		 * ll_rw_block/submit_bh.  A rare case.
		 */
		end_page_writeback(page);

		/*
		 * The page and buffer_heads can be released at any time from
		 * here on.
		 */
	}
	return err;

recover:
	/*
	 * ENOSPC, or some other error.  We may already have added some
	 * blocks to the file, so we need to write these out to avoid
	 * exposing stale data.
	 * The page is currently locked and not marked for writeback
	 */
	bh = head;
	/* Recovery: lock and submit the mapped buffers */
	do {
		if (buffer_mapped(bh) && buffer_dirty(bh) &&
		    !buffer_delay(bh)) {
			lock_buffer(bh);
			mark_buffer_async_write_endio(bh, handler);
		} else {
			/*
			 * The buffer may have been set dirty during
			 * attachment to a dirty page.
			 */
			clear_buffer_dirty(bh);
		}
	} while ((bh = bh->b_this_page) != head);
	SetPageError(page);
	BUG_ON(PageWriteback(page));
	mapping_set_error(page->mapping, err);
	set_page_writeback(page);
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			clear_buffer_dirty(bh);
			submit_bh(write_op, bh);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	unlock_page(page);
	goto done;
}

/*
 * If a page has any new buffers, zero them out here, and mark them uptodate
 * and dirty so they'll be written out (in order to prevent uninitialised
 * block data from leaking). And clear the new bit.
 */
void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
{
	unsigned int block_start, block_end;
	struct buffer_head *head, *bh;

	BUG_ON(!PageLocked(page));
	if (!page_has_buffers(page))
		return;

	bh = head = page_buffers(page);
	block_start = 0;
	do {
		block_end = block_start + bh->b_size;

		if (buffer_new(bh)) {
			if (block_end > from && block_start < to) {
				if (!PageUptodate(page)) {
					unsigned start, size;

					start = max(from, block_start);
					size = min(to, block_end) - start;

					zero_user(page, start, size);
					set_buffer_uptodate(bh);
				}

				clear_buffer_new(bh);
				mark_buffer_dirty(bh);
			}
		}

		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);
}
EXPORT_SYMBOL(page_zero_new_buffers);

int __block_write_begin(struct page *page, loff_t pos, unsigned len,
		get_block_t *get_block)
{
	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
	unsigned to = from + len;
	struct inode *inode = page->mapping->host;
	unsigned block_start, block_end;
	sector_t block;
	int err = 0;
	unsigned blocksize, bbits;
	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

	BUG_ON(!PageLocked(page));
	BUG_ON(from > PAGE_CACHE_SIZE);
	BUG_ON(to > PAGE_CACHE_SIZE);
	BUG_ON(from > to);

	blocksize = 1 << inode->i_blkbits;
	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);
	head = page_buffers(page);

	bbits = inode->i_blkbits;
	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);

	for(bh = head, block_start = 0; bh != head || !block_start;
	    block++, block_start=block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (PageUptodate(page)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
			}
			continue;
		}
		if (buffer_new(bh))
			clear_buffer_new(bh);
		if (!buffer_mapped(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = get_block(inode, block, bh, 1);
			if (err)
				break;
			if (buffer_new(bh)) {
				unmap_underlying_metadata(bh->b_bdev,
							bh->b_blocknr);
				if (PageUptodate(page)) {
					clear_buffer_new(bh);
					set_buffer_uptodate(bh);
					mark_buffer_dirty(bh);
					continue;
				}
				if (block_end > to || block_start < from)
					zero_user_segments(page,
						to, block_end,
						block_start, from);
				continue;
			}
		}
		if (PageUptodate(page)) {
			if (!buffer_uptodate(bh))
				set_buffer_uptodate(bh);
			continue;
		}
		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
		    !buffer_unwritten(bh) &&
		     (block_start < from || block_end > to)) {
			ll_rw_block(READ, 1, &bh);
			*wait_bh++=bh;
		}
	}
	/*
	 * If we issued read requests - let them complete.
	 */
	while(wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh))
			err = -EIO;
	}
	if (unlikely(err))
		page_zero_new_buffers(page, from, to);
	return err;
}
EXPORT_SYMBOL(__block_write_begin);

static int __block_commit_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to)
{
	unsigned block_start, block_end;
	int partial = 0;
	unsigned blocksize;
	struct buffer_head *bh, *head;

	blocksize = 1 << inode->i_blkbits;

	for(bh = head = page_buffers(page), block_start = 0;
	    bh != head || !block_start;
	    block_start=block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (!buffer_uptodate(bh))
				partial = 1;
		} else {
			set_buffer_uptodate(bh);
			mark_buffer_dirty(bh);
		}
		clear_buffer_new(bh);
	}

	/*
	 * If this is a partial write which happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' whether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (!partial)
		SetPageUptodate(page);
	return 0;
}

/*
 * block_write_begin takes care of the basic task of block allocation and
 * bringing partial write blocks uptodate first.
 *
 * The filesystem needs to handle block truncation upon failure.
 */
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
		unsigned flags, struct page **pagep, get_block_t *get_block)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *page;
	int status;

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;

	status = __block_write_begin(page, pos, len, get_block);
	if (unlikely(status)) {
		unlock_page(page);
		page_cache_release(page);
		page = NULL;
	}

	*pagep = page;
	return status;
}
EXPORT_SYMBOL(block_write_begin);

int block_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	unsigned start;

	start = pos & (PAGE_CACHE_SIZE - 1);

	if (unlikely(copied < len)) {
		/*
		 * The buffers that were written will now be uptodate, so we
		 * don't have to worry about a readpage reading them and
		 * overwriting a partial write.  However if we have encountered
However if we have encountered1988* a short write and only partially written into a buffer, it1989* will not be marked uptodate, so a readpage might come in and1990* destroy our partial write.1991*1992* Do the simplest thing, and just treat any short write to a1993* non uptodate page as a zero-length write, and force the1994* caller to redo the whole thing.1995*/1996if (!PageUptodate(page))1997copied = 0;19981999page_zero_new_buffers(page, start+copied, start+len);2000}2001flush_dcache_page(page);20022003/* This could be a short (even 0-length) commit */2004__block_commit_write(inode, page, start, start+copied);20052006return copied;2007}2008EXPORT_SYMBOL(block_write_end);20092010int generic_write_end(struct file *file, struct address_space *mapping,2011loff_t pos, unsigned len, unsigned copied,2012struct page *page, void *fsdata)2013{2014struct inode *inode = mapping->host;2015int i_size_changed = 0;20162017copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);20182019/*2020* No need to use i_size_read() here, the i_size2021* cannot change under us because we hold i_mutex.2022*2023* But it's important to update i_size while still holding page lock:2024* page writeout could otherwise come in and zero beyond i_size.2025*/2026if (pos+copied > inode->i_size) {2027i_size_write(inode, pos+copied);2028i_size_changed = 1;2029}20302031unlock_page(page);2032page_cache_release(page);20332034/*2035* Don't mark the inode dirty under page lock. First, it unnecessarily2036* makes the holding time of page lock longer. Second, it forces lock2037* ordering of page lock and transaction start for journaling2038* filesystems.2039*/2040if (i_size_changed)2041mark_inode_dirty(inode);20422043return copied;2044}2045EXPORT_SYMBOL(generic_write_end);20462047/*2048* block_is_partially_uptodate checks whether buffers within a page are2049* uptodate or not.2050*2051* Returns true if all buffers which correspond to a file portion2052* we want to read are uptodate.2053*/2054int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,2055unsigned long from)2056{2057struct inode *inode = page->mapping->host;2058unsigned block_start, block_end, blocksize;2059unsigned to;2060struct buffer_head *bh, *head;2061int ret = 1;20622063if (!page_has_buffers(page))2064return 0;20652066blocksize = 1 << inode->i_blkbits;2067to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);2068to = from + to;2069if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)2070return 0;20712072head = page_buffers(page);2073bh = head;2074block_start = 0;2075do {2076block_end = block_start + blocksize;2077if (block_end > from && block_start < to) {2078if (!buffer_uptodate(bh)) {2079ret = 0;2080break;2081}2082if (block_end >= to)2083break;2084}2085block_start = block_end;2086bh = bh->b_this_page;2087} while (bh != head);20882089return ret;2090}2091EXPORT_SYMBOL(block_is_partially_uptodate);20922093/*2094* Generic "read page" function for block devices that have the normal2095* get_block functionality. 
This is most of the block device filesystems.2096* Reads the page asynchronously --- the unlock_buffer() and2097* set/clear_buffer_uptodate() functions propagate buffer state into the2098* page struct once IO has completed.2099*/2100int block_read_full_page(struct page *page, get_block_t *get_block)2101{2102struct inode *inode = page->mapping->host;2103sector_t iblock, lblock;2104struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];2105unsigned int blocksize;2106int nr, i;2107int fully_mapped = 1;21082109BUG_ON(!PageLocked(page));2110blocksize = 1 << inode->i_blkbits;2111if (!page_has_buffers(page))2112create_empty_buffers(page, blocksize, 0);2113head = page_buffers(page);21142115iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);2116lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;2117bh = head;2118nr = 0;2119i = 0;21202121do {2122if (buffer_uptodate(bh))2123continue;21242125if (!buffer_mapped(bh)) {2126int err = 0;21272128fully_mapped = 0;2129if (iblock < lblock) {2130WARN_ON(bh->b_size != blocksize);2131err = get_block(inode, iblock, bh, 0);2132if (err)2133SetPageError(page);2134}2135if (!buffer_mapped(bh)) {2136zero_user(page, i * blocksize, blocksize);2137if (!err)2138set_buffer_uptodate(bh);2139continue;2140}2141/*2142* get_block() might have updated the buffer2143* synchronously2144*/2145if (buffer_uptodate(bh))2146continue;2147}2148arr[nr++] = bh;2149} while (i++, iblock++, (bh = bh->b_this_page) != head);21502151if (fully_mapped)2152SetPageMappedToDisk(page);21532154if (!nr) {2155/*2156* All buffers are uptodate - we can set the page uptodate2157* as well. But not if get_block() returned an error.2158*/2159if (!PageError(page))2160SetPageUptodate(page);2161unlock_page(page);2162return 0;2163}21642165/* Stage two: lock the buffers */2166for (i = 0; i < nr; i++) {2167bh = arr[i];2168lock_buffer(bh);2169mark_buffer_async_read(bh);2170}21712172/*2173* Stage 3: start the IO. Check for uptodateness2174* inside the buffer lock in case another process reading2175* the underlying blockdev brought it uptodate (the sct fix).2176*/2177for (i = 0; i < nr; i++) {2178bh = arr[i];2179if (buffer_uptodate(bh))2180end_buffer_async_read(bh, 1);2181else2182submit_bh(READ, bh);2183}2184return 0;2185}2186EXPORT_SYMBOL(block_read_full_page);21872188/* utility function for filesystems that need to do work on expanding2189* truncates. 
Uses filesystem pagecache writes to allow the filesystem to2190* deal with the hole.2191*/2192int generic_cont_expand_simple(struct inode *inode, loff_t size)2193{2194struct address_space *mapping = inode->i_mapping;2195struct page *page;2196void *fsdata;2197int err;21982199err = inode_newsize_ok(inode, size);2200if (err)2201goto out;22022203err = pagecache_write_begin(NULL, mapping, size, 0,2204AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,2205&page, &fsdata);2206if (err)2207goto out;22082209err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);2210BUG_ON(err > 0);22112212out:2213return err;2214}2215EXPORT_SYMBOL(generic_cont_expand_simple);22162217static int cont_expand_zero(struct file *file, struct address_space *mapping,2218loff_t pos, loff_t *bytes)2219{2220struct inode *inode = mapping->host;2221unsigned blocksize = 1 << inode->i_blkbits;2222struct page *page;2223void *fsdata;2224pgoff_t index, curidx;2225loff_t curpos;2226unsigned zerofrom, offset, len;2227int err = 0;22282229index = pos >> PAGE_CACHE_SHIFT;2230offset = pos & ~PAGE_CACHE_MASK;22312232while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {2233zerofrom = curpos & ~PAGE_CACHE_MASK;2234if (zerofrom & (blocksize-1)) {2235*bytes |= (blocksize-1);2236(*bytes)++;2237}2238len = PAGE_CACHE_SIZE - zerofrom;22392240err = pagecache_write_begin(file, mapping, curpos, len,2241AOP_FLAG_UNINTERRUPTIBLE,2242&page, &fsdata);2243if (err)2244goto out;2245zero_user(page, zerofrom, len);2246err = pagecache_write_end(file, mapping, curpos, len, len,2247page, fsdata);2248if (err < 0)2249goto out;2250BUG_ON(err != len);2251err = 0;22522253balance_dirty_pages_ratelimited(mapping);2254}22552256/* page covers the boundary, find the boundary offset */2257if (index == curidx) {2258zerofrom = curpos & ~PAGE_CACHE_MASK;2259/* if we will expand the thing last block will be filled */2260if (offset <= zerofrom) {2261goto out;2262}2263if (zerofrom & (blocksize-1)) {2264*bytes |= (blocksize-1);2265(*bytes)++;2266}2267len = offset - zerofrom;22682269err = pagecache_write_begin(file, mapping, curpos, len,2270AOP_FLAG_UNINTERRUPTIBLE,2271&page, &fsdata);2272if (err)2273goto out;2274zero_user(page, zerofrom, len);2275err = pagecache_write_end(file, mapping, curpos, len, len,2276page, fsdata);2277if (err < 0)2278goto out;2279BUG_ON(err != len);2280err = 0;2281}2282out:2283return err;2284}22852286/*2287* For moronic filesystems that do not allow holes in file.2288* We may have to extend the file.2289*/2290int cont_write_begin(struct file *file, struct address_space *mapping,2291loff_t pos, unsigned len, unsigned flags,2292struct page **pagep, void **fsdata,2293get_block_t *get_block, loff_t *bytes)2294{2295struct inode *inode = mapping->host;2296unsigned blocksize = 1 << inode->i_blkbits;2297unsigned zerofrom;2298int err;22992300err = cont_expand_zero(file, mapping, pos, bytes);2301if (err)2302return err;23032304zerofrom = *bytes & ~PAGE_CACHE_MASK;2305if (pos+len > *bytes && zerofrom & (blocksize-1)) {2306*bytes |= (blocksize-1);2307(*bytes)++;2308}23092310return block_write_begin(mapping, pos, len, flags, pagep, get_block);2311}2312EXPORT_SYMBOL(cont_write_begin);23132314int block_commit_write(struct page *page, unsigned from, unsigned to)2315{2316struct inode *inode = page->mapping->host;2317__block_commit_write(inode,page,from,to);2318return 0;2319}2320EXPORT_SYMBOL(block_commit_write);23212322/*2323* block_page_mkwrite() is not allowed to change the file size as it gets2324* called from a page fault handler when a page is first 
dirtied. Hence we must2325* be careful to check for EOF conditions here. We set the page up correctly2326* for a written page which means we get ENOSPC checking when writing into2327* holes and correct delalloc and unwritten extent mapping on filesystems that2328* support these features.2329*2330* We are not allowed to take the i_mutex here so we have to play games to2331* protect against truncate races as the page could now be beyond EOF. Because2332* truncate writes the inode size before removing pages, once we have the2333* page lock we can determine safely if the page is beyond EOF. If it is not2334* beyond EOF, then the page is guaranteed safe against truncation until we2335* unlock the page.2336*2337* Direct callers of this function should call vfs_check_frozen() so that page2338* fault does not busyloop until the fs is thawed.2339*/2340int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,2341get_block_t get_block)2342{2343struct page *page = vmf->page;2344struct inode *inode = vma->vm_file->f_path.dentry->d_inode;2345unsigned long end;2346loff_t size;2347int ret;23482349lock_page(page);2350size = i_size_read(inode);2351if ((page->mapping != inode->i_mapping) ||2352(page_offset(page) > size)) {2353/* We overload EFAULT to mean page got truncated */2354ret = -EFAULT;2355goto out_unlock;2356}23572358/* page is wholly or partially inside EOF */2359if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)2360end = size & ~PAGE_CACHE_MASK;2361else2362end = PAGE_CACHE_SIZE;23632364ret = __block_write_begin(page, 0, end, get_block);2365if (!ret)2366ret = block_commit_write(page, 0, end);23672368if (unlikely(ret < 0))2369goto out_unlock;2370/*2371* Freezing in progress? We check after the page is marked dirty and2372* with page lock held so if the test here fails, we are sure freezing2373* code will wait during syncing until the page fault is done - at that2374* point page will be dirty and unlocked so freezing code will write it2375* and writeprotect it again.2376*/2377set_page_dirty(page);2378if (inode->i_sb->s_frozen != SB_UNFROZEN) {2379ret = -EAGAIN;2380goto out_unlock;2381}2382wait_on_page_writeback(page);2383return 0;2384out_unlock:2385unlock_page(page);2386return ret;2387}2388EXPORT_SYMBOL(__block_page_mkwrite);23892390int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,2391get_block_t get_block)2392{2393int ret;2394struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;23952396/*2397* This check is racy but catches the common case. The check in2398* __block_page_mkwrite() is reliable.2399*/2400vfs_check_frozen(sb, SB_FREEZE_WRITE);2401ret = __block_page_mkwrite(vma, vmf, get_block);2402return block_page_mkwrite_return(ret);2403}2404EXPORT_SYMBOL(block_page_mkwrite);24052406/*2407* nobh_write_begin()'s prereads are special: the buffer_heads are freed2408* immediately, while under the page lock. 
So it needs a special end_io2409* handler which does not touch the bh after unlocking it.2410*/2411static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)2412{2413__end_buffer_read_notouch(bh, uptodate);2414}24152416/*2417* Attach the singly-linked list of buffers created by nobh_write_begin, to2418* the page (converting it to circular linked list and taking care of page2419* dirty races).2420*/2421static void attach_nobh_buffers(struct page *page, struct buffer_head *head)2422{2423struct buffer_head *bh;24242425BUG_ON(!PageLocked(page));24262427spin_lock(&page->mapping->private_lock);2428bh = head;2429do {2430if (PageDirty(page))2431set_buffer_dirty(bh);2432if (!bh->b_this_page)2433bh->b_this_page = head;2434bh = bh->b_this_page;2435} while (bh != head);2436attach_page_buffers(page, head);2437spin_unlock(&page->mapping->private_lock);2438}24392440/*2441* On entry, the page is fully not uptodate.2442* On exit the page is fully uptodate in the areas outside (from,to)2443* The filesystem needs to handle block truncation upon failure.2444*/2445int nobh_write_begin(struct address_space *mapping,2446loff_t pos, unsigned len, unsigned flags,2447struct page **pagep, void **fsdata,2448get_block_t *get_block)2449{2450struct inode *inode = mapping->host;2451const unsigned blkbits = inode->i_blkbits;2452const unsigned blocksize = 1 << blkbits;2453struct buffer_head *head, *bh;2454struct page *page;2455pgoff_t index;2456unsigned from, to;2457unsigned block_in_page;2458unsigned block_start, block_end;2459sector_t block_in_file;2460int nr_reads = 0;2461int ret = 0;2462int is_mapped_to_disk = 1;24632464index = pos >> PAGE_CACHE_SHIFT;2465from = pos & (PAGE_CACHE_SIZE - 1);2466to = from + len;24672468page = grab_cache_page_write_begin(mapping, index, flags);2469if (!page)2470return -ENOMEM;2471*pagep = page;2472*fsdata = NULL;24732474if (page_has_buffers(page)) {2475ret = __block_write_begin(page, pos, len, get_block);2476if (unlikely(ret))2477goto out_release;2478return ret;2479}24802481if (PageMappedToDisk(page))2482return 0;24832484/*2485* Allocate buffers so that we can keep track of state, and potentially2486* attach them to the page if an error occurs. In the common case of2487* no error, they will just be freed again without ever being attached2488* to the page (which is all OK, because we're under the page lock).2489*2490* Be careful: the buffer linked list is a NULL terminated one, rather2491* than the circular one we're used to.2492*/2493head = alloc_page_buffers(page, blocksize, 0);2494if (!head) {2495ret = -ENOMEM;2496goto out_release;2497}24982499block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);25002501/*2502* We loop across all blocks in the page, whether or not they are2503* part of the affected region. 
This is so we can discover if the2504* page is fully mapped-to-disk.2505*/2506for (block_start = 0, block_in_page = 0, bh = head;2507block_start < PAGE_CACHE_SIZE;2508block_in_page++, block_start += blocksize, bh = bh->b_this_page) {2509int create;25102511block_end = block_start + blocksize;2512bh->b_state = 0;2513create = 1;2514if (block_start >= to)2515create = 0;2516ret = get_block(inode, block_in_file + block_in_page,2517bh, create);2518if (ret)2519goto failed;2520if (!buffer_mapped(bh))2521is_mapped_to_disk = 0;2522if (buffer_new(bh))2523unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);2524if (PageUptodate(page)) {2525set_buffer_uptodate(bh);2526continue;2527}2528if (buffer_new(bh) || !buffer_mapped(bh)) {2529zero_user_segments(page, block_start, from,2530to, block_end);2531continue;2532}2533if (buffer_uptodate(bh))2534continue; /* reiserfs does this */2535if (block_start < from || block_end > to) {2536lock_buffer(bh);2537bh->b_end_io = end_buffer_read_nobh;2538submit_bh(READ, bh);2539nr_reads++;2540}2541}25422543if (nr_reads) {2544/*2545* The page is locked, so these buffers are protected from2546* any VM or truncate activity. Hence we don't need to care2547* for the buffer_head refcounts.2548*/2549for (bh = head; bh; bh = bh->b_this_page) {2550wait_on_buffer(bh);2551if (!buffer_uptodate(bh))2552ret = -EIO;2553}2554if (ret)2555goto failed;2556}25572558if (is_mapped_to_disk)2559SetPageMappedToDisk(page);25602561*fsdata = head; /* to be released by nobh_write_end */25622563return 0;25642565failed:2566BUG_ON(!ret);2567/*2568* Error recovery is a bit difficult. We need to zero out blocks that2569* were newly allocated, and dirty them to ensure they get written out.2570* Buffers need to be attached to the page at this point, otherwise2571* the handling of potential IO errors during writeout would be hard2572* (could try doing synchronous writeout, but what if that fails too?)2573*/2574attach_nobh_buffers(page, head);2575page_zero_new_buffers(page, from, to);25762577out_release:2578unlock_page(page);2579page_cache_release(page);2580*pagep = NULL;25812582return ret;2583}2584EXPORT_SYMBOL(nobh_write_begin);25852586int nobh_write_end(struct file *file, struct address_space *mapping,2587loff_t pos, unsigned len, unsigned copied,2588struct page *page, void *fsdata)2589{2590struct inode *inode = page->mapping->host;2591struct buffer_head *head = fsdata;2592struct buffer_head *bh;2593BUG_ON(fsdata != NULL && page_has_buffers(page));25942595if (unlikely(copied < len) && head)2596attach_nobh_buffers(page, head);2597if (page_has_buffers(page))2598return generic_write_end(file, mapping, pos, len,2599copied, page, fsdata);26002601SetPageUptodate(page);2602set_page_dirty(page);2603if (pos+copied > inode->i_size) {2604i_size_write(inode, pos+copied);2605mark_inode_dirty(inode);2606}26072608unlock_page(page);2609page_cache_release(page);26102611while (head) {2612bh = head;2613head = head->b_this_page;2614free_buffer_head(bh);2615}26162617return copied;2618}2619EXPORT_SYMBOL(nobh_write_end);26202621/*2622* nobh_writepage() - based on block_full_write_page() except2623* that it tries to operate without attaching bufferheads to2624* the page.2625*/2626int nobh_writepage(struct page *page, get_block_t *get_block,2627struct writeback_control *wbc)2628{2629struct inode * const inode = page->mapping->host;2630loff_t i_size = i_size_read(inode);2631const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;2632unsigned offset;2633int ret;26342635/* Is the page fully inside i_size? 
*/2636if (page->index < end_index)2637goto out;26382639/* Is the page fully outside i_size? (truncate in progress) */2640offset = i_size & (PAGE_CACHE_SIZE-1);2641if (page->index >= end_index+1 || !offset) {2642/*2643* The page may have dirty, unmapped buffers. For example,2644* they may have been added in ext3_writepage(). Make them2645* freeable here, so the page does not leak.2646*/2647#if 02648/* Not really sure about this - do we need this ? */2649if (page->mapping->a_ops->invalidatepage)2650page->mapping->a_ops->invalidatepage(page, offset);2651#endif2652unlock_page(page);2653return 0; /* don't care */2654}26552656/*2657* The page straddles i_size. It must be zeroed out on each and every2658* writepage invocation because it may be mmapped. "A file is mapped2659* in multiples of the page size. For a file that is not a multiple of2660* the page size, the remaining memory is zeroed when mapped, and2661* writes to that region are not written out to the file."2662*/2663zero_user_segment(page, offset, PAGE_CACHE_SIZE);2664out:2665ret = mpage_writepage(page, get_block, wbc);2666if (ret == -EAGAIN)2667ret = __block_write_full_page(inode, page, get_block, wbc,2668end_buffer_async_write);2669return ret;2670}2671EXPORT_SYMBOL(nobh_writepage);26722673int nobh_truncate_page(struct address_space *mapping,2674loff_t from, get_block_t *get_block)2675{2676pgoff_t index = from >> PAGE_CACHE_SHIFT;2677unsigned offset = from & (PAGE_CACHE_SIZE-1);2678unsigned blocksize;2679sector_t iblock;2680unsigned length, pos;2681struct inode *inode = mapping->host;2682struct page *page;2683struct buffer_head map_bh;2684int err;26852686blocksize = 1 << inode->i_blkbits;2687length = offset & (blocksize - 1);26882689/* Block boundary? Nothing to do */2690if (!length)2691return 0;26922693length = blocksize - length;2694iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);26952696page = grab_cache_page(mapping, index);2697err = -ENOMEM;2698if (!page)2699goto out;27002701if (page_has_buffers(page)) {2702has_buffers:2703unlock_page(page);2704page_cache_release(page);2705return block_truncate_page(mapping, from, get_block);2706}27072708/* Find the buffer that contains "offset" */2709pos = blocksize;2710while (offset >= pos) {2711iblock++;2712pos += blocksize;2713}27142715map_bh.b_size = blocksize;2716map_bh.b_state = 0;2717err = get_block(inode, iblock, &map_bh, 0);2718if (err)2719goto unlock;2720/* unmapped? It's a hole - nothing to do */2721if (!buffer_mapped(&map_bh))2722goto unlock;27232724/* Ok, it's mapped. Make sure it's up-to-date */2725if (!PageUptodate(page)) {2726err = mapping->a_ops->readpage(NULL, page);2727if (err) {2728page_cache_release(page);2729goto out;2730}2731lock_page(page);2732if (!PageUptodate(page)) {2733err = -EIO;2734goto unlock;2735}2736if (page_has_buffers(page))2737goto has_buffers;2738}2739zero_user(page, offset, length);2740set_page_dirty(page);2741err = 0;27422743unlock:2744unlock_page(page);2745page_cache_release(page);2746out:2747return err;2748}2749EXPORT_SYMBOL(nobh_truncate_page);27502751int block_truncate_page(struct address_space *mapping,2752loff_t from, get_block_t *get_block)2753{2754pgoff_t index = from >> PAGE_CACHE_SHIFT;2755unsigned offset = from & (PAGE_CACHE_SIZE-1);2756unsigned blocksize;2757sector_t iblock;2758unsigned length, pos;2759struct inode *inode = mapping->host;2760struct page *page;2761struct buffer_head *bh;2762int err;27632764blocksize = 1 << inode->i_blkbits;2765length = offset & (blocksize - 1);27662767/* Block boundary? 
Nothing to do */2768if (!length)2769return 0;27702771length = blocksize - length;2772iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);27732774page = grab_cache_page(mapping, index);2775err = -ENOMEM;2776if (!page)2777goto out;27782779if (!page_has_buffers(page))2780create_empty_buffers(page, blocksize, 0);27812782/* Find the buffer that contains "offset" */2783bh = page_buffers(page);2784pos = blocksize;2785while (offset >= pos) {2786bh = bh->b_this_page;2787iblock++;2788pos += blocksize;2789}27902791err = 0;2792if (!buffer_mapped(bh)) {2793WARN_ON(bh->b_size != blocksize);2794err = get_block(inode, iblock, bh, 0);2795if (err)2796goto unlock;2797/* unmapped? It's a hole - nothing to do */2798if (!buffer_mapped(bh))2799goto unlock;2800}28012802/* Ok, it's mapped. Make sure it's up-to-date */2803if (PageUptodate(page))2804set_buffer_uptodate(bh);28052806if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {2807err = -EIO;2808ll_rw_block(READ, 1, &bh);2809wait_on_buffer(bh);2810/* Uhhuh. Read error. Complain and punt. */2811if (!buffer_uptodate(bh))2812goto unlock;2813}28142815zero_user(page, offset, length);2816mark_buffer_dirty(bh);2817err = 0;28182819unlock:2820unlock_page(page);2821page_cache_release(page);2822out:2823return err;2824}2825EXPORT_SYMBOL(block_truncate_page);28262827/*2828* The generic ->writepage function for buffer-backed address_spaces2829* this form passes in the end_io handler used to finish the IO.2830*/2831int block_write_full_page_endio(struct page *page, get_block_t *get_block,2832struct writeback_control *wbc, bh_end_io_t *handler)2833{2834struct inode * const inode = page->mapping->host;2835loff_t i_size = i_size_read(inode);2836const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;2837unsigned offset;28382839/* Is the page fully inside i_size? */2840if (page->index < end_index)2841return __block_write_full_page(inode, page, get_block, wbc,2842handler);28432844/* Is the page fully outside i_size? (truncate in progress) */2845offset = i_size & (PAGE_CACHE_SIZE-1);2846if (page->index >= end_index+1 || !offset) {2847/*2848* The page may have dirty, unmapped buffers. For example,2849* they may have been added in ext3_writepage(). Make them2850* freeable here, so the page does not leak.2851*/2852do_invalidatepage(page, 0);2853unlock_page(page);2854return 0; /* don't care */2855}28562857/*2858* The page straddles i_size. It must be zeroed out on each and every2859* writepage invocation because it may be mmapped. "A file is mapped2860* in multiples of the page size. 
For a file that is not a multiple of2861* the page size, the remaining memory is zeroed when mapped, and2862* writes to that region are not written out to the file."2863*/2864zero_user_segment(page, offset, PAGE_CACHE_SIZE);2865return __block_write_full_page(inode, page, get_block, wbc, handler);2866}2867EXPORT_SYMBOL(block_write_full_page_endio);28682869/*2870* The generic ->writepage function for buffer-backed address_spaces2871*/2872int block_write_full_page(struct page *page, get_block_t *get_block,2873struct writeback_control *wbc)2874{2875return block_write_full_page_endio(page, get_block, wbc,2876end_buffer_async_write);2877}2878EXPORT_SYMBOL(block_write_full_page);28792880sector_t generic_block_bmap(struct address_space *mapping, sector_t block,2881get_block_t *get_block)2882{2883struct buffer_head tmp;2884struct inode *inode = mapping->host;2885tmp.b_state = 0;2886tmp.b_blocknr = 0;2887tmp.b_size = 1 << inode->i_blkbits;2888get_block(inode, block, &tmp, 0);2889return tmp.b_blocknr;2890}2891EXPORT_SYMBOL(generic_block_bmap);28922893static void end_bio_bh_io_sync(struct bio *bio, int err)2894{2895struct buffer_head *bh = bio->bi_private;28962897if (err == -EOPNOTSUPP) {2898set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);2899}29002901if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))2902set_bit(BH_Quiet, &bh->b_state);29032904bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));2905bio_put(bio);2906}29072908int submit_bh(int rw, struct buffer_head * bh)2909{2910struct bio *bio;2911int ret = 0;29122913BUG_ON(!buffer_locked(bh));2914BUG_ON(!buffer_mapped(bh));2915BUG_ON(!bh->b_end_io);2916BUG_ON(buffer_delay(bh));2917BUG_ON(buffer_unwritten(bh));29182919/*2920* Only clear out a write error when rewriting2921*/2922if (test_set_buffer_req(bh) && (rw & WRITE))2923clear_buffer_write_io_error(bh);29242925/*2926* from here on down, it's all bio -- do the initial mapping,2927* submit_bio -> generic_make_request may further map this bio around2928*/2929bio = bio_alloc(GFP_NOIO, 1);29302931bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);2932bio->bi_bdev = bh->b_bdev;2933bio->bi_io_vec[0].bv_page = bh->b_page;2934bio->bi_io_vec[0].bv_len = bh->b_size;2935bio->bi_io_vec[0].bv_offset = bh_offset(bh);29362937bio->bi_vcnt = 1;2938bio->bi_idx = 0;2939bio->bi_size = bh->b_size;29402941bio->bi_end_io = end_bio_bh_io_sync;2942bio->bi_private = bh;29432944bio_get(bio);2945submit_bio(rw, bio);29462947if (bio_flagged(bio, BIO_EOPNOTSUPP))2948ret = -EOPNOTSUPP;29492950bio_put(bio);2951return ret;2952}2953EXPORT_SYMBOL(submit_bh);29542955/**2956* ll_rw_block: low-level access to block devices (DEPRECATED)2957* @rw: whether to %READ or %WRITE or maybe %READA (readahead)2958* @nr: number of &struct buffer_heads in the array2959* @bhs: array of pointers to &struct buffer_head2960*2961* ll_rw_block() takes an array of pointers to &struct buffer_heads, and2962* requests an I/O operation on them, either a %READ or a %WRITE. The third2963* %READA option is described in the documentation for generic_make_request()2964* which ll_rw_block() calls.2965*2966* This function drops any buffer that it cannot get a lock on (with the2967* BH_Lock state bit), any buffer that appears to be clean when doing a write2968* request, and any buffer that appears to be up-to-date when doing read2969* request. 
Further it marks as clean buffers that are processed for
 * writing (the buffer cache won't assume that they are actually clean
 * until the buffer gets unlocked).
 *
 * ll_rw_block sets b_end_io to a simple completion handler that marks
 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
 * any waiters.
 *
 * All of the buffers must be for the same device, and must also be a
 * multiple of the current approved size for the device.
 */
void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
{
	int i;

	for (i = 0; i < nr; i++) {
		struct buffer_head *bh = bhs[i];

		if (!trylock_buffer(bh))
			continue;
		if (rw == WRITE) {
			if (test_clear_buffer_dirty(bh)) {
				bh->b_end_io = end_buffer_write_sync;
				get_bh(bh);
				submit_bh(WRITE, bh);
				continue;
			}
		} else {
			if (!buffer_uptodate(bh)) {
				bh->b_end_io = end_buffer_read_sync;
				get_bh(bh);
				submit_bh(rw, bh);
				continue;
			}
		}
		unlock_buffer(bh);
	}
}
EXPORT_SYMBOL(ll_rw_block);

void write_dirty_buffer(struct buffer_head *bh, int rw)
{
	lock_buffer(bh);
	if (!test_clear_buffer_dirty(bh)) {
		unlock_buffer(bh);
		return;
	}
	bh->b_end_io = end_buffer_write_sync;
	get_bh(bh);
	submit_bh(rw, bh);
}
EXPORT_SYMBOL(write_dirty_buffer);

/*
 * For a data-integrity writeout, we need to wait upon any in-progress I/O
 * and then start new I/O and then wait upon it. The caller must have a ref on
 * the buffer_head.
 */
int __sync_dirty_buffer(struct buffer_head *bh, int rw)
{
	int ret = 0;

	WARN_ON(atomic_read(&bh->b_count) < 1);
	lock_buffer(bh);
	if (test_clear_buffer_dirty(bh)) {
		get_bh(bh);
		bh->b_end_io = end_buffer_write_sync;
		ret = submit_bh(rw, bh);
		wait_on_buffer(bh);
		if (!ret && !buffer_uptodate(bh))
			ret = -EIO;
	} else {
		unlock_buffer(bh);
	}
	return ret;
}
EXPORT_SYMBOL(__sync_dirty_buffer);

int sync_dirty_buffer(struct buffer_head *bh)
{
	return __sync_dirty_buffer(bh, WRITE_SYNC);
}
EXPORT_SYMBOL(sync_dirty_buffer);

/*
 * try_to_free_buffers() checks if all the buffers on this particular page
 * are unused, and releases them if so.
 *
 * Exclusion against try_to_free_buffers may be obtained by either
 * locking the page or by holding its mapping's private_lock.
 *
 * If the page is dirty but all the buffers are clean then we need to
 * be sure to mark the page clean as well. This is because the page
 * may be against a block device, and a later reattachment of buffers
 * to a dirty page will set *all* buffers dirty. Which would corrupt
 * filesystem data on the same device.
 *
 * The same applies to regular filesystem pages: if all the buffers are
 * clean then we set the page clean and proceed. To do that, we require
 * total exclusion from __set_page_dirty_buffers().
That is obtained with3069* private_lock.3070*3071* try_to_free_buffers() is non-blocking.3072*/3073static inline int buffer_busy(struct buffer_head *bh)3074{3075return atomic_read(&bh->b_count) |3076(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));3077}30783079static int3080drop_buffers(struct page *page, struct buffer_head **buffers_to_free)3081{3082struct buffer_head *head = page_buffers(page);3083struct buffer_head *bh;30843085bh = head;3086do {3087if (buffer_write_io_error(bh) && page->mapping)3088set_bit(AS_EIO, &page->mapping->flags);3089if (buffer_busy(bh))3090goto failed;3091bh = bh->b_this_page;3092} while (bh != head);30933094do {3095struct buffer_head *next = bh->b_this_page;30963097if (bh->b_assoc_map)3098__remove_assoc_queue(bh);3099bh = next;3100} while (bh != head);3101*buffers_to_free = head;3102__clear_page_buffers(page);3103return 1;3104failed:3105return 0;3106}31073108int try_to_free_buffers(struct page *page)3109{3110struct address_space * const mapping = page->mapping;3111struct buffer_head *buffers_to_free = NULL;3112int ret = 0;31133114BUG_ON(!PageLocked(page));3115if (PageWriteback(page))3116return 0;31173118if (mapping == NULL) { /* can this still happen? */3119ret = drop_buffers(page, &buffers_to_free);3120goto out;3121}31223123spin_lock(&mapping->private_lock);3124ret = drop_buffers(page, &buffers_to_free);31253126/*3127* If the filesystem writes its buffers by hand (eg ext3)3128* then we can have clean buffers against a dirty page. We3129* clean the page here; otherwise the VM will never notice3130* that the filesystem did any IO at all.3131*3132* Also, during truncate, discard_buffer will have marked all3133* the page's buffers clean. We discover that here and clean3134* the page also.3135*3136* private_lock must be held over this entire operation in order3137* to synchronise against __set_page_dirty_buffers and prevent the3138* dirty bit from being lost.3139*/3140if (ret)3141cancel_dirty_page(page, PAGE_CACHE_SIZE);3142spin_unlock(&mapping->private_lock);3143out:3144if (buffers_to_free) {3145struct buffer_head *bh = buffers_to_free;31463147do {3148struct buffer_head *next = bh->b_this_page;3149free_buffer_head(bh);3150bh = next;3151} while (bh != buffers_to_free);3152}3153return ret;3154}3155EXPORT_SYMBOL(try_to_free_buffers);31563157/*3158* There are no bdflush tunables left. 
But distributions are3159* still running obsolete flush daemons, so we terminate them here.3160*3161* Use of bdflush() is deprecated and will be removed in a future kernel.3162* The `flush-X' kernel threads fully replace bdflush daemons and this call.3163*/3164SYSCALL_DEFINE2(bdflush, int, func, long, data)3165{3166static int msg_count;31673168if (!capable(CAP_SYS_ADMIN))3169return -EPERM;31703171if (msg_count < 5) {3172msg_count++;3173printk(KERN_INFO3174"warning: process `%s' used the obsolete bdflush"3175" system call\n", current->comm);3176printk(KERN_INFO "Fix your initscripts?\n");3177}31783179if (func == 1)3180do_exit(0);3181return 0;3182}31833184/*3185* Buffer-head allocation3186*/3187static struct kmem_cache *bh_cachep;31883189/*3190* Once the number of bh's in the machine exceeds this level, we start3191* stripping them in writeback.3192*/3193static int max_buffer_heads;31943195int buffer_heads_over_limit;31963197struct bh_accounting {3198int nr; /* Number of live bh's */3199int ratelimit; /* Limit cacheline bouncing */3200};32013202static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};32033204static void recalc_bh_state(void)3205{3206int i;3207int tot = 0;32083209if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)3210return;3211__this_cpu_write(bh_accounting.ratelimit, 0);3212for_each_online_cpu(i)3213tot += per_cpu(bh_accounting, i).nr;3214buffer_heads_over_limit = (tot > max_buffer_heads);3215}32163217struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)3218{3219struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);3220if (ret) {3221INIT_LIST_HEAD(&ret->b_assoc_buffers);3222preempt_disable();3223__this_cpu_inc(bh_accounting.nr);3224recalc_bh_state();3225preempt_enable();3226}3227return ret;3228}3229EXPORT_SYMBOL(alloc_buffer_head);32303231void free_buffer_head(struct buffer_head *bh)3232{3233BUG_ON(!list_empty(&bh->b_assoc_buffers));3234kmem_cache_free(bh_cachep, bh);3235preempt_disable();3236__this_cpu_dec(bh_accounting.nr);3237recalc_bh_state();3238preempt_enable();3239}3240EXPORT_SYMBOL(free_buffer_head);32413242static void buffer_exit_cpu(int cpu)3243{3244int i;3245struct bh_lru *b = &per_cpu(bh_lrus, cpu);32463247for (i = 0; i < BH_LRU_SIZE; i++) {3248brelse(b->bhs[i]);3249b->bhs[i] = NULL;3250}3251this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);3252per_cpu(bh_accounting, cpu).nr = 0;3253}32543255static int buffer_cpu_notify(struct notifier_block *self,3256unsigned long action, void *hcpu)3257{3258if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)3259buffer_exit_cpu((unsigned long)hcpu);3260return NOTIFY_OK;3261}32623263/**3264* bh_uptodate_or_lock - Test whether the buffer is uptodate3265* @bh: struct buffer_head3266*3267* Return true if the buffer is up-to-date and false,3268* with the buffer locked, if not.3269*/3270int bh_uptodate_or_lock(struct buffer_head *bh)3271{3272if (!buffer_uptodate(bh)) {3273lock_buffer(bh);3274if (!buffer_uptodate(bh))3275return 0;3276unlock_buffer(bh);3277}3278return 1;3279}3280EXPORT_SYMBOL(bh_uptodate_or_lock);32813282/**3283* bh_submit_read - Submit a locked buffer for reading3284* @bh: struct buffer_head3285*3286* Returns zero on success and -EIO on error.3287*/3288int bh_submit_read(struct buffer_head *bh)3289{3290BUG_ON(!buffer_locked(bh));32913292if (buffer_uptodate(bh)) {3293unlock_buffer(bh);3294return 0;3295}32963297get_bh(bh);3298bh->b_end_io = end_buffer_read_sync;3299submit_bh(READ, bh);3300wait_on_buffer(bh);3301if (buffer_uptodate(bh))3302return 0;3303return 
-EIO;
}
EXPORT_SYMBOL(bh_submit_read);

void __init buffer_init(void)
{
	int nrpages;

	bh_cachep = kmem_cache_create("buffer_head",
			sizeof(struct buffer_head), 0,
				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
				SLAB_MEM_SPREAD),
				NULL);

	/*
	 * Limit the bh occupancy to 10% of ZONE_NORMAL
	 */
	nrpages = (nr_free_buffer_pages() * 10) / 100;
	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
	hotcpu_notifier(buffer_cpu_notify, 0);
}
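
/*
 * Editor's note: the generic helpers above (__block_write_begin,
 * block_read_full_page, block_truncate_page, ...) are all driven by a
 * filesystem-supplied get_block_t callback. The sketch below is not part of
 * the original file and is guarded by #if 0; it only illustrates the shape
 * of that contract. myfs_find_block() and myfs_alloc_block() are purely
 * hypothetical lookup/allocation helpers - a real filesystem resolves the
 * logical block number however its on-disk format requires.
 */
#if 0
static int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create)
{
	sector_t phys;
	int new = 0;

	phys = myfs_find_block(inode, iblock);		/* hypothetical */
	if (!phys) {
		if (!create)
			return 0;	/* a hole: leave the bh unmapped */
		phys = myfs_alloc_block(inode, iblock);	/* hypothetical */
		if (!phys)
			return -ENOSPC;
		new = 1;
	}

	map_bh(bh_result, inode->i_sb, phys);
	if (new)
		/* tells __block_write_begin() and friends to zero the block
		 * and unmap any stale metadata aliases */
		set_buffer_new(bh_result);
	return 0;
}
#endif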
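
/*
 * Editor's note: an illustrative (non-compiled) sketch of how a simple
 * block-based filesystem typically wires the helpers exported in this file
 * into its address_space_operations, reusing the hypothetical
 * myfs_get_block() from the previous sketch. Note the comment over
 * block_write_begin(): if write_begin fails, the filesystem is responsible
 * for trimming any blocks instantiated beyond the old EOF -
 * myfs_truncate_failed_write() below stands in for that and is hypothetical.
 */
#if 0
static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, myfs_get_block, wbc);
}

static int myfs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, myfs_get_block);
}

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	int ret;

	ret = block_write_begin(mapping, pos, len, flags, pagep,
				myfs_get_block);
	if (unlikely(ret))
		/* hypothetical: drop blocks allocated past i_size */
		myfs_truncate_failed_write(mapping->host, pos + len);
	return ret;
}

static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}

/*
 * generic_write_end() can be used directly as ->write_end because it only
 * needs the page/fsdata that block_write_begin() set up.
 */
static const struct address_space_operations myfs_aops = {
	.readpage	= myfs_readpage,
	.writepage	= myfs_writepage,
	.write_begin	= myfs_write_begin,
	.write_end	= generic_write_end,
	.bmap		= myfs_bmap,
};
#endif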
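
/*
 * Editor's note: an illustrative sketch of hooking block_page_mkwrite()
 * into a file's vm_operations_struct, so that the first write fault on an
 * mmapped page allocates blocks and gets the ENOSPC/truncate handling
 * described in the comment over __block_page_mkwrite(). The myfs_* names
 * are hypothetical; myfs_file_mmap() would be wired up via ->mmap in the
 * file_operations.
 */
#if 0
static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	/* block_page_mkwrite() already converts the error to a VM_FAULT_* code */
	return block_page_mkwrite(vma, vmf, myfs_get_block);
}

static const struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= myfs_page_mkwrite,
};

static int myfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &myfs_file_vm_ops;
	return 0;
}
#endif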
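
/*
 * Editor's note: an illustrative sketch of cont_write_begin() usage for a
 * filesystem that cannot represent holes (the case called out above). The
 * loff_t passed as the final argument tracks how far the file has already
 * been zero-filled; here it is assumed to live in the fs-private inode as
 * a field named mmu_private, and MYFS_I() is a hypothetical container_of
 * accessor - both are illustration only.
 */
#if 0
static int myfs_cont_write_begin(struct file *file,
				 struct address_space *mapping,
				 loff_t pos, unsigned len, unsigned flags,
				 struct page **pagep, void **fsdata)
{
	*pagep = NULL;
	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
				myfs_get_block,
				&MYFS_I(mapping->host)->mmu_private);
}
#endif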
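
/*
 * Editor's note: an illustrative sketch of the sync_dirty_buffer() pattern
 * for a synchronous metadata update - read the block through the buffer
 * cache, modify it, then write it out and wait. sb_bread()/brelse() come
 * from <linux/buffer_head.h>; struct myfs_super_block and
 * MYFS_SUPER_BLOCK_NR are hypothetical. sb_bread() returns the buffer with
 * a reference held, which satisfies the "caller must have a ref"
 * requirement documented over __sync_dirty_buffer().
 */
#if 0
static int myfs_sync_super(struct super_block *sb,
			   struct myfs_super_block *ms)
{
	struct buffer_head *bh;
	int err;

	bh = sb_bread(sb, MYFS_SUPER_BLOCK_NR);
	if (!bh)
		return -EIO;

	memcpy(bh->b_data, ms, sizeof(*ms));	/* refresh the on-disk copy */
	mark_buffer_dirty(bh);
	err = sync_dirty_buffer(bh);		/* submit and wait */
	brelse(bh);
	return err;
}
#endif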
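
/*
 * Editor's note: an illustrative sketch pairing bh_uptodate_or_lock() with
 * bh_submit_read() as documented above - make sure a cached block's
 * contents are valid, issuing a read only when needed. myfs_read_block()
 * is hypothetical; sb_getblk() hands back a referenced buffer_head, which
 * the caller eventually drops with brelse().
 */
#if 0
static struct buffer_head *myfs_read_block(struct super_block *sb,
					   sector_t block)
{
	struct buffer_head *bh = sb_getblk(sb, block);

	if (!bh)
		return NULL;
	if (bh_uptodate_or_lock(bh))
		return bh;		/* already valid in the cache */
	/* bh is now locked and not uptodate: read it in and wait */
	if (bh_submit_read(bh) < 0) {
		brelse(bh);
		return NULL;
	}
	return bh;
}
#endif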