/*
 * fs/direct-io.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * O_DIRECT
 *
 * 04Jul2002	Andrew Morton
 *		Initial version
 * 11Sep2002	[email protected]
 *		added readv/writev support.
 * 29Oct2002	Andrew Morton
 *		rewrote bio_add_page() support.
 * 30Oct2002	[email protected]
 *		added support for non-aligned IO.
 * 06Nov2002	[email protected]
 *		added asynchronous IO support.
 * 21Jul2003	[email protected]
 *		added IO completion notifier.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/wait.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/rwsem.h>
#include <linux/uio.h>
#include <asm/atomic.h>

/*
 * How many user pages to map in one call to get_user_pages().  This determines
 * the size of a structure on the stack.
 */
#define DIO_PAGES	64

/*
 * This code generally works in units of "dio_blocks".  A dio_block is
 * somewhere between the hard sector size and the filesystem block size.  It
 * is determined on a per-invocation basis.  When talking to the filesystem
 * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity
 * down by dio->blkfactor.  Similarly, fs-blocksize quantities are converted
 * to dio_block quantities by shifting left by blkfactor.
 *
 * If blkfactor is zero then the user's request was aligned to the filesystem's
 * blocksize.
 */
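
/*
 * Worked example, assuming a 4096-byte filesystem block size and 512-byte
 * aligned user IO: blkbits is 9, dio->blkfactor is 12 - 9 == 3, and each
 * fs_block covers 1 << 3 == 8 dio_blocks.  A request starting at file
 * offset 5120 begins at dio_block 10, which maps to fs_block 10 >> 3 == 1.
 */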

struct dio {
	/* BIO submission state */
	struct bio *bio;		/* bio under assembly */
	struct inode *inode;
	int rw;
	loff_t i_size;			/* i_size when submitted */
	int flags;			/* doesn't change */
	unsigned blkbits;		/* doesn't change */
	unsigned blkfactor;		/* When we're using an alignment which
					   is finer than the filesystem's soft
					   blocksize, this specifies how much
					   finer.  blkfactor=2 means 1/4-block
					   alignment.  Does not change */
	unsigned start_zero_done;	/* flag: sub-blocksize zeroing has
					   been performed at the start of a
					   write */
	int pages_in_io;		/* approximate total IO pages */
	size_t size;			/* total request size (doesn't change)*/
	sector_t block_in_file;		/* Current offset into the underlying
					   file in dio_block units. */
	unsigned blocks_available;	/* At block_in_file.  changes */
	sector_t final_block_in_request;/* doesn't change */
	unsigned first_block_in_page;	/* doesn't change, Used only once */
	int boundary;			/* prev block is at a boundary */
	int reap_counter;		/* rate limit reaping */
	get_block_t *get_block;		/* block mapping function */
	dio_iodone_t *end_io;		/* IO completion function */
	dio_submit_t *submit_io;	/* IO submission function */
	loff_t logical_offset_in_bio;	/* current first logical block in bio */
	sector_t final_block_in_bio;	/* current final block in bio + 1 */
	sector_t next_block_for_io;	/* next block to be put under IO,
					   in dio_blocks units */
	struct buffer_head map_bh;	/* last get_block() result */

	/*
	 * Deferred addition of a page to the dio.  These variables are
	 * private to dio_send_cur_page(), submit_page_section() and
	 * dio_bio_add_page().
	 */
	struct page *cur_page;		/* The page */
	unsigned cur_page_offset;	/* Offset into it, in bytes */
	unsigned cur_page_len;		/* Nr of bytes at cur_page_offset */
	sector_t cur_page_block;	/* Where it starts */
	loff_t cur_page_fs_offset;	/* Offset in file */

	/* BIO completion state */
	spinlock_t bio_lock;		/* protects BIO fields below */
	unsigned long refcount;		/* direct_io_worker() and bios */
	struct bio *bio_list;		/* singly linked via bi_private */
	struct task_struct *waiter;	/* waiting task (NULL if none) */

	/* AIO related stuff */
	struct kiocb *iocb;		/* kiocb */
	int is_async;			/* is IO async ? */
	int io_error;			/* IO error in completion path */
	ssize_t result;			/* IO result */

	/*
	 * Page fetching state.  These variables belong to dio_refill_pages().
	 */
	int curr_page;			/* changes */
	int total_pages;		/* doesn't change */
	unsigned long curr_user_address;/* changes */

	/*
	 * Page queue.  These variables belong to dio_refill_pages() and
	 * dio_get_page().
	 */
	unsigned head;			/* next page to process */
	unsigned tail;			/* last valid page + 1 */
	int page_errors;		/* errno from get_user_pages() */

	/*
	 * pages[] (and any fields placed after it) are not zeroed out at
	 * allocation time.  Don't add new fields after pages[] unless you
	 * wish that they not be zeroed.
	 */
	struct page *pages[DIO_PAGES];	/* page buffer */
};

/*
 * How many pages are in the queue?
 */
static inline unsigned dio_pages_present(struct dio *dio)
{
	return dio->tail - dio->head;
}

/*
 * Go grab and pin some userspace pages.  Typically we'll get 64 at a time.
 */
static int dio_refill_pages(struct dio *dio)
{
	int ret;
	int nr_pages;

	nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
	ret = get_user_pages_fast(
		dio->curr_user_address,		/* Where from? */
		nr_pages,			/* How many pages? */
		dio->rw == READ,		/* Write to memory? */
		&dio->pages[0]);		/* Put results here */

	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
		struct page *page = ZERO_PAGE(0);
		/*
		 * A memory fault, but the filesystem has some outstanding
		 * mapped blocks.  We need to use those blocks up to avoid
		 * leaking stale data in the file.
		 */
		if (dio->page_errors == 0)
			dio->page_errors = ret;
		page_cache_get(page);
		dio->pages[0] = page;
		dio->head = 0;
		dio->tail = 1;
		ret = 0;
		goto out;
	}

	if (ret >= 0) {
		dio->curr_user_address += ret * PAGE_SIZE;
		dio->curr_page += ret;
		dio->head = 0;
		dio->tail = ret;
		ret = 0;
	}
out:
	return ret;
}
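
/*
 * Worked example, assuming 4096-byte pages: DIO_PAGES is 64, so a single
 * dio_refill_pages() call can pin up to 256KB of the user buffer.  A 1MB
 * single-segment request therefore refills the pages[] queue four times as
 * do_direct_IO() consumes it via dio_get_page().
 */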

/*
 * Get another userspace page.  Returns an ERR_PTR on error.  Pages are
 * buffered inside the dio so that we can call get_user_pages() against a
 * decent number of pages, less frequently.  This also gives nicer use of the
 * L1 cache.
 */
static struct page *dio_get_page(struct dio *dio)
{
	if (dio_pages_present(dio) == 0) {
		int ret;

		ret = dio_refill_pages(dio);
		if (ret)
			return ERR_PTR(ret);
		BUG_ON(dio_pages_present(dio) == 0);
	}
	return dio->pages[dio->head++];
}

/**
 * dio_complete() - called when all DIO BIO I/O has been completed
 * @offset: the byte offset in the file of the completed operation
 *
 * This releases locks as dictated by the locking type, lets interested parties
 * know that a DIO operation has completed, and calculates the resulting return
 * code for the operation.
 *
 * It lets the filesystem know if it registered an interest earlier via
 * get_block.  Pass the private field of the map buffer_head so that
 * filesystems can use it to hold additional state between get_block calls and
 * dio_complete.
 */
static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
{
	ssize_t transferred = 0;

	/*
	 * AIO submission can race with bio completion to get here while
	 * expecting to have the last io completed by bio completion.
	 * In that case -EIOCBQUEUED is in fact not an error we want
	 * to preserve through this call.
	 */
	if (ret == -EIOCBQUEUED)
		ret = 0;

	if (dio->result) {
		transferred = dio->result;

		/* Check for short read case */
		if ((dio->rw == READ) && ((offset + transferred) > dio->i_size))
			transferred = dio->i_size - offset;
	}

	if (ret == 0)
		ret = dio->page_errors;
	if (ret == 0)
		ret = dio->io_error;
	if (ret == 0)
		ret = transferred;

	if (dio->end_io && dio->result) {
		dio->end_io(dio->iocb, offset, transferred,
			    dio->map_bh.b_private, ret, is_async);
	} else if (is_async) {
		aio_complete(dio->iocb, ret, 0);
	}

	if (dio->flags & DIO_LOCKING)
		/* lockdep: non-owner release */
		up_read_non_owner(&dio->inode->i_alloc_sem);

	return ret;
}

static int dio_bio_complete(struct dio *dio, struct bio *bio);
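
/*
 * Reference counting scheme used below: direct_io_worker() holds one
 * reference on the dio for the duration of submission, and each bio
 * submitted by dio_bio_submit() holds another.  When a completing bio drops
 * its reference, "remaining == 1" means only the submitter's reference is
 * left, so a sleeping submitter is woken; "remaining == 0" means the
 * submitter has already dropped its reference (the AIO case), so the
 * completion path must finalize and free the dio itself.
 */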
/*
 * Asynchronous IO callback.
 */
static void dio_bio_end_aio(struct bio *bio, int error)
{
	struct dio *dio = bio->bi_private;
	unsigned long remaining;
	unsigned long flags;

	/* cleanup the bio */
	dio_bio_complete(dio, bio);

	spin_lock_irqsave(&dio->bio_lock, flags);
	remaining = --dio->refcount;
	if (remaining == 1 && dio->waiter)
		wake_up_process(dio->waiter);
	spin_unlock_irqrestore(&dio->bio_lock, flags);

	if (remaining == 0) {
		dio_complete(dio, dio->iocb->ki_pos, 0, true);
		kfree(dio);
	}
}

/*
 * The BIO completion handler simply queues the BIO up for the process-context
 * handler.
 *
 * During I/O bi_private points at the dio.  After I/O, bi_private is used to
 * implement a singly-linked list of completed BIOs, at dio->bio_list.
 */
static void dio_bio_end_io(struct bio *bio, int error)
{
	struct dio *dio = bio->bi_private;
	unsigned long flags;

	spin_lock_irqsave(&dio->bio_lock, flags);
	bio->bi_private = dio->bio_list;
	dio->bio_list = bio;
	if (--dio->refcount == 1 && dio->waiter)
		wake_up_process(dio->waiter);
	spin_unlock_irqrestore(&dio->bio_lock, flags);
}

/**
 * dio_end_io - handle the end io action for the given bio
 * @bio: The direct io bio that's being completed
 * @error: Error if there was one
 *
 * This is meant to be called by any filesystem that uses its own dio_submit_t
 * so that the DIO specific endio actions are dealt with after the filesystem
 * has done its completion work.
 */
void dio_end_io(struct bio *bio, int error)
{
	struct dio *dio = bio->bi_private;

	if (dio->is_async)
		dio_bio_end_aio(bio, error);
	else
		dio_bio_end_io(bio, error);
}
EXPORT_SYMBOL_GPL(dio_end_io);

static void
dio_bio_alloc(struct dio *dio, struct block_device *bdev,
	      sector_t first_sector, int nr_vecs)
{
	struct bio *bio;

	/*
	 * bio_alloc() is guaranteed to return a bio when called with
	 * __GFP_WAIT and we request a valid number of vectors.
	 */
	bio = bio_alloc(GFP_KERNEL, nr_vecs);

	bio->bi_bdev = bdev;
	bio->bi_sector = first_sector;
	if (dio->is_async)
		bio->bi_end_io = dio_bio_end_aio;
	else
		bio->bi_end_io = dio_bio_end_io;

	dio->bio = bio;
	dio->logical_offset_in_bio = dio->cur_page_fs_offset;
}

/*
 * In the AIO read case we speculatively dirty the pages before starting IO.
 * During IO completion, any of these pages which happen to have been written
 * back will be redirtied by bio_check_pages_dirty().
 *
 * bios hold a dio reference between submit_bio and ->end_io.
 */
static void dio_bio_submit(struct dio *dio)
{
	struct bio *bio = dio->bio;
	unsigned long flags;

	bio->bi_private = dio;

	spin_lock_irqsave(&dio->bio_lock, flags);
	dio->refcount++;
	spin_unlock_irqrestore(&dio->bio_lock, flags);

	if (dio->is_async && dio->rw == READ)
		bio_set_pages_dirty(bio);

	if (dio->submit_io)
		dio->submit_io(dio->rw, bio, dio->inode,
			       dio->logical_offset_in_bio);
	else
		submit_bio(dio->rw, bio);

	dio->bio = NULL;
	dio->boundary = 0;
	dio->logical_offset_in_bio = 0;
}

/*
 * Release any resources in case of a failure
 */
static void dio_cleanup(struct dio *dio)
{
	while (dio_pages_present(dio))
		page_cache_release(dio_get_page(dio));
}

/*
 * Wait for the next BIO to complete.  Remove it and return it.  NULL is
 * returned once all BIOs have been completed.  This must only be called once
 * all bios have been issued so that dio->refcount can only decrease.  This
 * requires that the caller hold a reference on the dio.
 */
static struct bio *dio_await_one(struct dio *dio)
{
	unsigned long flags;
	struct bio *bio = NULL;

	spin_lock_irqsave(&dio->bio_lock, flags);

	/*
	 * Wait as long as the list is empty and there are bios in flight.  bio
	 * completion drops the count, maybe adds to the list, and wakes while
	 * holding the bio_lock so we don't need set_current_state()'s barrier
	 * and can call it after testing our condition.
	 */
	while (dio->refcount > 1 && dio->bio_list == NULL) {
		__set_current_state(TASK_UNINTERRUPTIBLE);
		dio->waiter = current;
		spin_unlock_irqrestore(&dio->bio_lock, flags);
		io_schedule();
		/* wake up sets us TASK_RUNNING */
		spin_lock_irqsave(&dio->bio_lock, flags);
		dio->waiter = NULL;
	}
	if (dio->bio_list) {
		bio = dio->bio_list;
		dio->bio_list = bio->bi_private;
	}
	spin_unlock_irqrestore(&dio->bio_lock, flags);
	return bio;
}

/*
 * Process one completed BIO.  No locks are held.
 */
static int dio_bio_complete(struct dio *dio, struct bio *bio)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec;
	int page_no;

	if (!uptodate)
		dio->io_error = -EIO;

	if (dio->is_async && dio->rw == READ) {
		bio_check_pages_dirty(bio);	/* transfers ownership */
	} else {
		for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
			struct page *page = bvec[page_no].bv_page;

			if (dio->rw == READ && !PageCompound(page))
				set_page_dirty_lock(page);
			page_cache_release(page);
		}
		bio_put(bio);
	}
	return uptodate ? 0 : -EIO;
}

/*
 * Wait on and process all in-flight BIOs.  This must only be called once
 * all bios have been issued so that the refcount can only decrease.
 * This just waits for all bios to make it through dio_bio_complete.  IO
 * errors are propagated through dio->io_error and should be propagated via
 * dio_complete().
 */
static void dio_await_completion(struct dio *dio)
{
	struct bio *bio;
	do {
		bio = dio_await_one(dio);
		if (bio)
			dio_bio_complete(dio, bio);
	} while (bio);
}

/*
 * A really large O_DIRECT read or write can generate a lot of BIOs.  So
 * to keep the memory consumption sane we periodically reap any completed BIOs
 * during the BIO generation phase.
 *
 * This also helps to limit the peak amount of pinned userspace memory.
 */
static int dio_bio_reap(struct dio *dio)
{
	int ret = 0;

	if (dio->reap_counter++ >= 64) {
		while (dio->bio_list) {
			unsigned long flags;
			struct bio *bio;
			int ret2;

			spin_lock_irqsave(&dio->bio_lock, flags);
			bio = dio->bio_list;
			dio->bio_list = bio->bi_private;
			spin_unlock_irqrestore(&dio->bio_lock, flags);
			ret2 = dio_bio_complete(dio, bio);
			if (ret == 0)
				ret = ret2;
		}
		dio->reap_counter = 0;
	}
	return ret;
}

/*
 * Call into the fs to map some more disk blocks.  We record the current number
 * of available blocks at dio->blocks_available.  These are in units of the
 * fs blocksize, (1 << inode->i_blkbits).
 *
 * The fs is allowed to map lots of blocks at once.  If it wants to do that,
 * it uses the passed inode-relative block number as the file offset, as usual.
 *
 * get_block() is passed the number of i_blkbits-sized blocks which direct_io
 * has remaining to do.  The fs should not map more than this number of blocks.
 *
 * If the fs has mapped a lot of blocks, it should populate bh->b_size to
 * indicate how much contiguous disk space has been made available at
 * bh->b_blocknr.
 *
 * If *any* of the mapped blocks are new, then the fs must set buffer_new().
 * This isn't very efficient...
 *
 * In the case of filesystem holes: the fs may return an arbitrarily-large
 * hole by returning an appropriate value in b_size and by clearing
 * buffer_mapped().  However the direct-io code will only process holes one
 * block at a time - it will repeatedly call get_block() as it walks the hole.
 */
static int get_more_blocks(struct dio *dio)
{
	int ret;
	struct buffer_head *map_bh = &dio->map_bh;
	sector_t fs_startblk;	/* Into file, in filesystem-sized blocks */
	unsigned long fs_count;	/* Number of filesystem-sized blocks */
	unsigned long dio_count;/* Number of dio_block-sized blocks */
	unsigned long blkmask;
	int create;

	/*
	 * If there was a memory error and we've overwritten all the
	 * mapped blocks then we can now return that memory error
	 */
	ret = dio->page_errors;
	if (ret == 0) {
		BUG_ON(dio->block_in_file >= dio->final_block_in_request);
		fs_startblk = dio->block_in_file >> dio->blkfactor;
		dio_count = dio->final_block_in_request - dio->block_in_file;
		fs_count = dio_count >> dio->blkfactor;
		blkmask = (1 << dio->blkfactor) - 1;
		if (dio_count & blkmask)
			fs_count++;

		map_bh->b_state = 0;
		map_bh->b_size = fs_count << dio->inode->i_blkbits;

		/*
		 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
		 * forbid block creations: only overwrites are permitted.
		 * We will return early to the caller once we see an
		 * unmapped buffer head returned, and the caller will fall
		 * back to buffered I/O.
		 *
		 * Otherwise the decision is left to the get_blocks method,
		 * which may decide to handle it or also return an unmapped
		 * buffer head.
		 */
		create = dio->rw & WRITE;
		if (dio->flags & DIO_SKIP_HOLES) {
			if (dio->block_in_file < (i_size_read(dio->inode) >>
							dio->blkbits))
				create = 0;
		}

		ret = (*dio->get_block)(dio->inode, fs_startblk,
						map_bh, create);
	}
	return ret;
}
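
/*
 * Worked example, assuming blkfactor == 3 (512-byte dio_blocks on a
 * 4096-byte-block filesystem): with block_in_file == 11 and
 * final_block_in_request == 40, fs_startblk is 11 >> 3 == 1 and dio_count is
 * 29.  fs_count is rounded up from 29 >> 3 == 3 to 4 because the request
 * ends partway into an fs block, so get_block() is asked to map up to 4
 * filesystem blocks starting at block 1.
 */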

/*
 * There is no bio.  Make one now.
 */
static int dio_new_bio(struct dio *dio, sector_t start_sector)
{
	sector_t sector;
	int ret, nr_pages;

	ret = dio_bio_reap(dio);
	if (ret)
		goto out;
	sector = start_sector << (dio->blkbits - 9);
	nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
	nr_pages = min(nr_pages, BIO_MAX_PAGES);
	BUG_ON(nr_pages <= 0);
	dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
	dio->boundary = 0;
out:
	return ret;
}

/*
 * Attempt to put the current chunk of 'cur_page' into the current BIO.  If
 * that was successful then update final_block_in_bio and take a ref against
 * the just-added page.
 *
 * Return zero on success.  Non-zero means the caller needs to start a new BIO.
 */
static int dio_bio_add_page(struct dio *dio)
{
	int ret;

	ret = bio_add_page(dio->bio, dio->cur_page,
			dio->cur_page_len, dio->cur_page_offset);
	if (ret == dio->cur_page_len) {
		/*
		 * Decrement count only if we are done with this page
		 */
		if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE)
			dio->pages_in_io--;
		page_cache_get(dio->cur_page);
		dio->final_block_in_bio = dio->cur_page_block +
			(dio->cur_page_len >> dio->blkbits);
		ret = 0;
	} else {
		ret = 1;
	}
	return ret;
}

/*
 * Put cur_page under IO.  The section of cur_page which is described by
 * cur_page_offset,cur_page_len is put into a BIO.  The section of cur_page
 * starts on-disk at cur_page_block.
 *
 * We take a ref against the page here (on behalf of its presence in the bio).
 *
 * The caller of this function is responsible for removing cur_page from the
 * dio, and for dropping the refcount which came from that presence.
 */
static int dio_send_cur_page(struct dio *dio)
{
	int ret = 0;

	if (dio->bio) {
		loff_t cur_offset = dio->cur_page_fs_offset;
		loff_t bio_next_offset = dio->logical_offset_in_bio +
			dio->bio->bi_size;

		/*
		 * See whether this new request is contiguous with the old.
		 *
		 * Btrfs cannot handle having logically non-contiguous requests
		 * submitted.  For example if you have
		 *
		 * Logical:  [0-4095][HOLE][8192-12287]
		 * Physical: [0-4095]      [4096-8191]
		 *
		 * We cannot submit those pages together as one BIO.  So if our
		 * current logical offset in the file does not equal what would
		 * be the next logical offset in the bio, submit the bio we
		 * have.
		 */
		if (dio->final_block_in_bio != dio->cur_page_block ||
		    cur_offset != bio_next_offset)
			dio_bio_submit(dio);
		/*
		 * Submit now if the underlying fs is about to perform a
		 * metadata read
		 */
		else if (dio->boundary)
			dio_bio_submit(dio);
	}

	if (dio->bio == NULL) {
		ret = dio_new_bio(dio, dio->cur_page_block);
		if (ret)
			goto out;
	}

	if (dio_bio_add_page(dio) != 0) {
		dio_bio_submit(dio);
		ret = dio_new_bio(dio, dio->cur_page_block);
		if (ret == 0) {
			ret = dio_bio_add_page(dio);
			BUG_ON(ret != 0);
		}
	}
out:
	return ret;
}

/*
 * An autonomous function to put a chunk of a page under deferred IO.
 *
 * The caller doesn't actually know (or care) whether this piece of page is in
 * a BIO, or is under IO or whatever.  We just take care of all possible
 * situations here.  The separation between the logic of do_direct_IO() and
 * that of submit_page_section() is important for clarity.  Please don't break
 * it.
 *
 * The chunk of page starts on-disk at blocknr.
 *
 * We perform deferred IO, by recording the last-submitted page inside our
 * private part of the dio structure.  If possible, we just expand the IO
 * across that page here.
 *
 * If that doesn't work out then we put the old page into the bio and add this
 * page to the dio instead.
 */
static int
submit_page_section(struct dio *dio, struct page *page,
		unsigned offset, unsigned len, sector_t blocknr)
{
	int ret = 0;

	if (dio->rw & WRITE) {
		/*
		 * Read accounting is performed in submit_bio()
		 */
		task_io_account_write(len);
	}

	/*
	 * Can we just grow the current page's presence in the dio?
	 */
	if ((dio->cur_page == page) &&
	    (dio->cur_page_offset + dio->cur_page_len == offset) &&
	    (dio->cur_page_block +
			(dio->cur_page_len >> dio->blkbits) == blocknr)) {
		dio->cur_page_len += len;

		/*
		 * If dio->boundary then we want to schedule the IO now to
		 * avoid metadata seeks.
		 */
		if (dio->boundary) {
			ret = dio_send_cur_page(dio);
			page_cache_release(dio->cur_page);
			dio->cur_page = NULL;
		}
		goto out;
	}

	/*
	 * If there's a deferred page already there then send it.
	 */
	if (dio->cur_page) {
		ret = dio_send_cur_page(dio);
		page_cache_release(dio->cur_page);
		dio->cur_page = NULL;
		if (ret)
			goto out;
	}

	page_cache_get(page);		/* It is in dio */
	dio->cur_page = page;
	dio->cur_page_offset = offset;
	dio->cur_page_len = len;
	dio->cur_page_block = blocknr;
	dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
out:
	return ret;
}

/*
 * Clean any dirty buffers in the blockdev mapping which alias newly-created
 * file blocks.  Only called for S_ISREG files - blockdevs do not set
 * buffer_new.
 */
static void clean_blockdev_aliases(struct dio *dio)
{
	unsigned i;
	unsigned nblocks;

	nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits;

	for (i = 0; i < nblocks; i++) {
		unmap_underlying_metadata(dio->map_bh.b_bdev,
					dio->map_bh.b_blocknr + i);
	}
}

/*
 * If we are not writing the entire block and get_block() allocated
 * the block for us, we need to fill in the unused portion of the
 * block with zeros.  This happens only if the user buffer, file offset or
 * io length is not a filesystem block-size multiple.
 *
 * `end' is zero if we're doing the start of the IO, 1 at the end of the
 * IO.
 */
static void dio_zero_block(struct dio *dio, int end)
{
	unsigned dio_blocks_per_fs_block;
	unsigned this_chunk_blocks;	/* In dio_blocks */
	unsigned this_chunk_bytes;
	struct page *page;

	dio->start_zero_done = 1;
	if (!dio->blkfactor || !buffer_new(&dio->map_bh))
		return;

	dio_blocks_per_fs_block = 1 << dio->blkfactor;
	this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1);

	if (!this_chunk_blocks)
		return;

	/*
	 * We need to zero out part of an fs block.  It is either at the
	 * beginning or the end of the fs block.
	 */
	if (end)
		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;

	this_chunk_bytes = this_chunk_blocks << dio->blkbits;

	page = ZERO_PAGE(0);
	if (submit_page_section(dio, page, 0, this_chunk_bytes,
				dio->next_block_for_io))
		return;

	dio->next_block_for_io += this_chunk_blocks;
}
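
/*
 * Worked example, assuming blkfactor == 3 (8 dio_blocks per fs block): a
 * write that begins at dio_block 19 starts 3 dio_blocks into a newly
 * allocated fs block, so dio_zero_block(dio, 0) submits 3 dio_blocks
 * (this_chunk_bytes == 3 << blkbits) of ZERO_PAGE data at the start of that
 * fs block before the user data; an IO that ends at the same offset has
 * dio_zero_block(dio, 1) zero the remaining 5 dio_blocks of the fs block.
 */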

/*
 * Walk the user pages, and the file, mapping blocks to disk and generating
 * a sequence of (page,offset,len,block) mappings.  These mappings are injected
 * into submit_page_section(), which takes care of the next stage of submission.
 *
 * Direct IO against a blockdev is different from direct IO against a regular
 * file: we can happily perform page-sized but 512-byte aligned IOs.  It is
 * important that blockdev IO be able to have fine alignment and large sizes.
 *
 * So what we do is to permit the ->get_block function to populate bh.b_size
 * with the size of IO which is permitted at this offset and this i_blkbits.
 *
 * For best results, the blockdev should be set up with 512-byte i_blkbits and
 * it should set b_size to PAGE_SIZE or more inside get_block().  This gives
 * fine alignment but still allows this function to work in PAGE_SIZE units.
 */
static int do_direct_IO(struct dio *dio)
{
	const unsigned blkbits = dio->blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	struct page *page;
	unsigned block_in_page;
	struct buffer_head *map_bh = &dio->map_bh;
	int ret = 0;

	/* The I/O can start at any block offset within the first page */
	block_in_page = dio->first_block_in_page;

	while (dio->block_in_file < dio->final_block_in_request) {
		page = dio_get_page(dio);
		if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}

		while (block_in_page < blocks_per_page) {
			unsigned offset_in_page = block_in_page << blkbits;
			unsigned this_chunk_bytes;	/* # of bytes mapped */
			unsigned this_chunk_blocks;	/* # of blocks */
			unsigned u;

			if (dio->blocks_available == 0) {
				/*
				 * Need to go and map some more disk
				 */
				unsigned long blkmask;
				unsigned long dio_remainder;

				ret = get_more_blocks(dio);
				if (ret) {
					page_cache_release(page);
					goto out;
				}
				if (!buffer_mapped(map_bh))
					goto do_holes;

				dio->blocks_available =
						map_bh->b_size >> dio->blkbits;
				dio->next_block_for_io =
					map_bh->b_blocknr << dio->blkfactor;
				if (buffer_new(map_bh))
					clean_blockdev_aliases(dio);

				if (!dio->blkfactor)
					goto do_holes;

				blkmask = (1 << dio->blkfactor) - 1;
				dio_remainder = (dio->block_in_file & blkmask);

				/*
				 * If we are at the start of IO and that IO
				 * starts partway into a fs-block,
				 * dio_remainder will be non-zero.  If the IO
				 * is a read then we can simply advance the IO
				 * cursor to the first block which is to be
				 * read.  But if the IO is a write and the
				 * block was newly allocated we cannot do that;
				 * the start of the fs block must be zeroed out
				 * on-disk
				 */
				if (!buffer_new(map_bh))
					dio->next_block_for_io += dio_remainder;
				dio->blocks_available -= dio_remainder;
			}
do_holes:
			/* Handle holes */
			if (!buffer_mapped(map_bh)) {
				loff_t i_size_aligned;

				/* AKPM: eargh, -ENOTBLK is a hack */
				if (dio->rw & WRITE) {
					page_cache_release(page);
					return -ENOTBLK;
				}

				/*
				 * Be sure to account for a partial block as the
				 * last block in the file
				 */
				i_size_aligned = ALIGN(i_size_read(dio->inode),
							1 << blkbits);
				if (dio->block_in_file >=
						i_size_aligned >> blkbits) {
					/* We hit eof */
					page_cache_release(page);
					goto out;
				}
				zero_user(page, block_in_page << blkbits,
						1 << blkbits);
				dio->block_in_file++;
				block_in_page++;
				goto next_block;
			}

			/*
			 * If we're performing IO which has an alignment which
			 * is finer than the underlying fs, go check to see if
			 * we must zero out the start of this block.
			 */
			if (unlikely(dio->blkfactor && !dio->start_zero_done))
				dio_zero_block(dio, 0);

			/*
			 * Work out, in this_chunk_blocks, how much disk we
			 * can add to this page
			 */
			this_chunk_blocks = dio->blocks_available;
			u = (PAGE_SIZE - offset_in_page) >> blkbits;
			if (this_chunk_blocks > u)
				this_chunk_blocks = u;
			u = dio->final_block_in_request - dio->block_in_file;
			if (this_chunk_blocks > u)
				this_chunk_blocks = u;
			this_chunk_bytes = this_chunk_blocks << blkbits;
			BUG_ON(this_chunk_bytes == 0);

			dio->boundary = buffer_boundary(map_bh);
			ret = submit_page_section(dio, page, offset_in_page,
				this_chunk_bytes, dio->next_block_for_io);
			if (ret) {
				page_cache_release(page);
				goto out;
			}
			dio->next_block_for_io += this_chunk_blocks;

			dio->block_in_file += this_chunk_blocks;
			block_in_page += this_chunk_blocks;
			dio->blocks_available -= this_chunk_blocks;
next_block:
			BUG_ON(dio->block_in_file > dio->final_block_in_request);
			if (dio->block_in_file == dio->final_block_in_request)
				break;
		}

		/* Drop the ref which was taken in get_user_pages() */
		page_cache_release(page);
		block_in_page = 0;
	}
out:
	return ret;
}

/*
 * Releases both i_mutex and i_alloc_sem
 */
static ssize_t
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
	const struct iovec *iov, loff_t offset, unsigned long nr_segs,
	unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
	dio_submit_t submit_io, struct dio *dio)
{
	unsigned long user_addr;
	unsigned long flags;
	int seg;
	ssize_t ret = 0;
	ssize_t ret2;
	size_t bytes;

	dio->inode = inode;
	dio->rw = rw;
	dio->blkbits = blkbits;
	dio->blkfactor = inode->i_blkbits - blkbits;
	dio->block_in_file = offset >> blkbits;

	dio->get_block = get_block;
	dio->end_io = end_io;
	dio->submit_io = submit_io;
	dio->final_block_in_bio = -1;
	dio->next_block_for_io = -1;

	dio->iocb = iocb;
	dio->i_size = i_size_read(inode);

	spin_lock_init(&dio->bio_lock);
	dio->refcount = 1;

	/*
	 * In case of non-aligned buffers, we may need 2 more
	 * pages since we need to zero out first and last block.
	 */
	if (unlikely(dio->blkfactor))
		dio->pages_in_io = 2;

	for (seg = 0; seg < nr_segs; seg++) {
		user_addr = (unsigned long)iov[seg].iov_base;
		dio->pages_in_io +=
			((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE
				- user_addr/PAGE_SIZE);
	}

	for (seg = 0; seg < nr_segs; seg++) {
		user_addr = (unsigned long)iov[seg].iov_base;
		dio->size += bytes = iov[seg].iov_len;

		/* Index into the first page of the first block */
		dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
		dio->final_block_in_request = dio->block_in_file +
						(bytes >> blkbits);
		/* Page fetching state */
		dio->head = 0;
		dio->tail = 0;
		dio->curr_page = 0;

		dio->total_pages = 0;
		if (user_addr & (PAGE_SIZE-1)) {
			dio->total_pages++;
			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
		}
		dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
		dio->curr_user_address = user_addr;

		ret = do_direct_IO(dio);

		dio->result += iov[seg].iov_len -
			((dio->final_block_in_request - dio->block_in_file) <<
					blkbits);

		if (ret) {
			dio_cleanup(dio);
			break;
		}
	} /* end iovec loop */

	if (ret == -ENOTBLK) {
		/*
		 * The remaining part of the request will be
		 * handled by buffered I/O when we return
		 */
		ret = 0;
	}
	/*
	 * There may be some unwritten disk at the end of a part-written
	 * fs-block-sized block.  Go zero that now.
	 */
	dio_zero_block(dio, 1);

	if (dio->cur_page) {
		ret2 = dio_send_cur_page(dio);
		if (ret == 0)
			ret = ret2;
		page_cache_release(dio->cur_page);
		dio->cur_page = NULL;
	}
	if (dio->bio)
		dio_bio_submit(dio);

	/*
	 * It is possible that we return short IO due to end of file.
	 * In that case, we need to release all the pages we got hold of.
	 */
	dio_cleanup(dio);

	/*
	 * All block lookups have been performed.  For READ requests
	 * we can let i_mutex go now that it has achieved its purpose
	 * of protecting us from looking up uninitialized blocks.
	 */
	if (rw == READ && (dio->flags & DIO_LOCKING))
		mutex_unlock(&dio->inode->i_mutex);

	/*
	 * The only time we want to leave bios in flight is when a successful
	 * partial aio read or full aio write has been set up.  In that case
	 * bio completion will call aio_complete.  The only time it's safe to
	 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
	 * This had *better* be the only place that raises -EIOCBQUEUED.
	 */
	BUG_ON(ret == -EIOCBQUEUED);
	if (dio->is_async && ret == 0 && dio->result &&
	    ((rw & READ) || (dio->result == dio->size)))
		ret = -EIOCBQUEUED;

	if (ret != -EIOCBQUEUED)
		dio_await_completion(dio);

	/*
	 * Sync will always be dropping the final ref and completing the
	 * operation.  AIO can do so too, if it was a short operation as
	 * described above or if all the bios race to complete before we get
	 * here.  In that case dio_complete() translates the EIOCBQUEUED into
	 * the proper return code that the caller will hand to aio_complete().
	 *
	 * This is managed by the bio_lock instead of being an atomic_t so that
	 * completion paths can drop their ref and use the remaining count to
	 * decide to wake the submission path atomically.
	 */
	spin_lock_irqsave(&dio->bio_lock, flags);
	ret2 = --dio->refcount;
	spin_unlock_irqrestore(&dio->bio_lock, flags);

	if (ret2 == 0) {
		ret = dio_complete(dio, offset, ret, false);
		kfree(dio);
	} else
		BUG_ON(ret != -EIOCBQUEUED);

	return ret;
}
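
/*
 * Note on return values: for synchronous requests, direct_io_worker() waits
 * for all bios and returns the number of bytes transferred or an error.  For
 * AIO, it returns -EIOCBQUEUED once a partially-successful read or a fully
 * submitted write is in flight; the final byte count or error is then
 * reported through aio_complete() when the last bio reference is dropped.
 */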

/*
 * This is a library function for use by filesystem drivers.
 *
 * The locking rules are governed by the flags parameter:
 *  - if the flags value contains DIO_LOCKING we use a fancy locking
 *    scheme for dumb filesystems.
 *    For writes this function is called under i_mutex and returns with
 *    i_mutex held, for reads, i_mutex is not held on entry, but it is
 *    taken and dropped again before returning.
 *    For reads and writes i_alloc_sem is taken in shared mode and released
 *    on I/O completion (which may happen asynchronously after returning to
 *    the caller).
 *
 *  - if the flags value does NOT contain DIO_LOCKING we don't use any
 *    internal locking but rather rely on the filesystem to synchronize
 *    direct I/O reads/writes versus each other and truncate.
 *    For reads and writes both i_mutex and i_alloc_sem are not held on
 *    entry and are never taken.
 */
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
	struct block_device *bdev, const struct iovec *iov, loff_t offset,
	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
	dio_submit_t submit_io, int flags)
{
	int seg;
	size_t size;
	unsigned long addr;
	unsigned blkbits = inode->i_blkbits;
	unsigned bdev_blkbits = 0;
	unsigned blocksize_mask = (1 << blkbits) - 1;
	ssize_t retval = -EINVAL;
	loff_t end = offset;
	struct dio *dio;

	if (rw & WRITE)
		rw = WRITE_ODIRECT;

	if (bdev)
		bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));

	if (offset & blocksize_mask) {
		if (bdev)
			blkbits = bdev_blkbits;
		blocksize_mask = (1 << blkbits) - 1;
		if (offset & blocksize_mask)
			goto out;
	}

	/* Check the memory alignment.  Blocks cannot straddle pages */
	for (seg = 0; seg < nr_segs; seg++) {
		addr = (unsigned long)iov[seg].iov_base;
		size = iov[seg].iov_len;
		end += size;
		if ((addr & blocksize_mask) || (size & blocksize_mask)) {
			if (bdev)
				blkbits = bdev_blkbits;
			blocksize_mask = (1 << blkbits) - 1;
			if ((addr & blocksize_mask) || (size & blocksize_mask))
				goto out;
		}
	}

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	retval = -ENOMEM;
	if (!dio)
		goto out;
	/*
	 * Believe it or not, zeroing out the page array caused a .5%
	 * performance regression in a database benchmark.  So, we take
	 * care to only zero out what's needed.
	 */
	memset(dio, 0, offsetof(struct dio, pages));

	dio->flags = flags;
	if (dio->flags & DIO_LOCKING) {
		/* watch out for a 0 len io from a tricksy fs */
		if (rw == READ && end > offset) {
			struct address_space *mapping =
					iocb->ki_filp->f_mapping;

			/* will be released by direct_io_worker */
			mutex_lock(&inode->i_mutex);

			retval = filemap_write_and_wait_range(mapping, offset,
							      end - 1);
			if (retval) {
				mutex_unlock(&inode->i_mutex);
				kfree(dio);
				goto out;
			}
		}

		/*
		 * Will be released at I/O completion, possibly in a
		 * different thread.
		 */
		down_read_non_owner(&inode->i_alloc_sem);
	}

	/*
	 * For file-extending writes, updating i_size before data
	 * writeouts complete can expose uninitialized blocks.  So
	 * even for AIO, we need to wait for i/o to complete before
	 * returning in this case.
	 */
	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
		(end > i_size_read(inode)));

	retval = direct_io_worker(rw, iocb, inode, iov, offset,
				nr_segs, blkbits, get_block, end_io,
				submit_io, dio);

out:
	return retval;
}
EXPORT_SYMBOL(__blockdev_direct_IO);
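
/*
 * Usage sketch (illustrative only; the names example_direct_IO and
 * example_get_block are hypothetical): a filesystem's ->direct_IO method
 * typically reaches this code through the blockdev_direct_IO() wrapper in
 * <linux/fs.h>, which supplies DIO_LOCKING | DIO_SKIP_HOLES and a NULL
 * submit_io, roughly like:
 *
 *	static ssize_t example_direct_IO(int rw, struct kiocb *iocb,
 *			const struct iovec *iov, loff_t offset,
 *			unsigned long nr_segs)
 *	{
 *		struct inode *inode = iocb->ki_filp->f_mapping->host;
 *
 *		return blockdev_direct_IO(rw, iocb, inode,
 *					inode->i_sb->s_bdev, iov, offset,
 *					nr_segs, example_get_block, NULL);
 *	}
 *
 * where example_get_block is the filesystem's get_block_t and the trailing
 * NULL is the optional dio_iodone_t completion callback.
 */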