/*
 * Functions to sequence FLUSH and FUA writes.
 *
 * Copyright (C) 2011		Max Planck Institute for Gravitational Physics
 * Copyright (C) 2011		Tejun Heo <[email protected]>
 *
 * This file is released under the GPLv2.
 *
 * REQ_{FLUSH|FUA} requests are decomposed into sequences consisting of up
 * to three optional steps - PREFLUSH, DATA and POSTFLUSH - according to
 * the request properties and hardware capability.
 *
 * If a request doesn't have data, only REQ_FLUSH makes sense, which
 * indicates a simple flush request.  If there is data, REQ_FLUSH indicates
 * that the device cache should be flushed before the data is executed, and
 * REQ_FUA means that the data must be on non-volatile media on request
 * completion.
 *
 * If the device doesn't have a writeback cache, FLUSH and FUA don't make
 * any difference.  The requests are either completed immediately if
 * there's no data or executed as normal requests otherwise.
 *
 * If the device has a writeback cache and supports FUA, REQ_FLUSH is
 * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
 *
 * If the device has a writeback cache and doesn't support FUA, REQ_FLUSH
 * is translated to PREFLUSH and REQ_FUA to POSTFLUSH.
 *
 * The actual execution of a flush is double buffered.  Whenever a request
 * needs to execute PRE or POSTFLUSH, it queues at
 * q->flush_queue[q->flush_pending_idx].  Once certain criteria are met, a
 * flush is issued and the pending_idx is toggled.  When the flush
 * completes, all the requests which were pending proceed to the next
 * step.  This allows arbitrary merging of different types of FLUSH/FUA
 * requests.
 *
 * Currently, the following conditions are used to determine when to issue
 * a flush.
 *
 * C1. At any given time, only one flush shall be in progress.  This makes
 *     double buffering sufficient.
 *
 * C2. Flush is deferred if any request is executing DATA of its sequence.
 *     This avoids issuing separate POSTFLUSHes for requests which shared
 *     a PREFLUSH.
 *
 * C3. The second condition is ignored if there is a request which has
 *     waited longer than FLUSH_PENDING_TIMEOUT.  This is to avoid
 *     starvation in the unlikely case where there is a continuous stream
 *     of FUA (without FLUSH) requests.
 *
 * For devices which support FUA, it isn't clear whether C2 (and thus C3)
 * is beneficial.
 *
 * Note that a sequenced FLUSH/FUA request with DATA is completed twice.
 * Once while executing DATA and again after the whole sequence is
 * complete.  The first completion updates the contained bio but doesn't
 * finish it so that the bio submitter is notified only after the whole
 * sequence is complete.  This is implemented by testing REQ_FLUSH_SEQ in
 * req_bio_endio().
 *
 * The above peculiarity requires that each FLUSH/FUA request has only one
 * bio attached to it, which is guaranteed as they aren't allowed to be
 * merged in the usual way.
 */
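
/*
 * Illustrative example (added comment, not part of the original
 * documentation): a data-carrying REQ_FLUSH|REQ_FUA write, e.g. one
 * submitted with submit_bio(WRITE_FLUSH_FUA, bio), is executed as one of
 * the following sequences depending on what the queue advertises:
 *
 *	no writeback cache		DATA only (FLUSH/FUA stripped)
 *	writeback cache + FUA		PREFLUSH, DATA (REQ_FUA kept)
 *	writeback cache, no FUA		PREFLUSH, DATA, POSTFLUSH
 *
 * A data-less REQ_FLUSH request on a writeback-cached device is reduced
 * to a single PREFLUSH step.
 */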

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/gfp.h>

#include "blk.h"

/* FLUSH/FUA sequences */
enum {
	REQ_FSEQ_PREFLUSH	= (1 << 0), /* pre-flushing in progress */
	REQ_FSEQ_DATA		= (1 << 1), /* data write in progress */
	REQ_FSEQ_POSTFLUSH	= (1 << 2), /* post-flushing in progress */
	REQ_FSEQ_DONE		= (1 << 3),

	REQ_FSEQ_ACTIONS	= REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
				  REQ_FSEQ_POSTFLUSH,

	/*
	 * If a flush has been pending longer than the following timeout,
	 * it's issued even if flush_data requests are still in flight.
	 */
	FLUSH_PENDING_TIMEOUT	= 5 * HZ,
};

static bool blk_kick_flush(struct request_queue *q);

/*
 * Compute which REQ_FSEQ_* steps @rq needs, given the queue's flush
 * capability flags @fflags (REQ_FLUSH/REQ_FUA support).
 */
static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
{
	unsigned int policy = 0;

	if (blk_rq_sectors(rq))
		policy |= REQ_FSEQ_DATA;

	if (fflags & REQ_FLUSH) {
		if (rq->cmd_flags & REQ_FLUSH)
			policy |= REQ_FSEQ_PREFLUSH;
		if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
			policy |= REQ_FSEQ_POSTFLUSH;
	}
	return policy;
}

/* next step to execute: the lowest REQ_FSEQ_* bit not yet set in flush.seq */
static unsigned int blk_flush_cur_seq(struct request *rq)
{
	return 1 << ffz(rq->flush.seq);
}
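
/*
 * Worked example (added for clarity): a REQ_FLUSH|REQ_FUA data write on a
 * queue advertising REQ_FLUSH but not REQ_FUA yields
 * REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH from
 * blk_flush_policy(); on a queue with neither flag it yields only
 * REQ_FSEQ_DATA, so blk_insert_flush() below can dispatch it directly.
 * As each step completes, its bit is recorded in rq->flush.seq and
 * blk_flush_cur_seq() reports the next pending step via ffz(); once all
 * three action bits are set, ffz() lands on bit 3 and the request reaches
 * REQ_FSEQ_DONE.
 */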

static void blk_flush_restore_request(struct request *rq)
{
	/*
	 * After flush data completion, @rq->bio is %NULL but we need to
	 * complete the bio again.  @rq->biotail is guaranteed to equal the
	 * original @rq->bio.  Restore it.
	 */
	rq->bio = rq->biotail;

	/* make @rq a normal request */
	rq->cmd_flags &= ~REQ_FLUSH_SEQ;
	rq->end_io = NULL;
}

/**
 * blk_flush_complete_seq - complete flush sequence
 * @rq: FLUSH/FUA request being sequenced
 * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
 * @error: whether an error occurred
 *
 * @rq just completed the @seq part of its flush sequence.  Record the
 * completion and trigger the next step.
 *
 * CONTEXT:
 * spin_lock_irq(q->queue_lock)
 *
 * RETURNS:
 * %true if requests were added to the dispatch queue, %false otherwise.
 */
static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
				   int error)
{
	struct request_queue *q = rq->q;
	struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
	bool queued = false;

	BUG_ON(rq->flush.seq & seq);
	rq->flush.seq |= seq;

	if (likely(!error))
		seq = blk_flush_cur_seq(rq);
	else
		seq = REQ_FSEQ_DONE;

	switch (seq) {
	case REQ_FSEQ_PREFLUSH:
	case REQ_FSEQ_POSTFLUSH:
		/* queue for flush */
		if (list_empty(pending))
			q->flush_pending_since = jiffies;
		list_move_tail(&rq->flush.list, pending);
		break;

	case REQ_FSEQ_DATA:
		list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
		list_add(&rq->queuelist, &q->queue_head);
		queued = true;
		break;

	case REQ_FSEQ_DONE:
		/*
		 * @rq was previously adjusted by blk_insert_flush() for
		 * flush sequencing and may already have gone through the
		 * flush data request completion path.  Restore @rq for
		 * normal completion and end it.
		 */
		BUG_ON(!list_empty(&rq->queuelist));
		list_del_init(&rq->flush.list);
		blk_flush_restore_request(rq);
		__blk_end_request_all(rq, error);
		break;

	default:
		BUG();
	}

	return blk_kick_flush(q) | queued;
}

static void flush_end_io(struct request *flush_rq, int error)
{
	struct request_queue *q = flush_rq->q;
	struct list_head *running = &q->flush_queue[q->flush_running_idx];
	bool queued = false;
	struct request *rq, *n;

	BUG_ON(q->flush_pending_idx == q->flush_running_idx);

	/* account completion of the flush request */
	q->flush_running_idx ^= 1;
	elv_completed_request(q, flush_rq);

	/* and push the waiting requests to the next stage */
	list_for_each_entry_safe(rq, n, running, flush.list) {
		unsigned int seq = blk_flush_cur_seq(rq);

		BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
		queued |= blk_flush_complete_seq(rq, seq, error);
	}

	/*
	 * Kick the queue to avoid a stall for two cases:
	 * 1. Moving a request silently to an empty queue_head may stall
	 *    the queue.
	 * 2. When a flush request is running on a non-queueable queue, the
	 *    queue is held.  Restart the queue after the flush request
	 *    finishes to avoid a stall.
	 * This function is called from the request completion path and
	 * calling directly into request_fn may confuse the driver.  Always
	 * use kblockd.
	 */
	if (queued || q->flush_queue_delayed)
		blk_run_queue_async(q);
	q->flush_queue_delayed = 0;
}

/**
 * blk_kick_flush - consider issuing flush request
 * @q: request_queue being kicked
 *
 * Flush related states of @q have changed; consider issuing a flush
 * request.  Please read the comment at the top of this file for more info.
 *
 * CONTEXT:
 * spin_lock_irq(q->queue_lock)
 *
 * RETURNS:
 * %true if flush was issued, %false otherwise.
 */
static bool blk_kick_flush(struct request_queue *q)
{
	struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
	struct request *first_rq =
		list_first_entry(pending, struct request, flush.list);

	/* C1 described at the top of this file */
	if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending))
		return false;

	/* C2 and C3 */
	if (!list_empty(&q->flush_data_in_flight) &&
	    time_before(jiffies,
			q->flush_pending_since + FLUSH_PENDING_TIMEOUT))
		return false;

	/*
	 * Issue flush and toggle pending_idx.  This makes pending_idx
	 * different from running_idx, which means a flush is in flight.
	 */
	blk_rq_init(q, &q->flush_rq);
	q->flush_rq.cmd_type = REQ_TYPE_FS;
	q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
	q->flush_rq.rq_disk = first_rq->rq_disk;
	q->flush_rq.end_io = flush_end_io;

	q->flush_pending_idx ^= 1;
	list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
	return true;
}

static void flush_data_end_io(struct request *rq, int error)
{
	struct request_queue *q = rq->q;

	/*
	 * After populating an empty queue, kick it to avoid a stall.  Read
	 * the comment in flush_end_io().
	 */
	if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
		blk_run_queue_async(q);
}
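
/*
 * Illustrative timeline (added for clarity): suppose two REQ_FLUSH|REQ_FUA
 * data writes A and B arrive back to back on a writeback-cached queue
 * without FUA support.  A's PREFLUSH is issued immediately and pending_idx
 * is toggled, so B parks on the other pending list (C1).  When A's
 * PREFLUSH completes, A proceeds to DATA while B's flush stays deferred
 * because A's data is still in flight (C2).  Once A's data write finishes,
 * A needs a POSTFLUSH and joins B on the pending list, and a single flush
 * request then serves both A's POSTFLUSH and B's PREFLUSH - the merging
 * described at the top of this file.
 */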

/**
 * blk_insert_flush - insert a new FLUSH/FUA request
 * @rq: request to insert
 *
 * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.
 * @rq is being submitted.  Analyze what needs to be done and put it on the
 * right queue.
 *
 * CONTEXT:
 * spin_lock_irq(q->queue_lock)
 */
void blk_insert_flush(struct request *rq)
{
	struct request_queue *q = rq->q;
	unsigned int fflags = q->flush_flags;	/* may change, cache */
	unsigned int policy = blk_flush_policy(fflags, rq);

	BUG_ON(rq->end_io);
	BUG_ON(!rq->bio || rq->bio != rq->biotail);

	/*
	 * @policy now records what operations need to be done.  Adjust
	 * REQ_FLUSH and FUA for the driver.
	 */
	rq->cmd_flags &= ~REQ_FLUSH;
	if (!(fflags & REQ_FUA))
		rq->cmd_flags &= ~REQ_FUA;

	/*
	 * If there's data but a flush is not necessary, the request can be
	 * processed directly without going through flush machinery.  Queue
	 * for normal execution.
	 */
	if ((policy & REQ_FSEQ_DATA) &&
	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
		list_add_tail(&rq->queuelist, &q->queue_head);
		return;
	}

	/*
	 * @rq should go through flush machinery.  Mark it part of a flush
	 * sequence and submit for further processing.
	 */
	memset(&rq->flush, 0, sizeof(rq->flush));
	INIT_LIST_HEAD(&rq->flush.list);
	rq->cmd_flags |= REQ_FLUSH_SEQ;
	rq->end_io = flush_data_end_io;

	blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
}
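
/*
 * Illustrative note (added): blk_insert_flush() kicks off the state machine
 * by reporting the steps @rq does *not* need as already complete.  For a
 * data-less REQ_FLUSH request on a writeback-cached queue, @policy is
 * %REQ_FSEQ_PREFLUSH, so REQ_FSEQ_ACTIONS & ~policy equals
 * REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH and blk_flush_complete_seq()
 * immediately queues the request for its only remaining step, PREFLUSH.
 */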

/**
 * blk_abort_flushes - @q is being aborted, abort flush requests
 * @q: request_queue being aborted
 *
 * To be called from elv_abort_queue().  @q is being aborted.  Prepare all
 * FLUSH/FUA requests for abortion.
 *
 * CONTEXT:
 * spin_lock_irq(q->queue_lock)
 */
void blk_abort_flushes(struct request_queue *q)
{
	struct request *rq, *n;
	int i;

	/*
	 * Requests in flight for data are already owned by the dispatch
	 * queue or the device driver.  Just restore for normal completion.
	 */
	list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) {
		list_del_init(&rq->flush.list);
		blk_flush_restore_request(rq);
	}

	/*
	 * We need to give away requests on flush queues.  Restore for
	 * normal completion and put them on the dispatch queue.
	 */
	for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) {
		list_for_each_entry_safe(rq, n, &q->flush_queue[i],
					 flush.list) {
			list_del_init(&rq->flush.list);
			blk_flush_restore_request(rq);
			list_add_tail(&rq->queuelist, &q->queue_head);
		}
	}
}

static void bio_end_flush(struct bio *bio, int err)
{
	if (err)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	if (bio->bi_private)
		complete(bio->bi_private);
	bio_put(bio);
}

/**
 * blkdev_issue_flush - queue a flush
 * @bdev:	blockdev to issue flush for
 * @gfp_mask:	memory allocation flags (for bio_alloc)
 * @error_sector:	sector where an error occurred, if any (may be %NULL)
 *
 * Description:
 *    Issue a flush for the block device in question.  The caller can
 *    supply room for storing the error offset in case of a flush error,
 *    if they wish to.  The flush is issued and waited upon synchronously;
 *    the call returns only after the flush has completed.
 */
int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
		sector_t *error_sector)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	struct request_queue *q;
	struct bio *bio;
	int ret = 0;

	if (bdev->bd_disk == NULL)
		return -ENXIO;

	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;

	/*
	 * Some block devices may not have their queue correctly set up here
	 * (e.g. a loop device without a backing file) and so issuing a flush
	 * here will panic.  Ensure there is a request function before issuing
	 * the flush.
	 */
	if (!q->make_request_fn)
		return -ENXIO;

	bio = bio_alloc(gfp_mask, 0);
	bio->bi_end_io = bio_end_flush;
	bio->bi_bdev = bdev;
	bio->bi_private = &wait;

	bio_get(bio);
	submit_bio(WRITE_FLUSH, bio);
	wait_for_completion(&wait);

	/*
	 * The driver must store the error location in ->bi_sector, if
	 * it supports it.  For non-stacked drivers, this should be
	 * copied from blk_rq_pos(rq).
	 */
	if (error_sector)
		*error_sector = bio->bi_sector;

	if (!bio_flagged(bio, BIO_UPTODATE))
		ret = -EIO;

	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL(blkdev_issue_flush);
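
/*
 * Usage sketch (illustrative, added comment): a caller holding a reference
 * to a block device can drain its write cache and, if the driver reports
 * it, pick up the failing sector:
 *
 *	sector_t err_sector;
 *	int err;
 *
 *	err = blkdev_issue_flush(bdev, GFP_KERNEL, &err_sector);
 *	if (err)
 *		pr_warn("cache flush failed: %d\n", err);
 *
 * @error_sector may be %NULL when the caller doesn't care about the error
 * offset.
 */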